diff --git a/.bazelrc b/.bazelrc
index 1014d1506c8d00..8c645260972dca 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -594,6 +594,12 @@ build:release_cpu_linux --config=avx_linux
 build:release_cpu_linux --crosstool_top="@ubuntu18.04-gcc7_manylinux2010-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain"
 test:release_cpu_linux --test_env=LD_LIBRARY_PATH
 
+# manylinux2014 config for cpu
+build:release_cpu_linux_manylinux2014 --config=release_base
+build:release_cpu_linux_manylinux2014 --config=avx_linux
+build:release_cpu_linux_manylinux2014 --crosstool_top="@ubuntu18.04-gcc8_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain"
+test:release_cpu_linux_manylinux2014 --test_env=LD_LIBRARY_PATH
+
 build:release_cpu_macos --config=release_base
 build:release_cpu_macos --config=avx_linux
 
@@ -616,6 +622,12 @@ build:release_gpu_linux_11_4 --action_env=TF_CUDA_VERSION="11.4"
 build:release_gpu_linux_11_4 --action_env=TF_CUDNN_VERSION="8.2"
 build:release_gpu_linux_11_4 --crosstool_top=@ubuntu18.04-gcc7_manylinux2010-cuda11.4-cudnn8.2-tensorrt7.2_config_cuda//crosstool:toolchain
 
+# manylinux2014 config for gpu
+build:release_gpu_linux_manylinux2014 --config=release_gpu_linux
+build:release_gpu_linux_manylinux2014 --action_env=GCC_HOST_COMPILER_PATH="/dt8/usr/bin/gcc"
+build:release_gpu_linux_manylinux2014 --crosstool_top=@ubuntu18.04-gcc8_manylinux2014-cuda11.2-cudnn8.1-tensorrt7.2_config_cuda//crosstool:toolchain
+
+
 build:release_cpu_windows --config=release_base
 build:release_cpu_windows --config=avx_win
 build:release_cpu_windows --define=no_tensorflow_py_deps=true
diff --git a/.bazelversion b/.bazelversion
index 0b2eb36f508590..fae6e3d04b2cab 100644
--- a/.bazelversion
+++ b/.bazelversion
@@ -1 +1 @@
-3.7.2
+4.2.1
diff --git a/ACKNOWLEDGMENTS b/ACKNOWLEDGMENTS
deleted file mode 100644
index 7eb20334c45cc7..00000000000000
--- a/ACKNOWLEDGMENTS
+++ /dev/null
@@ -1,50 +0,0 @@
-## Some of TensorFlow's code is derived from Caffe, which is subject to the following copyright notice:
-
-COPYRIGHT
-
-All contributions by the University of California:
-
-Copyright (c) 2014, The Regents of the University of California (Regents)
-All rights reserved.
-
-All other contributions:
-
-Copyright (c) 2014, the respective contributors
-All rights reserved.
-
-Caffe uses a shared copyright model: each contributor holds copyright over
-their contributions to Caffe. The project versioning records all such
-contribution and copyright details. If a contributor wants to further mark
-their specific copyright on a particular contribution, they should indicate
-their copyright solely in the commit message of the change when it is
-committed.
-
-LICENSE
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
-
-   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-   ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-   WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-   DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
-   ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-   (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-   LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-CONTRIBUTION AGREEMENT
-
-By contributing to the BVLC/caffe repository through pull-request, comment,
-or otherwise, the contributor releases their content to the
-license and copyright terms herein.
-
diff --git a/LICENSE b/LICENSE
index 9f6ace032ef128..12d255f8e0f049 100644
--- a/LICENSE
+++ b/LICENSE
@@ -200,31 +200,27 @@
    See the License for the specific language governing permissions and
    limitations under the License.
 
-------------------
-Files: third_party/compute_library/...
-
-MIT License
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-------------------
-Files: ACKNOWLEDGEMENTS
+## Some of TensorFlow's code is derived from Caffe, which is subject to the following copyright notice:
+
+COPYRIGHT
+
+All contributions by the University of California:
+
+Copyright (c) 2014, The Regents of the University of California (Regents)
+All rights reserved.
+
+All other contributions:
+
+Copyright (c) 2014, the respective contributors
+All rights reserved.
+
+Caffe uses a shared copyright model: each contributor holds copyright over
+their contributions to Caffe. The project versioning records all such
+contribution and copyright details. If a contributor wants to further mark
+their specific copyright on a particular contribution, they should indicate
+their copyright solely in the commit message of the change when it is
+committed.
+
 LICENSE
 
 Redistribution and use in source and binary forms, with or without
@@ -248,37 +244,8 @@ modification, are permitted provided that the following conditions are met:
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
-------------------
-Files: third_party/hexagon
+CONTRIBUTION AGREEMENT
 
-Copyright (c) 2016-2019, The Linux Foundation. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted (subject to the limitations in the
-disclaimer below) provided that the following conditions are met:
-
-   * Redistributions of source code must retain the above copyright
-     notice, this list of conditions and the following disclaimer.
-
-   * Redistributions in binary form must reproduce the above
-     copyright notice, this list of conditions and the following
-     disclaimer in the documentation and/or other materials provided
-     with the distribution.
-
-   * Neither the name of The Linux Foundation nor the names of its
-     contributors may be used to endorse or promote products derived
-     from this software without specific prior written permission.
-
-NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
-GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
-HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
-WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
-IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
-ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
-GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
-INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
-IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
-OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
-IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+By contributing to the BVLC/caffe repository through pull-request, comment,
+or otherwise, the contributor releases their content to the
+license and copyright terms herein.
\ No newline at end of file
diff --git a/RELEASE.md b/RELEASE.md
index 81ab910333f592..8f825e8263de48 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -16,9 +16,12 @@
 # Major Features and Improvements
 
 *   `tf.lite`:
-    *   Where operation support is added for these data types
-        'int32/uint32/int8/uint8/int64'
-    *   Add builtin support for `Bucketize` op on CPU.
+    *   Added TFLite builtin op support for the following TF ops:
+        *  `tf.raw_ops.Bucketize` op on CPU.
+        *  `tf.where` op for data types `tf.int32`/`tf.uint32`/`tf.int8`/`tf.uint8`/`tf.int64`.
+        *  `tf.random.normal` op for output data type `tf.float32` on CPU.
+        *  `tf.random.uniform` op for output data type `tf.float32` on CPU.
+        *  `tf.random.categorical` op for output data type `tf.int64` on CPU.
 *   `tensorflow.experimental.tensorrt`:
 
     *   `conversion_params` is now deprecated inside `TrtGraphConverterV2` in
@@ -29,6 +32,16 @@
         `.save()` function inside `TrtGraphConverterV2`. When `False`, the
         `.save()` function won't save any TRT engines that have been built. When
         `True` (default), the original behavior is preserved.
+*   `tf.tpu.experimental.embedding`:
+    *   `tf.tpu.experimental.embedding.FeatureConfig` now takes an additional
+        argument `output_shape` which can specify the shape of the output
+        activation for the feature.
+    *   `tf.tpu.experimental.embedding.TPUEmbedding` now has the same behavior
+        as `tf.tpu.experimental.embedding.serving_embedding_lookup` which can
+        take arbitrary rank of dense and sparse tensor. For ragged tensor,
+        though the input tensor remains to be rank 2, the activations now can be
+        rank 2 or above by specifying the output shape in the feature config
+        or via the build method.
 
 *   <INSERT MAJOR FEATURE HERE, USING MARKDOWN SYNTAX>
 
@@ -42,6 +55,9 @@
 * `tf.data`:
   * The optimization `parallel_batch` now becomes default if not disabled by
     users, which will parallelize copying of batch elements.
+  * Added the ability for `TensorSliceDataset` to identify and handle inputs
+    that are files. This enables creating hermetic SavedModels when using
+    datasets created from files.
 
 * `tf.lite`:
   * GPU
@@ -161,6 +177,7 @@ This release contains contributions from many people at Google, as well as:
 * `tf.lite`:
   * Add experimental API `experimental_from_jax` to support conversion from Jax models to TensorFlow Lite.
   * Support uint32 data type for cast op.
+  * Support int8 data type for cast op.
   * Add experimental quantization debugger `tf.lite.QuantizationDebugger`
   * Add lite.experimental.authoring.compatible API
       *   A Python decorator to provide a way to check TFLite compatibility
diff --git a/configure.py b/configure.py
index bff9abfe154797..6cdd109783e0e8 100644
--- a/configure.py
+++ b/configure.py
@@ -45,7 +45,7 @@
 _TF_WORKSPACE_ROOT = ''
 _TF_BAZELRC = ''
 _TF_CURRENT_BAZEL_VERSION = None
-_TF_MIN_BAZEL_VERSION = '3.7.2'
+_TF_MIN_BAZEL_VERSION = '4.2.1'
 _TF_MAX_BAZEL_VERSION = '4.99.0'
 
 NCCL_LIB_PATHS = [
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 216e8ed0cd01d9..27c7919912d9d0 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -46,7 +46,6 @@ licenses(["notice"])
 
 exports_files([
     "LICENSE",
-    "ACKNOWLEDGMENTS",
     # The leakr files are used by //third_party/cloud_tpu and
     # //third_party/tensorboard/google:copybara_config_test.
     "leakr_badwords.dic",
diff --git a/tensorflow/c/eager/abstract_context.h b/tensorflow/c/eager/abstract_context.h
index 07a78f97bd5a9f..2132daf2cfa388 100644
--- a/tensorflow/c/eager/abstract_context.h
+++ b/tensorflow/c/eager/abstract_context.h
@@ -42,7 +42,7 @@ class AbstractContext {
   // Release any underlying resources, including the interface object.
   //
   // WARNING: The destructor of this class is marked as protected to disallow
-  // clients from directly destroying this object since it may manage it's own
+  // clients from directly destroying this object since it may manage its own
   // lifetime through ref counting. Thus clients MUST call Release() in order to
   // destroy an instance of this class.
   virtual void Release() = 0;
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index 0afb69bb82ce79..7ad77587d6fe70 100644
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -119,7 +119,7 @@ TF_CAPI_EXPORT extern TFE_ContextDevicePlacementPolicy
 TFE_ContextGetDevicePlacementPolicy(TFE_Context* ctx);
 
 // A tensorflow.ServerDef specifies remote workers (in addition to the current
-// workers name). Operations created on this context can then be executed on
+// workers name). Operations created in this context can then be executed on
 // any of these remote workers by setting an appropriate device.
 //
 // If the following is set, all servers identified by the
@@ -134,7 +134,7 @@ TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx,
 //
 // Like a TF_Tensor, a TFE_TensorHandle refers to a tensor with a value, shape,
 // type etc. Unlike a TF_Tensor, a TFE_TensorHandle may refer to such tensors
-// placed in memory of different devices or remote address spaces.
+// placed in the memory of different devices or remote address spaces.
 typedef struct TFE_TensorHandle TFE_TensorHandle;
 
 TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t,
@@ -442,7 +442,7 @@ TF_CAPI_EXPORT extern void TFE_ContextStartStep(TFE_Context* ctx);
 
 // Ends a step. When there is no active step (that is, every started step has
 // been ended) step containers will be cleared. Note: it is not safe to call
-// TFE_ContextEndStep while ops which rely on the step container may be running.
+// TFE_ContextEndStep while ops that rely on the step container may be running.
 TF_CAPI_EXPORT extern void TFE_ContextEndStep(TFE_Context* ctx);
 
 #ifdef __cplusplus
diff --git a/tensorflow/c/eager/c_api_distributed_test.cc b/tensorflow/c/eager/c_api_distributed_test.cc
index d21cadfd0cbcdc..208ce427478b72 100644
--- a/tensorflow/c/eager/c_api_distributed_test.cc
+++ b/tensorflow/c/eager/c_api_distributed_test.cc
@@ -161,7 +161,7 @@ void TestFunctionWithPackedInput(const bool remote) {
   TFE_TensorHandle* h1 = TestVariable(ctx, 2.0, task2_name);
   TFE_TensorHandle* h2 = TestVariable(ctx, 3.0, task0_name);
 
-  // Add a sync point in order to make sure that variables have been initialized
+  // Add a sync point to make sure that variables have been initialized
   // before the function execution starts.
   TFE_ContextAsyncWait(ctx, status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h
index f976b4b876c851..ee9cf9f950fd5e 100644
--- a/tensorflow/c/eager/c_api_experimental.h
+++ b/tensorflow/c/eager/c_api_experimental.h
@@ -140,7 +140,7 @@ TFE_MonitoringGetCellIntGauge2(TFE_MonitoringIntGauge2* gauge,
 typedef struct TFE_MonitoringStringGaugeCell TFE_MonitoringStringGaugeCell;
 TF_CAPI_EXPORT extern void TFE_MonitoringStringGaugeCellSet(
     TFE_MonitoringStringGaugeCell* cell, const char* value);
-// Retrieves the string value and saves it in buffer.
+// Retrieves the string value and saves it in the buffer.
 TF_CAPI_EXPORT extern const void TFE_MonitoringStringGaugeCellValue(
     TFE_MonitoringStringGaugeCell* cell, TF_Buffer* buf);
 
@@ -248,7 +248,7 @@ TF_CAPI_EXPORT extern void TFE_MonitoringSamplerCellAdd(
     TFE_MonitoringSamplerCell* cell, double value);
 
 // Retrieves the current value of the cell. The return value is a HistogramProto
-// saved in buffer.
+// saved in the buffer.
 TF_CAPI_EXPORT extern void TFE_MonitoringSamplerCellValue(
     TFE_MonitoringSamplerCell* cell, TF_Buffer* buf);
 
@@ -353,7 +353,7 @@ TF_CAPI_EXPORT extern bool TFE_ExecutorIsAsync(TFE_Executor*);
 TF_CAPI_EXPORT extern void TFE_ExecutorWaitForAllPendingNodes(
     TFE_Executor*, TF_Status* status);
 
-// When an error happens, any pending operations are discarded and newly issued
+// When an error happens, any pending operations are discarded, and newly issued
 // ops return an error. This call clears the error state and re-enables
 // execution of newly issued ops.
 //
@@ -362,12 +362,12 @@ TF_CAPI_EXPORT extern void TFE_ExecutorWaitForAllPendingNodes(
 // TODO(agarwal): mark the affected handles and raise errors if they are used.
 TF_CAPI_EXPORT extern void TFE_ExecutorClearError(TFE_Executor*);
 
-// Sets a custom Executor for current thread. All nodes created by this thread
-// will be added to this Executor. It will override current executor.
+// Sets a custom Executor for the current thread. All nodes created by this
+// thread will be added to this Executor. It will override the current executor.
 TF_CAPI_EXPORT extern void TFE_ContextSetExecutorForThread(TFE_Context*,
                                                            TFE_Executor*);
 
-// Returns the Executor for current thread.
+// Returns the Executor for the current thread.
 TF_CAPI_EXPORT extern TFE_Executor* TFE_ContextGetExecutorForThread(
     TFE_Context*);
 
@@ -376,7 +376,7 @@ TF_CAPI_EXPORT extern TFE_Executor* TFE_ContextGetExecutorForThread(
 
 // Update an existing context with a new set of servers defined in a ServerDef
 // proto. Servers can be added to and removed from the list of remote workers
-// in the context. New set of servers identified by the ServerDef must be up
+// in the context. A New set of servers identified by the ServerDef must be up
 // when the context is updated.
 //
 // This API is for experimental usage and may be subject to change.
@@ -527,8 +527,8 @@ typedef struct TFE_CustomDevice {
 // names of wrapped devices.
 //
 // There are currently no graph semantics implemented for registered custom
-// devices, so executing tf.functions which contain operations placed on custom
-// devices will fail.
+// devices, so executing tf.functions which contain operations placed on the
+// custom devices will fail.
 //
 // `device_name` must not name an existing physical or custom device. It must
 // follow the format:
@@ -646,8 +646,8 @@ TF_CAPI_EXPORT extern int TFE_TensorHandleDeviceID(TFE_TensorHandle* h,
                                                    TF_Status* status);
 
 // Returns the status for the tensor handle. In TFRT, a tensor handle can carry
-// error info if error happens. If so, status will be set with the error info.
-// If not, status will be set as OK.
+// error info if error happens. If so, the status will be set with the error
+// info. If not, status will be set as OK.
 TF_CAPI_EXPORT extern void TFE_TensorHandleGetStatus(TFE_TensorHandle* h,
                                                      TF_Status* status);
 
@@ -673,7 +673,7 @@ TF_CAPI_EXPORT extern void TFE_SetLogicalCpuDevices(TFE_Context* ctx,
 // setting the same key will lead to errors.
 //
 // Note that the key-values are only expected to be used for cluster
-// configuration data, and should not be used for storing large amount of data
+// configuration data, and should not be used for storing a large amount of data
 // or being accessed very frequently.
 TF_CAPI_EXPORT extern void TFE_InsertConfigKeyValue(TFE_Context* ctx,
                                                     const char* key,
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index beaca6c4ffd22f..e3a038489ff270 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -2174,92 +2174,249 @@ TEST(CAPI, ShareVariableAcrossContextsWorks) {
   worker_server2.release();
 }
 
+void ReplaceTaskInServerDef(tensorflow::ServerDef* server_def, int task_index,
+                            const string& host, int port) {
+  tensorflow::JobDef* job_def = server_def->mutable_cluster()->mutable_job(0);
+  job_def->mutable_tasks()->at(task_index) =
+      tensorflow::strings::StrCat(host, ":", port);
+}
+
+TEST(CAPI, ShareVariableAcrossContextsAfterUpdateContextWorks) {
+  tensorflow::ServerDef server_def_0 = GetServerDef(3);
+  server_def_0.mutable_default_session_config()->set_isolate_session_state(
+      false);
+  tensorflow::ServerDef server_def_1 =
+      ReplaceTaskInServerDef(server_def_0, /*task_index=*/0);
+
+  // These server defs have task index set to 0.
+  string serialized_server_def_0 = server_def_0.SerializeAsString();
+  string serialized_server_def_1 = server_def_1.SerializeAsString();
+
+  // Create two worker tasks.
+  server_def_0.set_task_index(1);
+  std::unique_ptr<tensorflow::GrpcServer> worker_server1;
+  ASSERT_TRUE(tensorflow::GrpcServer::Create(
+                  server_def_0, tensorflow::Env::Default(), &worker_server1)
+                  .ok());
+  ASSERT_TRUE(worker_server1->Start().ok());
+  server_def_0.set_task_index(2);
+  std::unique_ptr<tensorflow::GrpcServer> worker_server2;
+  ASSERT_TRUE(tensorflow::GrpcServer::Create(
+                  server_def_0, tensorflow::Env::Default(), &worker_server2)
+                  .ok());
+  ASSERT_TRUE(worker_server2->Start().ok());
+
+  // Create two contexts.
+  TFE_Context* ctx_0 = CreateContext(serialized_server_def_0,
+                                     /*isolate_session_state=*/false);
+  TFE_Context* ctx_1 = CreateContext(serialized_server_def_1,
+                                     /*isolate_session_state=*/false);
+
+  // Remote device on `worker2`.
+  const char remote_device[] = "/job:localhost/replica:0/task:2/device:CPU:0";
+  // `ctx_0`, `ctx_1` contains `remote_device`.
+  {
+    const std::vector<std::string>& device_names = ListDeviceNames(ctx_0);
+    ASSERT_TRUE(std::find(device_names.begin(), device_names.end(),
+                          remote_device) != device_names.end());
+  }
+
+  {
+    const std::vector<std::string>& device_names = ListDeviceNames(ctx_1);
+    ASSERT_TRUE(std::find(device_names.begin(), device_names.end(),
+                          remote_device) != device_names.end());
+  }
+
+  // Create a variable using `ctx_0`.
+  // Replace worker1 using a new worker, and update the contexts.
+  // Read the variable using `ctx_1`. This read should succeed.
+  //
+  // 1. Create a variable on `remote_device`, using `ctx_0`.
+  TFE_TensorHandle* handle_0 =
+      CreateVariable(ctx_0, 1.2, remote_device, /*variable_name=*/"var");
+
+  // 2. Wait for `var` to be created and initialized on the worker.
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextAsyncWait(ctx_0, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+
+  int port = tensorflow::testing::PickUnusedPortOrDie();
+  // 3. Replace worker1 with a new worker in server_def_0 and server_def_1.
+  ReplaceTaskInServerDef(&server_def_0, /*task_index=*/1, "localhost", port);
+  ReplaceTaskInServerDef(&server_def_1, /*task_index=*/1, "localhost", port);
+  // 4. Start a new task to replace worker1.
+  server_def_0.set_task_index(1);
+  worker_server1.release();
+  ASSERT_TRUE(tensorflow::GrpcServer::Create(
+                  server_def_0, tensorflow::Env::Default(), &worker_server1)
+                  .ok());
+  ASSERT_TRUE(worker_server1->Start().ok());
+
+  // 5a. Update `ctx_0` with updated `server_def_0`.
+  {
+    server_def_0.set_task_index(0);
+    string serialized_update = server_def_0.SerializeAsString();
+    TF_Status* status = TF_NewStatus();
+    TFE_ContextUpdateServerDef(ctx_0, 0, serialized_update.data(),
+                               serialized_update.size(), status);
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TF_DeleteStatus(status);
+  }
+
+  // 5b. Update `ctx_1` with updated `server_def_1`.
+  {
+    server_def_1.set_task_index(0);
+    string serialized_update = server_def_1.SerializeAsString();
+    TF_Status* status = TF_NewStatus();
+    TFE_ContextUpdateServerDef(ctx_1, 0, serialized_update.data(),
+                               serialized_update.size(), status);
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TF_DeleteStatus(status);
+  }
+
+  // 6. Read `var` using `ctx_1`. This read should succeed since `ctx_1` was
+  // created with `isolate_session_state` set to false, and update should
+  // preserve it.
+  {
+    // Create a handle to `var`, using `ctx_1`.
+    TFE_TensorHandle* var_handle =
+        CreateVarHandle(ctx_1, remote_device, /*variable_name=*/"var");
+
+    TFE_TensorHandle* handle_1 = nullptr;
+    int num_retvals = 1;
+    TF_Status* status = TF_NewStatus();
+    TFE_Op* op = TFE_NewOp(ctx_1, "ReadVariableOp", status);
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_OpSetAttrType(op, "dtype", TF_FLOAT);
+    TFE_OpAddInput(op, var_handle, status);
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_Execute(op, &handle_1, &num_retvals, status);
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_DeleteOp(op);
+
+    ASSERT_EQ(1, num_retvals);
+    EXPECT_EQ(TF_FLOAT, TFE_TensorHandleDataType(handle_1));
+    EXPECT_EQ(0, TFE_TensorHandleNumDims(handle_1, status));
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+    // Read the value of tensor handle `handle_1`.
+    float value = 0.0f;
+    TF_Tensor* t = TFE_TensorHandleResolve(handle_1, status);
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    ASSERT_EQ(sizeof(float), TF_TensorByteSize(t));
+    memcpy(&value, TF_TensorData(t), sizeof(float));
+    TF_DeleteTensor(t);
+    EXPECT_EQ(1.2f, value);
+    TFE_DeleteTensorHandle(handle_1);
+    TF_DeleteStatus(status);
+    TFE_DeleteTensorHandle(var_handle);
+  }
+
+  TFE_DeleteTensorHandle(handle_0);
+
+  TFE_DeleteContext(ctx_0);
+  TFE_DeleteContext(ctx_1);
+
+  worker_server1.release();
+  worker_server2.release();
+}
+
 tensorflow::ServerDef CreateSingleHostServerDef(
     const tensorflow::ServerDef& cluster_server_def, int task_index) {
   tensorflow::ServerDef single_host_server_def;
-  single_host_server_def.set_job_name(cluster_server_def.job_name());
+  single_host_server_def.set_job_name("worker");
   single_host_server_def.set_protocol(cluster_server_def.protocol());
   single_host_server_def.set_task_index(0);
   tensorflow::ClusterDef* cluster_def =
       single_host_server_def.mutable_cluster();
   tensorflow::JobDef* job_def = cluster_def->add_job();
-  job_def->set_name(cluster_server_def.job_name());
+  job_def->set_name("client");
 
   // Add a client.
-  single_host_server_def.mutable_cluster()
-      ->mutable_job(0)
-      ->mutable_tasks()
-      ->insert(
-          {0, tensorflow::strings::StrCat(
-                  "localhost:", tensorflow::testing::PickUnusedPortOrDie())});
+  job_def->mutable_tasks()->insert(
+      {0, tensorflow::strings::StrCat(
+              "localhost:", tensorflow::testing::PickUnusedPortOrDie())});
+
+  tensorflow::JobDef* job_def2 = cluster_def->add_job();
+  job_def2->set_name("worker");
 
   // Copy over `host:port` at `task_index`
   for (auto task : cluster_server_def.cluster().job(0).tasks()) {
     if (task.first == task_index) {
-      single_host_server_def.mutable_cluster()
-          ->mutable_job(0)
-          ->mutable_tasks()
-          ->insert({task.first, task.second});
+      job_def2->mutable_tasks()->insert({task.first, task.second});
     }
   }
 
   return single_host_server_def;
 }
 
+tensorflow::ServerDef GetClusterServerDef(const string& worker_job_name,
+                                          int num_workers) {
+  tensorflow::ServerDef server_def = GetServerDef(worker_job_name, num_workers);
+  tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster();
+
+  // Add a client.
+  tensorflow::JobDef* job_def2 = cluster_def->add_job();
+  job_def2->set_name("client");
+  job_def2->mutable_tasks()->insert(
+      {0, tensorflow::strings::StrCat(
+              "localhost:", tensorflow::testing::PickUnusedPortOrDie())});
+  return server_def;
+}
+
 TEST(CAPI, SingleHostServerDefWorks) {
-  // Create a server def that represents a 2-process cluster.
+  // Create a server def that represents a 2-process cluster and a client.
   // Example:
   //
-  // cluster { job { name: "localhost"
-  //               tasks { key: 0 value: "localhost:14319" } <--client
-  //               tasks { key: 1 value: "localhost:15022" } <--worker1
-  //               tasks { key: 2 value: "localhost:15023" } <--worker2
-  // } }
-  // job_name: "localhost" protocol: "grpc"
+  // cluster { job { name: "worker"
+  //                 tasks { key: 0 value: "localhost:14522" }
+  //                 tasks { key: 1 value: "localhost:14523" }
+  //               }
+  //           job { name: "client"
+  //                 tasks { key: 0 value: "localhost:14524" }
+  //               }
+  //         } job_name: "worker" protocol: "grpc"
   //
-  tensorflow::ServerDef cluster_server_def = GetServerDef(3);
-  // These server defs have task index set to 0.
-  string serialized_cluster_server_def = cluster_server_def.SerializeAsString();
+  tensorflow::ServerDef cluster_server_def = GetClusterServerDef("worker", 2);
 
   // Create two worker tasks, using single host server defs.
   // A single host server def contains a client and the remote host.
   // Example:
   //
-  //  Worker2:
-  //  cluster { job { name: "localhost"
-  //                  tasks { key: 0 value: "localhost:15226" } <--client
-  //                  tasks { key: 2 value: "localhost:15023" } <--worker2
-  //  } }
-  //  job_name: "localhost" task_index: 2 protocol: "grpc"
-  //
   //  Worker1:
-  //  cluster { job { name: "localhost"
-  //                 tasks { key: 0 value: "localhost:15024" } <--client
-  //                 tasks { key: 1 value: "localhost:15022" } <--worker1
-  //  } }
-  //  job_name: "localhost" task_index: 1 protocol: "grpc"
+  //  cluster { job { name: "client" tasks { key: 0 value: "localhost:14525" } }
+  //            job { name: "worker" tasks { key: 1 value: "localhost:14523" } }
+  //          } job_name: "worker" task_index: 1 protocol: "grpc"
+  //
+  //  Worker0:
+  //  cluster { job { name: "client" tasks { key: 0 value: "localhost:14526" } }
+  //            job { name: "worker" tasks { key: 0 value: "localhost:14522" } }
+  //          } job_name: "worker" protocol: "grpc"
   //
 
-  // Create `worker_2` using single host server def `worker_2_server_def`.
-  tensorflow::ServerDef worker_2_server_def =
-      CreateSingleHostServerDef(cluster_server_def, 2);
-  worker_2_server_def.set_task_index(2);
+  // Create `worker_1` using single host server def `worker_1_server_def`.
+  tensorflow::ServerDef worker_1_server_def =
+      CreateSingleHostServerDef(cluster_server_def, 1);
+  worker_1_server_def.set_task_index(1);
+  worker_1_server_def.set_job_name("worker");
 
-  std::unique_ptr<tensorflow::GrpcServer> worker_server2;
-  ASSERT_TRUE(tensorflow::GrpcServer::Create(worker_2_server_def,
+  std::unique_ptr<tensorflow::GrpcServer> worker_server1;
+  ASSERT_TRUE(tensorflow::GrpcServer::Create(worker_1_server_def,
                                              tensorflow::Env::Default(),
-                                             &worker_server2)
+                                             &worker_server1)
                   .ok());
-  ASSERT_TRUE(worker_server2->Start().ok());
+  ASSERT_TRUE(worker_server1->Start().ok());
 
   // Create context `local_ctx` using single host server def -
-  // `worker_2_server_def`.
-  worker_2_server_def.set_task_index(0);
+  // `worker_1_server_def`.
+  worker_1_server_def.set_task_index(0);
+  worker_1_server_def.set_job_name("client");
   TFE_Context* local_ctx =
-      CreateContext(worker_2_server_def.SerializeAsString(),
+      CreateContext(worker_1_server_def.SerializeAsString(),
                     /*isolate_session_state=*/false);
 
-  const char remote_device[] = "/job:localhost/replica:0/task:2/device:CPU:0";
+  const char remote_device[] = "/job:worker/replica:0/task:1/device:CPU:0";
 
   // Create a variable `var` on `worker2` using `local_ctx`.
   TFE_TensorHandle* handle_0 =
@@ -2270,21 +2427,24 @@ TEST(CAPI, SingleHostServerDefWorks) {
   TF_DeleteStatus(status);
   TFE_DeleteTensorHandle(handle_0);
 
-  // Create `worker1` using single host server def `worker_1_server_def`.
-  tensorflow::ServerDef worker_1_server_def =
-      CreateSingleHostServerDef(cluster_server_def, 1);
-  worker_1_server_def.set_task_index(1);
+  // Create `worker0` using single host server def `worker_0_server_def`.
+  tensorflow::ServerDef worker_0_server_def =
+      CreateSingleHostServerDef(cluster_server_def, 0);
+  worker_0_server_def.set_task_index(0);
 
-  std::unique_ptr<tensorflow::GrpcServer> worker_server1;
-  ASSERT_TRUE(tensorflow::GrpcServer::Create(worker_1_server_def,
+  std::unique_ptr<tensorflow::GrpcServer> worker_server0;
+  ASSERT_TRUE(tensorflow::GrpcServer::Create(worker_0_server_def,
                                              tensorflow::Env::Default(),
-                                             &worker_server1)
+                                             &worker_server0)
                   .ok());
-  ASSERT_TRUE(worker_server1->Start().ok());
+  ASSERT_TRUE(worker_server0->Start().ok());
 
   // Create a remote context, `remote_ctx`, using `cluster_server_def`.
-  TFE_Context* remote_ctx = CreateContext(serialized_cluster_server_def,
-                                          /*isolate_session_state=*/false);
+  cluster_server_def.set_task_index(0);
+  cluster_server_def.set_job_name("client");
+  TFE_Context* remote_ctx =
+      CreateContext(cluster_server_def.SerializeAsString(),
+                    /*isolate_session_state=*/false);
 
   // Read variable `var` using `remote_ctx`, created using `cluster_server_def`.
   {
@@ -2326,7 +2486,7 @@ TEST(CAPI, SingleHostServerDefWorks) {
   TFE_DeleteContext(remote_ctx);
 
   worker_server1.release();
-  worker_server2.release();
+  worker_server0.release();
 }
 
 }  // namespace
diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h
index 04af1bd952c4e9..ce8546fb4f4186 100644
--- a/tensorflow/c/eager/c_api_test_util.h
+++ b/tensorflow/c/eager/c_api_test_util.h
@@ -119,7 +119,7 @@ TFE_Op* RecvOp(TFE_Context* ctx, const std::string& op_name,
                const std::string& send_device, const std::string& recv_device,
                tensorflow::uint64 send_device_incarnation);
 
-// Return an 1-D INT32 tensor containing a single value 1.
+// Return a 1-D INT32 tensor containing a single value 1.
 TFE_TensorHandle* TestAxisTensorHandle(TFE_Context* ctx);
 
 // Return an op taking minimum of `input` long `axis` dimension.
diff --git a/tensorflow/c/eager/c_api_unified_experimental.h b/tensorflow/c/eager/c_api_unified_experimental.h
index ee22695632fd12..41228f07e70fd4 100644
--- a/tensorflow/c/eager/c_api_unified_experimental.h
+++ b/tensorflow/c/eager/c_api_unified_experimental.h
@@ -32,7 +32,7 @@ extern "C" {
 // -----------------------------------------------------------------------------
 
 // A TF_ExecutionContext stores knowledge about how to execute an operation.
-// E.g. it could know whether we're in eager mode or in graph mode, keeps track
+// E.g. it could know whether we're in eager mode or graph mode, keeps track
 // of gradient tapes, etc.
 typedef struct TF_ExecutionContext TF_ExecutionContext;
 
diff --git a/tensorflow/c/eager/gradients_internal.h b/tensorflow/c/eager/gradients_internal.h
index 5ddf017413a31d..1e14302c1721c1 100644
--- a/tensorflow/c/eager/gradients_internal.h
+++ b/tensorflow/c/eager/gradients_internal.h
@@ -24,7 +24,7 @@ namespace internal {
 
 // Helper functions which delegate to `AbstractOperation`, update
 // the state of the ForwardOperation and call the tape as appropriate.
-// These APIs are mainly to faciliate testing and are subject to change.
+// These APIs are mainly to facilitate testing and are subject to change.
 
 // Records the op name in the `ForwardOperation`.
 Status Reset(AbstractOperation*, const char* op, const char* raw_device_name,
diff --git a/tensorflow/c/eager/immediate_execution_tensor_handle.h b/tensorflow/c/eager/immediate_execution_tensor_handle.h
index 4a7586f0e5bce6..eab9314b3ec377 100644
--- a/tensorflow/c/eager/immediate_execution_tensor_handle.h
+++ b/tensorflow/c/eager/immediate_execution_tensor_handle.h
@@ -81,7 +81,7 @@ class ImmediateExecutionTensorHandle : public AbstractTensorHandle {
   // Release any underlying resources, including the interface object.
   //
   // WARNING: The destructor of this class is marked as protected to disallow
-  // clients from directly destroying this object since it may manage it's own
+  // clients from directly destroying this object since it may manage its own
   // lifetime through ref counting. Thus this must be allocated on the heap and
   // clients MUST call Release() in order to destroy an instance of this class.
   virtual void Release() = 0;
diff --git a/tensorflow/cc/gradients/README.md b/tensorflow/cc/gradients/README.md
index 3253163cc735cf..e2f7badcfebcfe 100644
--- a/tensorflow/cc/gradients/README.md
+++ b/tensorflow/cc/gradients/README.md
@@ -13,31 +13,35 @@ below.
 
 2.  Write the op gradient with the following naming scheme:
 
-        Status OpNameGrad(const Scope& scope, const Operation& op,
-                          const std::vector<Output>& grad_inputs,
-                          std::vector<Output>* grad_outputs) {
-          ...
-          return scope.status();
-        }
-        REGISTER_GRADIENT_OP("OpName", OpNameGrad);
-
-3.  Ops gradients are implemented by using the [C++
-    API](https://www.tensorflow.org/api_docs/cc/).
+    ```
+    Status OpNameGrad(const Scope& scope, const Operation& op,
+                      const std::vector<Output>& grad_inputs,
+                      std::vector<Output>* grad_outputs) {
+      ...
+      return scope.status();
+    }
+    REGISTER_GRADIENT_OP("OpName", OpNameGrad);
+    ```
+
+3.  Ops gradients are implemented by using the
+    [C++ API](https://www.tensorflow.org/api_docs/cc/).
 
 4.  Tests should be included in `foo_grad_test.cc`. Please see
     [`array_grad_test.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/gradients/array_grad_test.cc)
-    for an many examples. Tests are as simple as, creating a placeholder input
-    for the op's inputs and calling `RunTest` (`RunTest` uses a [gradient
-    checker](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/framework/gradient_checker.cc)
+    for many examples. Tests are as simple as, creating a placeholder input for
+    the op's inputs and calling `RunTest` (`RunTest` uses a
+    [gradient checker](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/cc/framework/gradient_checker.cc)
     to verify that the theoretical gradient matches the numeric gradient). For
     example:
 
-        TEST_F(ArrayGradTest, IdentityGrad) {
-          TensorShape shape({5, 2});
-          auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
-          auto y = Identity(scope_, x);
-          RunTest(x, shape, y, shape);
-        }
+    ```
+    TEST_F(ArrayGradTest, IdentityGrad) {
+      TensorShape shape({5, 2});
+      auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape));
+      auto y = Identity(scope_, x);
+      RunTest(x, shape, y, shape);
+    }
+    ```
 
 NOTE: There are some ops that require features from the C++ API that are not yet
 implemented.
diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc
index 53a997de688adb..ecb5fcf3a3e840 100644
--- a/tensorflow/compiler/jit/compilability_check_util.cc
+++ b/tensorflow/compiler/jit/compilability_check_util.cc
@@ -491,6 +491,14 @@ bool RecursiveCompilabilityChecker::IsCompilableNode(
     return false;
   }
 
+  if (!op_filter_.allow_where_op && node.type_string() == "Where") {
+    absl::string_view uncompilable_reason = "Where op";
+    MaybeMarkUncompilableNode(uncompilable_reason, *stack_trace,
+                              encapsulating_function, uncompilable_nodes);
+    LogNotCompilable(node, uncompilable_reason);
+    return false;
+  }
+
   if (!op_filter_.allow_ops_producing_or_consuming_variant &&
       OpProducesOrConsumesVariant(node)) {
     absl::string_view uncompilable_reason = "DT_VARIANT producer/consumer";
diff --git a/tensorflow/compiler/jit/compilability_check_util.h b/tensorflow/compiler/jit/compilability_check_util.h
index d25444a5bf4216..687add5d2714cb 100644
--- a/tensorflow/compiler/jit/compilability_check_util.h
+++ b/tensorflow/compiler/jit/compilability_check_util.h
@@ -136,6 +136,13 @@ class RecursiveCompilabilityChecker {
     // Whether to allow the compilation of CollectiveReduceV2Op.
     bool allow_collective_reduce_v2 = true;
 
+    // Whether to allow the compilation of WhereOp. Compilation of the WhereOp
+    // generates output with bounded dynamic shape that may cause failures with
+    // auto clustering.
+    // TODO(b/203693252): Enable tf.where during autoclustering after all the
+    // legalization issues are fixed.
+    bool allow_where_op = true;
+
     // Whether ops that are marked as outside compiled are always considered
     // compilable.
     // TODO(b/191502757):  Make this behavior true by default and remove this
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index acd20bde8806cd..843eccfa0c1f89 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -1203,6 +1203,7 @@ Status MarkForCompilationPassImpl::FindCompilationCandidates() {
     filter.require_always_compilable = true;
     filter.allow_string_consts = false;
     filter.allow_collective_reduce_v2 = false;
+    filter.allow_where_op = false;
 
     RecursiveCompilabilityChecker checker(
         filter, DeviceType{registration->compilation_device_name});
diff --git a/tensorflow/compiler/mlir/hlo/BUILD b/tensorflow/compiler/mlir/hlo/BUILD
index 595d358d8caaad..9f8133894843d7 100644
--- a/tensorflow/compiler/mlir/hlo/BUILD
+++ b/tensorflow/compiler/mlir/hlo/BUILD
@@ -930,7 +930,23 @@ cc_library(
     deps = [
         ":hlo",
         ":lhlo",
-        ":map_hlo_to_lhlo_op",
+        ":map_lhlo_to_hlo_op",
+        ":map_mhlo_to_scalar_op",
+        "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:ArithmeticDialect",
+        "@llvm-project//mlir:ComplexDialect",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:MathDialect",
+        "@llvm-project//mlir:SCFDialect",
+        "@llvm-project//mlir:StandardOps",
+    ],
+)
+
+cc_library(
+    name = "map_mhlo_to_scalar_op",
+    hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/map_mhlo_to_scalar_op.h"],
+    deps = [
+        ":hlo",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithmeticDialect",
         "@llvm-project//mlir:ComplexDialect",
@@ -959,6 +975,15 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "map_lhlo_to_hlo_op",
+    hdrs = ["include/mlir-hlo/Dialect/mhlo/transforms/map_lhlo_to_hlo_op.h"],
+    deps = [
+        ":hlo",
+        ":lhlo",
+    ],
+)
+
 cc_library(
     name = "lhlo_legalize_to_affine",
     srcs = ["lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc"],
@@ -1115,8 +1140,7 @@ cc_library(
     ],
     deps = [
         ":hlo",
-        ":lhlo",
-        ":map_lmhlo_to_scalar_op",
+        ":map_mhlo_to_scalar_op",
         ":pass_details",
         ":type_conversion",
         "@llvm-project//llvm:Support",
@@ -1253,6 +1277,7 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithmeticDialect",
         "@llvm-project//mlir:BufferizationDialect",
+        "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:MemRefDialect",
         "@llvm-project//mlir:Pass",
@@ -1280,6 +1305,7 @@ cc_library(
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:ArithmeticDialect",
         "@llvm-project//mlir:BufferizationDialect",
+        "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:MemRefDialect",
         "@llvm-project//mlir:Pass",
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td
index eed6fe34d7a9ef..7a6bbd4aa3100a 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td
@@ -1491,6 +1491,13 @@ def HLO_CustomCallOp: HLO_Op<"custom_call", []> {
     `call_target_name` should be short as it may be used in labels.
     `backend_config` can encode arbitrarily large amounts of information.
 
+    `has_side_effect` must be true if the custom call has side-effects.
+    `api_version` specifies the version of the API used by the custom call
+    function.
+
+    A custom call may apply functions within the scope of the parent module.
+    They can be referenced using `called_computations` attribute.
+
     A custom call can also have layout constraints on operands and results which
     can be specified as optional `operand_layouts` and `result_layouts`
     attributes. The layout attribute is an array of rank-1 index tensors and the
@@ -1517,6 +1524,7 @@ def HLO_CustomCallOp: HLO_Op<"custom_call", []> {
     DefaultValuedAttr<HLO_CustomCallApiVersionAttr,
                       "CustomCallApiVersion::API_VERSION_ORIGINAL">:
                       $api_version,
+    DefaultValuedAttr<HLO_FlatSymbolRefArrayAttr, "{}">:$called_computations,
     OptionalAttr<HLO_ArrayOfLayoutAttr>:$operand_layouts,
     OptionalAttr<HLO_ArrayOfLayoutAttr>:$result_layouts
   );
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td
index 6ec6c818afe698..15de7bf33f9989 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td
@@ -115,6 +115,13 @@ def HLO_LayoutAttr : Attr<
 def HLO_ArrayOfLayoutAttr : TypedArrayAttrBase<HLO_LayoutAttr,
     "Array of layout (1D tensor of index type) attributes">;
 
+// An array of FlatSymbolRef attributes that can be used as a default valued
+// attribute.
+def HLO_FlatSymbolRefArrayAttr :
+  TypedArrayAttrBase<FlatSymbolRefAttr, "flat symbol ref array attribute"> {
+  let constBuilderCall = "::mlir::ArrayAttr::get($_builder.getContext(), $0)";
+}
+
 
 //===----------------------------------------------------------------------===//
 // Common convolution attributes
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td
index 8a91a549f51ad1..454925588a6bb8 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/lmhlo_passes.td
@@ -15,12 +15,6 @@ limitations under the License.
 
 include "mlir/Pass/PassBase.td"
 
-def LhloLegalizeToLinalgPass : FunctionPass<"lhlo-legalize-to-linalg"> {
-  let summary = "Legalize from LHLO dialect to Linalg dialect.";
-  let constructor = "createLegalizeLhloToLinalgPass()";
-}
-
-
 def LhloFuseLinalgPass : FunctionPass<"lhlo-fuse-linalg"> {
   let summary = "Greedily fuse linalg ops obtained after LHLO lowering.";
   let constructor = "createLhloFuseLinalgPass()";
@@ -33,7 +27,6 @@ def LhloFuseLinalgPass : FunctionPass<"lhlo-fuse-linalg"> {
   ];
 }
 
-
 def LhloLegalizeToAffinePass : FunctionPass<"lhlo-legalize-to-affine"> {
   let summary = "Legalize from LHLO dialect to affine dialect.";
   let constructor = "createLhloLegalizeToAffinePass()";
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lhlo_to_hlo_op.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lhlo_to_hlo_op.h
new file mode 100644
index 00000000000000..248fc18bda9f0b
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lhlo_to_hlo_op.h
@@ -0,0 +1,105 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_LHLO_TO_HLO_OP_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_LHLO_TO_HLO_OP_H_
+
+#include <type_traits>
+
+#include "mlir-hlo/Dialect/lhlo/IR/lhlo_ops.h"
+#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
+
+namespace mlir {
+namespace lmhlo {
+
+template <typename LhloOpTy>
+struct LhloToHloOpImpl {
+  using Type = std::false_type;
+};
+template <typename LhloOpTy>
+using LhloToHloOp = typename LhloToHloOpImpl<LhloOpTy>::Type;
+
+#define MAP_LHLO_TO_HLO(OpName)           \
+  template <>                             \
+  struct LhloToHloOpImpl<lmhlo::OpName> { \
+    using Type = mhlo::OpName;            \
+  }
+
+MAP_LHLO_TO_HLO(AbsOp);
+MAP_LHLO_TO_HLO(AddOp);
+MAP_LHLO_TO_HLO(AndOp);
+MAP_LHLO_TO_HLO(Atan2Op);
+MAP_LHLO_TO_HLO(BitcastConvertOp);
+MAP_LHLO_TO_HLO(BroadcastInDimOp);
+MAP_LHLO_TO_HLO(CeilOp);
+MAP_LHLO_TO_HLO(ClampOp);
+MAP_LHLO_TO_HLO(ConstOp);
+MAP_LHLO_TO_HLO(CompareOp);
+MAP_LHLO_TO_HLO(ComplexOp);
+MAP_LHLO_TO_HLO(ConcatenateOp);
+MAP_LHLO_TO_HLO(ConvOp);
+MAP_LHLO_TO_HLO(ConvertOp);
+MAP_LHLO_TO_HLO(CopyOp);
+MAP_LHLO_TO_HLO(CosOp);
+MAP_LHLO_TO_HLO(CustomCallOp);
+MAP_LHLO_TO_HLO(DivOp);
+MAP_LHLO_TO_HLO(DotOp);
+MAP_LHLO_TO_HLO(DynamicBroadcastInDimOp);
+MAP_LHLO_TO_HLO(DynamicGatherOp);
+MAP_LHLO_TO_HLO(DynamicIotaOp);
+MAP_LHLO_TO_HLO(DynamicPadOp);
+MAP_LHLO_TO_HLO(DynamicReshapeOp);
+MAP_LHLO_TO_HLO(ExpOp);
+MAP_LHLO_TO_HLO(Expm1Op);
+MAP_LHLO_TO_HLO(FloorOp);
+MAP_LHLO_TO_HLO(GatherOp);
+MAP_LHLO_TO_HLO(ImagOp);
+MAP_LHLO_TO_HLO(IotaOp);
+MAP_LHLO_TO_HLO(IsFiniteOp);
+MAP_LHLO_TO_HLO(LogOp);
+MAP_LHLO_TO_HLO(LogisticOp);
+MAP_LHLO_TO_HLO(Log1pOp);
+MAP_LHLO_TO_HLO(MaxOp);
+MAP_LHLO_TO_HLO(MinOp);
+MAP_LHLO_TO_HLO(MulOp);
+MAP_LHLO_TO_HLO(NegOp);
+MAP_LHLO_TO_HLO(NotOp);
+MAP_LHLO_TO_HLO(OrOp);
+MAP_LHLO_TO_HLO(PowOp);
+MAP_LHLO_TO_HLO(RealDynamicSliceOp);
+MAP_LHLO_TO_HLO(RealOp);
+MAP_LHLO_TO_HLO(ReduceOp);
+MAP_LHLO_TO_HLO(ReshapeOp);
+MAP_LHLO_TO_HLO(RemOp);
+MAP_LHLO_TO_HLO(RsqrtOp);
+MAP_LHLO_TO_HLO(SelectOp);
+MAP_LHLO_TO_HLO(ShiftLeftOp);
+MAP_LHLO_TO_HLO(ShiftRightArithmeticOp);
+MAP_LHLO_TO_HLO(ShiftRightLogicalOp);
+MAP_LHLO_TO_HLO(SignOp);
+MAP_LHLO_TO_HLO(SinOp);
+MAP_LHLO_TO_HLO(SliceOp);
+MAP_LHLO_TO_HLO(SqrtOp);
+MAP_LHLO_TO_HLO(SubOp);
+MAP_LHLO_TO_HLO(TanhOp);
+MAP_LHLO_TO_HLO(TransposeOp);
+MAP_LHLO_TO_HLO(XorOp);
+
+#undef MAP_LHLO_TO_HLO
+
+}  // namespace lmhlo
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_LHLO_TO_HLO_OP_H_
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h
index 967036d7747849..857cc03ad2b9d5 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h
@@ -16,843 +16,21 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_LMHLO_TO_SCALAR_OP_H_
 #define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_LMHLO_TO_SCALAR_OP_H_
 
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/iterator_range.h"
-#include "mlir-hlo/Dialect/lhlo/IR/lhlo_ops.h"
-#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
-#include "mlir-hlo/Dialect/mhlo/transforms/map_hlo_to_lhlo_op.h"
-#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
-#include "mlir/Dialect/Complex/IR/Complex.h"
-#include "mlir/Dialect/Math/IR/Math.h"
-#include "mlir/Dialect/SCF/SCF.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/IR/BuiltinTypes.h"
-#include "mlir/IR/ImplicitLocOpBuilder.h"
-#include "mlir/IR/TypeUtilities.h"
+#include "mlir-hlo/Dialect/mhlo/transforms/map_lhlo_to_hlo_op.h"
+#include "mlir-hlo/Dialect/mhlo/transforms/map_mhlo_to_scalar_op.h"
 
 namespace mlir {
 namespace lmhlo {
-namespace impl {
 
-// A struct to map LhloBinaryOpTy type to the corresponding floating-point and
-// integer scalar operation types.
-template <typename LhloBinaryOpTy>
-struct LhloToScalarOp {
-  using FOp = void;
-  using IOp = void;
-  using UOp = void;
-  using COp = void;
-};
-
-template <>
-struct LhloToScalarOp<lmhlo::AddOp> {
-  using FOp = ::mlir::arith::AddFOp;
-  using IOp = ::mlir::arith::AddIOp;
-  using UOp = ::mlir::arith::AddIOp;
-  using COp = ::mlir::complex::AddOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::AndOp> {
-  using IOp = ::mlir::arith::AndIOp;
-  using UOp = ::mlir::arith::AndIOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::CompareOp> {
-  using FOp = ::mlir::arith::CmpFOp;
-  using IOp = ::mlir::arith::CmpIOp;
-  using UOp = ::mlir::arith::CmpIOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::CeilOp> {
-  using FOp = ::mlir::math::CeilOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::CosOp> {
-  using FOp = ::mlir::math::CosOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::DivOp> {
-  using FOp = ::mlir::arith::DivFOp;
-  using IOp = ::mlir::arith::DivSIOp;
-  using UOp = ::mlir::arith::DivUIOp;
-  using COp = ::mlir::complex::DivOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::ExpOp> {
-  using FOp = ::mlir::math::ExpOp;
-  using COp = ::mlir::complex::ExpOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::Expm1Op> {
-  using FOp = ::mlir::math::ExpM1Op;
-};
-template <>
-struct LhloToScalarOp<lmhlo::FloorOp> {
-  using FOp = ::mlir::math::FloorOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::MaxOp> {
-  using FOp = ::mlir::arith::MaxFOp;
-  using IOp = ::mlir::arith::MaxSIOp;
-  using UOp = ::mlir::arith::MaxUIOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::MinOp> {
-  using FOp = ::mlir::arith::MinFOp;
-  using IOp = ::mlir::arith::MinSIOp;
-  using UOp = ::mlir::arith::MinUIOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::LogOp> {
-  using FOp = ::mlir::math::LogOp;
-  using COp = ::mlir::complex::LogOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::Log1pOp> {
-  using FOp = ::mlir::math::Log1pOp;
-  using COp = ::mlir::complex::Log1pOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::MulOp> {
-  using FOp = ::mlir::arith::MulFOp;
-  using IOp = ::mlir::arith::MulIOp;
-  using UOp = ::mlir::arith::MulIOp;
-  using COp = ::mlir::complex::MulOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::OrOp> {
-  using IOp = ::mlir::arith::OrIOp;
-  using UOp = ::mlir::arith::OrIOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::RemOp> {
-  using FOp = ::mlir::arith::RemFOp;
-  using IOp = ::mlir::arith::RemSIOp;
-  using UOp = ::mlir::arith::RemUIOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::RsqrtOp> {
-  using FOp = ::mlir::math::RsqrtOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::SubOp> {
-  using FOp = ::mlir::arith::SubFOp;
-  using IOp = ::mlir::arith::SubIOp;
-  using UOp = ::mlir::arith::SubIOp;
-  using COp = ::mlir::complex::SubOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::SqrtOp> {
-  using FOp = ::mlir::math::SqrtOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::SinOp> {
-  using FOp = ::mlir::math::SinOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::ShiftLeftOp> {
-  using IOp = ::mlir::arith::ShLIOp;
-  using UOp = ::mlir::arith::ShLIOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::ShiftRightArithmeticOp> {
-  using IOp = ::mlir::arith::ShRSIOp;
-  using UOp = ::mlir::arith::ShRSIOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::ShiftRightLogicalOp> {
-  using IOp = ::mlir::arith::ShRUIOp;
-  using UOp = ::mlir::arith::ShRUIOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::Atan2Op> {
-  using FOp = ::mlir::math::Atan2Op;
-};
-template <>
-struct LhloToScalarOp<lmhlo::TanhOp> {
-  using FOp = ::mlir::math::TanhOp;
-};
-template <>
-struct LhloToScalarOp<lmhlo::XorOp> {
-  using IOp = ::mlir::arith::XOrIOp;
-  using UOp = ::mlir::arith::XOrIOp;
-};
-
-// Alias for the map from LHLO binary op type to STD floating-point op type.
-template <typename LhloOp>
-using ScalarFOp = typename LhloToScalarOp<LhloOp>::FOp;
-// Alias for the map from LHLO binary op type to STD signed integer op type.
-template <typename LhloOp>
-using ScalarIOp = typename LhloToScalarOp<LhloOp>::IOp;
-// Alias for the map from LHLO binary op type to STD unsigned integer op type.
-template <typename LhloOp>
-using ScalarUOp = typename LhloToScalarOp<LhloOp>::UOp;
-// Alias for the map from LHLO binary op type to STD complex op type.
-template <typename LhloOp>
-using ScalarCOp = typename LhloToScalarOp<LhloOp>::COp;
-
-template <typename... Args>
-struct MapLhloOpToScalarOpImpl {
-  Value operator()(Location loc, ArrayRef<Type> result_types,
-                   ArrayRef<Type> arg_types, ValueRange args, OpBuilder* b) {
-    return nullptr;
-  }
-};
-
-template <typename StdScalarOp>
-struct MapLhloOpToScalarOpImpl<StdScalarOp> {
-  Value operator()(Location loc, ArrayRef<Type> result_types,
-                   ArrayRef<Type> arg_types, ValueRange args, OpBuilder* b) {
-    return b->template create<StdScalarOp>(loc, result_types, args, mlir::None);
-  }
-};
-
-template <typename SupportedType, typename StdScalarOp, typename... Args>
-struct MapLhloOpToScalarOpImpl<SupportedType, StdScalarOp, Args...> {
-  Value operator()(Location loc, ArrayRef<Type> result_types,
-                   ArrayRef<Type> arg_types, ValueRange args, OpBuilder* b) {
-    Type element_type = getElementTypeOrSelf(arg_types.front());
-    if (SupportedType{}(element_type)) {
-      return b->template create<StdScalarOp>(loc, result_types, args,
-                                             mlir::None);
-    }
-    return MapLhloOpToScalarOpImpl<Args...>{}(loc, result_types, arg_types,
-                                              args, b);
-  }
-};
-
-template <typename SupportedType, typename... Args>
-struct MapLhloOpToScalarOpImpl<SupportedType, void, Args...> {
-  Value operator()(Location loc, ArrayRef<Type> result_types,
-                   ArrayRef<Type> arg_types, ValueRange args, OpBuilder* b) {
-    return MapLhloOpToScalarOpImpl<Args...>{}(loc, result_types, arg_types,
-                                              args, b);
-  }
-};
-
-struct isAnyIntegerType {
-  bool operator()(Type t) { return t.isa<IntegerType>(); }
-};
-
-struct isSignedIntegerType {
-  bool operator()(Type t) {
-    // Pretend that signless is signed. This will change eventually.
-    return t.isa<IntegerType>() && !t.isUnsignedInteger();
-  }
-};
-
-struct isUnsignedIntegerType {
-  bool operator()(Type t) { return t.isUnsignedInteger(); }
-};
-
-struct isFloatType {
-  bool operator()(Type t) { return t.isa<FloatType>(); }
-};
-
-struct isComplexType {
-  bool operator()(Type t) { return t.isa<ComplexType>(); }
-};
-
-template <template <typename T> class MapTy, typename OpTy,
-          typename PredTy = llvm::is_detected<MapTy, OpTy>>
-struct MapableIf {
-  using type = void;
-};
-template <template <typename T> class MapTy, typename OpTy>
-struct MapableIf<MapTy, OpTy, std::true_type> {
-  using type = MapTy<OpTy>;
-};
-
-// Inserts the computation that corresponds to the body of the loop for lowered
-// LHLO unary/binary op. Returns the value for the result.
-template <typename LhloOpTy>
-inline Value MapLhloOpToStdScalarOp(Location loc, ArrayRef<Type> result_types,
-                                    ArrayRef<Type> arg_types, ValueRange args,
-                                    OpBuilder* b) {
-  using ScalarIOpOrVoid = typename MapableIf<ScalarIOp, LhloOpTy>::type;
-  using ScalarUOpOrVoid = typename MapableIf<ScalarUOp, LhloOpTy>::type;
-  using ScalarFOpOrVoid = typename MapableIf<ScalarFOp, LhloOpTy>::type;
-  using ScalarCOpOrVoid = typename MapableIf<ScalarCOp, LhloOpTy>::type;
-  return MapLhloOpToScalarOpImpl<isSignedIntegerType, ScalarIOpOrVoid,
-                                 isUnsignedIntegerType, ScalarUOpOrVoid,
-                                 isFloatType, ScalarFOpOrVoid, isComplexType,
-                                 ScalarCOpOrVoid>{}(loc, result_types,
-                                                    arg_types, args, b);
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::AbsOp>(Location loc,
-                                                  ArrayRef<Type> result_types,
-                                                  ArrayRef<Type> arg_types,
-                                                  ValueRange args,
-                                                  OpBuilder* b) {
-  Type element_type = getElementTypeOrSelf(arg_types.front());
-  if (element_type.isa<FloatType>()) {
-    return MapLhloOpToScalarOpImpl<isFloatType, ::mlir::math::AbsOp>{}(
-        loc, result_types, arg_types, args, b);
-  }
-  if (element_type.isa<ComplexType>()) {
-    return MapLhloOpToScalarOpImpl<isComplexType, ::mlir::complex::AbsOp>{}(
-        loc, result_types, arg_types, args, b);
-  }
-  if (element_type.isSignlessInteger() || element_type.isSignedInteger()) {
-    // lmhlo.abs(x, result) ->  result = select((x > 0), x, sub(0, x))
-    Value lhs = args[0];
-    auto integer_type = element_type.dyn_cast<IntegerType>();
-
-    Value zero_intval = b->create<::mlir::arith::ConstantIntOp>(
-        loc, 0, integer_type.getWidth());
-    if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
-      zero_intval = b->create<::mlir::SplatOp>(loc, vec_type, zero_intval);
-    }
-    auto lhs_gt_zero = b->create<ScalarIOp<CompareOp>>(
-        loc, arith::CmpIPredicate::sge, lhs, zero_intval);
-    auto neg_val = b->create<ScalarIOp<lmhlo::SubOp>>(loc, zero_intval, lhs);
-    return b->create<::mlir::SelectOp>(loc, lhs_gt_zero, lhs, neg_val);
-  }
-  return nullptr;
-}
-
-template <typename PredicateType>
-inline Optional<PredicateType> getCmpPredicate(StringRef, bool) {
-  return llvm::None;
-}
-
-template <>
-inline Optional<arith::CmpFPredicate> getCmpPredicate<arith::CmpFPredicate>(
-    StringRef comparison_direction, bool is_signed) {
-  assert(is_signed && "cannot have an unsigned float!");
-  return llvm::StringSwitch<Optional<arith::CmpFPredicate>>(
-             comparison_direction)
-      .Case("EQ", arith::CmpFPredicate::OEQ)
-      .Case("NE", arith::CmpFPredicate::UNE)
-      .Case("GE", arith::CmpFPredicate::OGE)
-      .Case("GT", arith::CmpFPredicate::OGT)
-      .Case("LE", arith::CmpFPredicate::OLE)
-      .Case("LT", arith::CmpFPredicate::OLT)
-      .Default(llvm::None);
-}
-
-template <>
-inline Optional<arith::CmpIPredicate> getCmpPredicate<arith::CmpIPredicate>(
-    StringRef comparison_direction, bool is_signed) {
-  return llvm::StringSwitch<Optional<arith::CmpIPredicate>>(
-             comparison_direction)
-      .Case("EQ", arith::CmpIPredicate::eq)
-      .Case("NE", arith::CmpIPredicate::ne)
-      .Case("GE",
-            is_signed ? arith::CmpIPredicate::sge : arith::CmpIPredicate::uge)
-      .Case("GT",
-            is_signed ? arith::CmpIPredicate::sgt : arith::CmpIPredicate::ugt)
-      .Case("LE",
-            is_signed ? arith::CmpIPredicate::sle : arith::CmpIPredicate::ule)
-      .Case("LT",
-            is_signed ? arith::CmpIPredicate::slt : arith::CmpIPredicate::ult)
-      .Default(llvm::None);
-}
-
-template <typename CompareOpTy>
-inline Value MapCompareOpToStdScalarOp(Location loc,
-                                       StringRef comparison_direction,
-                                       ArrayRef<Type> result_types,
-                                       ArrayRef<Type> arg_types,
-                                       ValueRange args, OpBuilder* b) {
-  const auto& lhs = args[0];
-  const auto& rhs = args[1];
-  Type element_type = getElementTypeOrSelf(arg_types.front());
-  if (element_type.isa<IntegerType>()) {
-    Optional<arith::CmpIPredicate> predicate =
-        getCmpPredicate<arith::CmpIPredicate>(
-            comparison_direction, !element_type.isUnsignedInteger());
-    assert(predicate.hasValue() && "expected valid comparison direction");
-    return b->create<ScalarIOp<CompareOpTy>>(loc, predicate.getValue(), lhs,
-                                             rhs);
-  }
-  if (element_type.isa<FloatType>()) {
-    Optional<arith::CmpFPredicate> predicate =
-        getCmpPredicate<arith::CmpFPredicate>(comparison_direction,
-                                              /*is_signed=*/true);
-    assert(predicate.hasValue() && "expected valid comparison direction");
-    return b->create<ScalarFOp<CompareOpTy>>(loc, predicate.getValue(), lhs,
-                                             rhs);
-  }
-  if (auto complex_type = element_type.dyn_cast<ComplexType>()) {
-    if (complex_type.getElementType().isa<FloatType>()) {
-      if (comparison_direction == "EQ") {
-        return b->create<complex::EqualOp>(loc, lhs, rhs);
-      }
-      if (comparison_direction == "NE") {
-        return b->create<complex::NotEqualOp>(loc, lhs, rhs);
-      }
-    }
-  }
-  return nullptr;
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::CopyOp>(Location loc,
-                                                   ArrayRef<Type> result_types,
-                                                   ArrayRef<Type> arg_types,
-                                                   ValueRange args,
-                                                   OpBuilder* b) {
-  return args.front();
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::ComplexOp>(
-    Location loc, ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
-    ValueRange args, OpBuilder* b) {
-  return MapLhloOpToScalarOpImpl<complex::CreateOp>{}(loc, result_types,
-                                                      arg_types, args, b);
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::RealOp>(Location loc,
-                                                   ArrayRef<Type> result_types,
-                                                   ArrayRef<Type> arg_types,
-                                                   ValueRange args,
-                                                   OpBuilder* b) {
-  return MapLhloOpToScalarOpImpl<complex::ReOp>{}(loc, result_types, arg_types,
-                                                  args, b);
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::ImagOp>(Location loc,
-                                                   ArrayRef<Type> result_types,
-                                                   ArrayRef<Type> arg_types,
-                                                   ValueRange args,
-                                                   OpBuilder* b) {
-  return MapLhloOpToScalarOpImpl<complex::ImOp>{}(loc, result_types, arg_types,
-                                                  args, b);
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::ConvertOp>(
-    Location loc, ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
-    ValueRange args, OpBuilder* b) {
-  Type sourceType = getElementTypeOrSelf(arg_types.front());
-  Type targetType = getElementTypeOrSelf(result_types.front());
-  Type convertedSourceType = getElementTypeOrSelf(args.front());
-
-  // A boolean value is considered to be unsigned when converting to
-  // floating-point. Otherwise, it will become `-1`.
-  if ((sourceType.isInteger(/*width=*/1) || sourceType.isUnsignedInteger()) &&
-      mlir::arith::UIToFPOp::areCastCompatible(convertedSourceType,
-                                               targetType)) {
-    return b->create<mlir::arith::UIToFPOp>(loc, result_types, args,
-                                            mlir::None);
-  } else if (mlir::arith::SIToFPOp::areCastCompatible(convertedSourceType,
-                                                      targetType)) {
-    return b->create<mlir::arith::SIToFPOp>(loc, result_types, args,
-                                            mlir::None);
-  } else if (sourceType.isa<FloatType>() && targetType.isa<FloatType>()) {
-    FloatType src = sourceType.cast<FloatType>();
-    FloatType res = targetType.cast<FloatType>();
-    if (src.getWidth() > res.getWidth()) {
-      return b->create<mlir::arith::TruncFOp>(loc, result_types, args,
-                                              mlir::None);
-    } else if (src.getWidth() < res.getWidth()) {
-      return b->create<mlir::arith::ExtFOp>(loc, result_types, args,
-                                            mlir::None);
-    }
-    // No conversion is needed for the same width floats
-    return args.front();
-  }
-  if (targetType.isInteger(/*width=*/1)) {
-    // When casting to bool, we need to compare whether the value is equal to
-    // zero.
-    if (sourceType.isSignlessInteger() || sourceType.isUnsignedInteger()) {
-      Value zero_intval = b->create<::mlir::arith::ConstantIntOp>(
-          loc, 0, sourceType.cast<IntegerType>().getWidth());
-      if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
-        zero_intval = b->create<::mlir::SplatOp>(loc, vec_type, zero_intval);
-      }
-      return b->create<mlir::arith::CmpIOp>(loc, arith::CmpIPredicate::ne,
-                                            args.front(), zero_intval);
-    } else if (sourceType.isa<FloatType>()) {
-      Value zero =
-          b->create<arith::ConstantOp>(loc, b->getFloatAttr(sourceType, 0.0));
-      if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
-        zero = b->create<::mlir::SplatOp>(loc, vec_type, zero);
-      }
-      return b->create<mlir::arith::CmpFOp>(loc, arith::CmpFPredicate::UNE,
-                                            args.front(), zero);
-    }
-  }
-  if (sourceType.isa<IntegerType>() && targetType.isa<IntegerType>()) {
-    IntegerType src = sourceType.cast<IntegerType>();
-    IntegerType res = targetType.cast<IntegerType>();
-    if (src.getWidth() > res.getWidth()) {
-      return b->create<mlir::arith::TruncIOp>(loc, result_types, args,
-                                              mlir::None);
-    } else if (src.getWidth() < res.getWidth()) {
-      // Special case boolean values, so they get casted to `1` instead of `-1`.
-      if (src.isUnsignedInteger() || src.getWidth() == 1) {
-        return b->create<mlir::arith::ExtUIOp>(loc, result_types, args,
-                                               mlir::None);
-      }
-      return b->create<mlir::arith::ExtSIOp>(loc, result_types, args,
-                                             mlir::None);
-    }
-    // No conversion is needed for the same width integers
-    return args.front();
-  }
-  if (mlir::arith::FPToSIOp::areCastCompatible(convertedSourceType,
-                                               targetType)) {
-    return b->create<mlir::arith::FPToSIOp>(loc, result_types, args,
-                                            mlir::None);
-  }
-  return nullptr;
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::BitcastConvertOp>(
-    Location loc, ArrayRef<Type> result_types, ArrayRef<Type>, ValueRange args,
-    OpBuilder* b) {
-  return b->create<mlir::arith::BitcastOp>(loc, result_types, args);
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::DotOp>(Location loc,
-                                                  ArrayRef<Type> result_types,
-                                                  ArrayRef<Type> arg_types,
-                                                  ValueRange args,
-                                                  OpBuilder* b) {
-  // Dot Op converter from lhlo to affine only accepts float and integer types.
-  const auto& lhs = args[0];
-  const auto& rhs = args[1];
-  const auto& result = args[2];
-  Type element_type = lhs.getType();
-  if (element_type.isa<FloatType>()) {
-    Value float_mul =
-        MapLhloOpToScalarOpImpl<isFloatType, ::mlir::arith::MulFOp>{}(
-            loc, result_types, arg_types, {lhs, rhs}, b);
-    return MapLhloOpToScalarOpImpl<isFloatType, ::mlir::arith::AddFOp>{}(
-        loc, result_types, arg_types, {float_mul, result}, b);
-  }
-  if (element_type.isa<IntegerType>()) {
-    Value int_mul =
-        MapLhloOpToScalarOpImpl<isAnyIntegerType, ::mlir::arith::MulIOp>{}(
-            loc, result_types, arg_types, {lhs, rhs}, b);
-    return MapLhloOpToScalarOpImpl<isAnyIntegerType, ::mlir::arith::AddIOp>{}(
-        loc, result_types, arg_types, {int_mul, result}, b);
-  }
-  return nullptr;
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::IsFiniteOp>(
-    Location loc, ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
-    ValueRange args, OpBuilder* b) {
-  if (args[0].getType().isa<FloatType>()) {
-    auto pos_inf = APFloat::getInf(
-        args[0].getType().cast<FloatType>().getFloatSemantics());
-    auto const_pos_inf = b->create<arith::ConstantOp>(
-        loc, b->getFloatAttr(args[0].getType(), pos_inf));
-    Value abs_x = b->create<::mlir::math::AbsOp>(loc, args[0]);
-    return b->create<::mlir::arith::CmpFOp>(loc, arith::CmpFPredicate::ONE,
-                                            abs_x, const_pos_inf);
-  }
-  return nullptr;
-}
-
-/// Implements the conversion of HLO op to scalar op (to use within region of a
-/// linalg.generic op) for compare-select style operations like min/max.
-template <typename... Args>
-struct CompareSelectOpToStdScalarOp {
-  static Value map(Location loc, StringRef comparison_direction,
-                   ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
-                   ValueRange args, OpBuilder* b) {
-    return nullptr;
-  }
-};
-
-/// Specialization which allows converting to a comparison operation in standard
-/// dialect with a given predicate based on the element type of the operand.
-template <typename SupportedType, typename StdCompareOp, typename Predicate,
-          typename... Args>
-struct CompareSelectOpToStdScalarOp<SupportedType, StdCompareOp, Predicate,
-                                    Args...> {
-  static Value map(Location loc, StringRef comparison_direction,
-                   ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
-                   ValueRange args, OpBuilder* b) {
-    Type element_type = getElementTypeOrSelf(arg_types.front());
-    if (element_type.isa<SupportedType>()) {
-      auto predicate = getCmpPredicate<Predicate>(
-          comparison_direction, !element_type.isUnsignedInteger());
-      assert(predicate.hasValue() && "expected valid comparison direction");
-      auto cmp = b->template create<StdCompareOp>(loc, predicate.getValue(),
-                                                  args[0], args[1]);
-      return b->create<::mlir::SelectOp>(loc, cmp, args[0], args[1]);
-    }
-    return CompareSelectOpToStdScalarOp<Args...>::map(
-        loc, comparison_direction, result_types, arg_types, args, b);
-  }
-};
-
-inline Value LhloAlwaysPropagateNaN(Value v, ValueRange args, Location loc,
-                                    OpBuilder* b) {
-  Type element_type = getElementTypeOrSelf(args.front().getType());
-  if (auto float_type = element_type.dyn_cast<FloatType>()) {
-    Value isnan = b->create<mlir::arith::CmpFOp>(loc, arith::CmpFPredicate::UNO,
-                                                 args[0], args[1]);
-
-    auto nan_apfloat = APFloat::getQNaN(float_type.getFloatSemantics());
-    Value nan =
-        b->create<mlir::arith::ConstantFloatOp>(loc, nan_apfloat, float_type);
-    if (VectorType vec_type = args[0].getType().dyn_cast<VectorType>()) {
-      nan = b->create<::mlir::SplatOp>(loc, vec_type, nan);
-    }
-    v = b->create<mlir::SelectOp>(loc, isnan, nan, v);
-  }
-  return v;
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::LogisticOp>(
-    Location loc, ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
-    ValueRange args, OpBuilder* b) {
-  auto ty = result_types.front().cast<FloatType>();
-  Value one = b->create<arith::ConstantOp>(loc, b->getFloatAttr(ty, 1.0));
-  Value x = args.front();
-  Value neg_x = b->create<arith::NegFOp>(loc, x);
-  Value exp_neg_x = b->create<::mlir::math::ExpOp>(loc, neg_x);
-  Value one_add_exp_neg_x = b->create<arith::AddFOp>(loc, one, exp_neg_x);
-  return b->create<arith::DivFOp>(loc, one, one_add_exp_neg_x);
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::ClampOp>(Location loc,
-                                                    ArrayRef<Type> result_types,
-                                                    ArrayRef<Type> arg_types,
-                                                    ValueRange args,
-                                                    OpBuilder* b) {
-  assert(args.size() == 3 && "expected 3 arguments");
-  Value lb = args[0];
-  Value x = args[1];
-  Value ub = args[2];
-
-  // clamp(lb, x, ub) = max(min(x, ub), lb)
-  Value min_x_ub = MapLhloOpToStdScalarOp<lmhlo::MinOp>(loc, result_types,
-                                                        arg_types, {x, ub}, b);
-  return MapLhloOpToStdScalarOp<lmhlo::MaxOp>(loc, result_types, arg_types,
-                                              {min_x_ub, lb}, b);
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::NegOp>(Location loc,
-                                                  ArrayRef<Type> result_types,
-                                                  ArrayRef<Type> arg_types,
-                                                  ValueRange args,
-                                                  OpBuilder* b) {
-  Type element_type = getElementTypeOrSelf(args.front().getType());
-  if (element_type.isa<ComplexType, FloatType>()) {
-    return MapLhloOpToScalarOpImpl<isFloatType, ::mlir::arith::NegFOp,
-                                   isComplexType, ::mlir::complex::NegOp>{}(
-        loc, result_types, arg_types, args, b);
-  }
-  if (element_type.isa<IntegerType>()) {
-    // lmhlo.neg(x, result) -> result = sub(0, x)
-    Value lhs = args[0];
-    auto integer_type = element_type.dyn_cast<IntegerType>();
-
-    Value zero_intval = b->create<::mlir::arith::ConstantIntOp>(
-        loc, 0, integer_type.getWidth());
-    if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
-      zero_intval = b->create<::mlir::SplatOp>(loc, vec_type, zero_intval);
-    }
-    return b->create<ScalarIOp<lmhlo::SubOp>>(loc, zero_intval, lhs);
-  }
-  return nullptr;
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::NotOp>(Location loc,
-                                                  ArrayRef<Type> result_types,
-                                                  ArrayRef<Type> arg_types,
-                                                  ValueRange args,
-                                                  OpBuilder* b) {
-  Type element_type = getElementTypeOrSelf(args.front().getType());
-  if (auto integer_type = element_type.dyn_cast<IntegerType>()) {
-    // lmhlo.not(x) -> x ^ -1
-    Value all_ones = b->create<::mlir::arith::ConstantIntOp>(
-        loc, -1, integer_type.getWidth());
-    if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
-      all_ones = b->create<::mlir::SplatOp>(loc, vec_type, all_ones);
-    }
-    return b->create<::mlir::arith::XOrIOp>(loc, all_ones, args[0]);
-  }
-  return nullptr;
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::PowOp>(Location loc,
-                                                  ArrayRef<Type> result_types,
-                                                  ArrayRef<Type> arg_types,
-                                                  ValueRange args,
-                                                  OpBuilder* b) {
-  lmhlo::PowOp::Adaptor adaptor(args);
-  auto lb = ImplicitLocOpBuilder(loc, *b);
-  // Floating point can use std::powf
-  auto result_type = result_types.front();
-  if (result_type.isa<::mlir::FloatType>())
-    return MapLhloOpToScalarOpImpl<::mlir::math::PowFOp>{}(loc, result_types,
-                                                           arg_types, args, b);
-
-  assert(result_type.isa<::mlir::IntegerType>() &&
-         "only float and integer `pow` is supported right now");
-
-  // Exponentiation by squaring:
-  // https://en.wikipedia.org/wiki/Exponentiation_by_squaring;
-  Value neg_one =
-      lb.create<arith::ConstantOp>(lb.getIntegerAttr(result_type, -1));
-  Value zero = lb.create<arith::ConstantOp>(lb.getIntegerAttr(result_type, 0));
-  Value one = lb.create<arith::ConstantOp>(lb.getIntegerAttr(result_type, 1));
-  Value two = lb.create<arith::ConstantOp>(lb.getIntegerAttr(result_type, 2));
-  Value step = lb.create<arith::ConstantIndexOp>(1);
-  Value lowerBound = lb.create<arith::ConstantIndexOp>(0);
-  // Everything else would overflow for any exponent > 1, as 2^64
-  // is the larget possible exponent for a 64-bit integer, and
-  // that's 1 << 6.
-  Value upperBound = lb.create<arith::ConstantIndexOp>(6);
-  auto original_base = adaptor.lhs();
-  auto original_exponent = adaptor.rhs();
-
-  Value accum =
-      lb.create<scf::ForOp>(
-            lowerBound, upperBound, step,
-            SmallVector<Value>({one, original_base, original_exponent}),
-            [&](OpBuilder& b, Location, Value v, ValueRange iters) {
-              Value accum = iters[0];
-              Value base = iters[1];
-              Value exponent = iters[2];
-
-              Value condition = b.create<arith::CmpIOp>(
-                  loc, arith::CmpIPredicate::eq,
-                  b.create<::mlir::arith::AndIOp>(loc, exponent, one), one);
-              Value multiplied =
-                  b.create<::mlir::arith::MulIOp>(loc, accum, base);
-              accum =
-                  b.create<::mlir::SelectOp>(loc, condition, multiplied, accum);
-              base = b.create<::mlir::arith::MulIOp>(loc, base, base);
-              exponent = b.create<::mlir::arith::ShRUIOp>(loc, exponent, one);
-              b.create<scf::YieldOp>(
-                  loc, SmallVector<Value>({accum, base, exponent}));
-            })
-          .getResult(0);
-
-  Value rhs_is_even = lb.create<arith::CmpIOp>(
-      arith::CmpIPredicate::eq, lb.create<arith::RemSIOp>(adaptor.rhs(), two),
-      zero);
-  Value rhs_is_negative =
-      lb.create<arith::CmpIOp>(arith::CmpIPredicate::slt, adaptor.rhs(), zero);
-  Value lhs_is_one =
-      lb.create<arith::CmpIOp>(arith::CmpIPredicate::eq, adaptor.lhs(), one);
-  Value lhs_is_neg_one = lb.create<arith::CmpIOp>(arith::CmpIPredicate::eq,
-                                                  adaptor.lhs(), neg_one);
-
-  // The accum is correct when the rhs is non-negative. When rhs is
-  // negative, we return 0 for integer, with the exception of lhs values of 1
-  // and -1 which have integer results for negative exponents. Specifically, the
-  // calulation is the following:
-  //
-  // - Return accum if the rhs is not negative.
-  // - Return 1 or -1 depending on the parity of rhs when the lhs is -1.
-  // - Return 1 if lhs is 1.
-  // - Else return 0.
-  Value if_lhs_is_one = lb.create<::mlir::SelectOp>(lhs_is_one, one, zero);
-  Value if_lhs_is_neg_one = lb.create<::mlir::SelectOp>(
-      lhs_is_neg_one, lb.create<::mlir::SelectOp>(rhs_is_even, one, neg_one),
-      if_lhs_is_one);
-  return lb.create<::mlir::SelectOp>(rhs_is_negative, if_lhs_is_neg_one, accum);
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::SelectOp>(
-    Location loc, ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
-    ValueRange args, OpBuilder* b) {
-  return MapLhloOpToScalarOpImpl<::mlir::SelectOp>{}(loc, result_types,
-                                                     arg_types, args, b);
-}
-
-template <>
-inline Value MapLhloOpToStdScalarOp<lmhlo::SignOp>(Location loc,
-                                                   ArrayRef<Type> result_types,
-                                                   ArrayRef<Type> arg_types,
-                                                   ValueRange args,
-                                                   OpBuilder* b) {
-  Type element_type = getElementTypeOrSelf(args.front().getType());
-  if (auto float_type = element_type.dyn_cast<FloatType>()) {
-    bool ignored;
-    APFloat zero_apfloat(0.0f);
-    zero_apfloat.convert(float_type.getFloatSemantics(),
-                         APFloat::rmNearestTiesToEven, &ignored);
-    Value zero =
-        b->create<mlir::arith::ConstantFloatOp>(loc, zero_apfloat, float_type);
-    if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
-      zero = b->create<::mlir::SplatOp>(loc, vec_type, zero);
-    }
-    Value ne0_i1 = b->create<::mlir::arith::CmpFOp>(
-        loc, arith::CmpFPredicate::ONE, args[0], zero);
-    Value ne0_float =
-        b->create<::mlir::arith::UIToFPOp>(loc, ne0_i1, zero.getType());
-    Value copy_sign = b->create<::mlir::math::CopySignOp>(loc, result_types,
-                                                          ne0_float, args[0]);
-    auto is_nan = b->create<::mlir::arith::CmpFOp>(
-        loc, arith::CmpFPredicate::UNO, args[0], args[0]);
-    return b->create<::mlir::SelectOp>(loc, is_nan, args[0], copy_sign);
-  } else if (auto integer_type = element_type.dyn_cast<IntegerType>()) {
-    // sign(x) = x == 0 ? 0 : ((x s>> 31) | 1)
-    Value zero = b->create<::mlir::arith::ConstantIntOp>(
-        loc, 0, integer_type.getWidth());
-    Value bitwidth_minus_one = b->create<::mlir::arith::ConstantIntOp>(
-        loc, integer_type.getWidth() - 1, integer_type.getWidth());
-    Value one = b->create<::mlir::arith::ConstantIntOp>(
-        loc, 1, integer_type.getWidth());
-    if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
-      zero = b->create<::mlir::SplatOp>(loc, vec_type, zero);
-      bitwidth_minus_one =
-          b->create<::mlir::SplatOp>(loc, vec_type, bitwidth_minus_one);
-      one = b->create<::mlir::SplatOp>(loc, vec_type, one);
-    }
-    Value cmp = b->create<::mlir::arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
-                                                 args[0], zero);
-    Value ashr =
-        b->create<::mlir::arith::ShRSIOp>(loc, args[0], bitwidth_minus_one);
-    Value or_op = b->create<::mlir::arith::OrIOp>(loc, ashr, one);
-    return b->create<::mlir::SelectOp>(loc, cmp, zero, or_op);
-  } else if (element_type.isa<ComplexType>()) {
-    return b->create<::mlir::complex::SignOp>(loc, element_type, args.front());
-  }
-  return nullptr;
-}
-
-}  // namespace impl
-
-struct HloOpToStdScalarOp {
+struct LhloOpToStdScalarOp {
   // Implementation for LHLO ops except lmhlo::CompareOp.
-  template <typename HloOpTy, typename LhloOpTy = HloOpTy,
-            typename = std::enable_if_t<
-                !std::is_same<LhloOpTy, lmhlo::CompareOp>::value &&
-                std::is_same<typename mhlo::HloToLhloOp<LhloOpTy>,
-                             std::false_type>::value>>
-  static Value map(HloOpTy op, ArrayRef<Type> result_types, ValueRange args,
-                   OpBuilder* b, unsigned i = 0) {
-    return impl::MapLhloOpToStdScalarOp<LhloOpTy>(
-        op.getLoc(), result_types, llvm::to_vector<4>(op->getOperandTypes()),
-        args, b);
-  }
-
-  // Implementation for HLO ops except mhlo::CompareOp.
-  template <typename HloOpTy, typename LhloOpTy = mhlo::HloToLhloOp<HloOpTy>,
+  template <typename LhloOpTy, typename MhloOpTy = lmhlo::LhloToHloOp<LhloOpTy>,
             typename = std::enable_if_t<
                 !std::is_same<LhloOpTy, lmhlo::CompareOp>::value &&
-                !std::is_same<LhloOpTy, std::false_type>::value>>
-  static Value map(HloOpTy op, ArrayRef<Type> result_types, ValueRange args,
+                !std::is_same<MhloOpTy, std::false_type>::value>>
+  static Value map(LhloOpTy op, ArrayRef<Type> result_types, ValueRange args,
                    OpBuilder* b, int i = 0) {
-    return impl::MapLhloOpToStdScalarOp<LhloOpTy>(
+    return mlir::mhlo::impl::MapMhloOpToStdScalarOp<MhloOpTy>(
         op.getLoc(), result_types, llvm::to_vector<4>(op->getOperandTypes()),
         args, b);
   }
@@ -863,44 +41,21 @@ struct HloOpToStdScalarOp {
   static Value map(lmhlo::CompareOp op, ArrayRef<Type> result_types,
                    ValueRange args, OpBuilder* b) {
     auto comparison_direction = op.comparison_direction();
-    return impl::MapCompareOpToStdScalarOp<lmhlo::CompareOp>(
-        op.getLoc(), comparison_direction, result_types,
-        llvm::to_vector<4>(op->getOperandTypes()), args, b);
-  }
-
-  // Implementation for mhlo::CompareOp.
-  template <typename HloOpTy,
-            typename =
-                std::enable_if_t<std::is_same<HloOpTy, mhlo::CompareOp>::value>>
-  static Value map(mhlo::CompareOp op, ArrayRef<Type> result_types,
-                   ValueRange args, OpBuilder* b) {
-    auto comparison_direction = op.comparison_direction();
-    return impl::MapCompareOpToStdScalarOp<lmhlo::CompareOp>(
+    return mlir::mhlo::impl::MapCompareOpToStdScalarOp<mhlo::CompareOp>(
         op.getLoc(), comparison_direction, result_types,
         llvm::to_vector<4>(op->getOperandTypes()), args, b);
   }
 
   // Implementation for LHLO ops except lmhlo::CompareOp.
-  template <typename LhloOpTy,
+  template <typename LhloOpTy, typename MhloOpTy = lmhlo::LhloToHloOp<LhloOpTy>,
             typename = std::enable_if_t<
                 !std::is_same<LhloOpTy, lmhlo::CompareOp>::value &&
-                std::is_same<typename mhlo::HloToLhloOp<LhloOpTy>,
-                             std::false_type>::value>>
+                !std::is_same<MhloOpTy, std::false_type>::value>>
   static Value map(Location loc, ArrayRef<Type> result_types,
                    ArrayRef<Type> arg_types, ValueRange args, OpBuilder* b,
                    unsigned i = 0) {
-    return impl::MapLhloOpToStdScalarOp<LhloOpTy>(loc, result_types, arg_types,
-                                                  args, b);
-  }
-
-  // Implementation for lmhlo::CompareOp.
-  template <typename LhloOpTy, typename = std::enable_if_t<std::is_same<
-                                   LhloOpTy, lmhlo::CompareOp>::value>>
-  static Value map(Location loc, StringRef comparison_direction,
-                   ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
-                   ValueRange args, OpBuilder* b) {
-    return impl::MapCompareOpToStdScalarOp<lmhlo::CompareOp>(
-        loc, comparison_direction, result_types, arg_types, args, b);
+    return mlir::mhlo::impl::MapMhloOpToStdScalarOp<MhloOpTy>(
+        loc, result_types, arg_types, args, b);
   }
 };
 
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_mhlo_to_scalar_op.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_mhlo_to_scalar_op.h
new file mode 100644
index 00000000000000..b3ad77f85574ba
--- /dev/null
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/map_mhlo_to_scalar_op.h
@@ -0,0 +1,879 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_MHLO_TO_SCALAR_OP_H_
+#define TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_MHLO_TO_SCALAR_OP_H_
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/iterator_range.h"
+#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
+#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Complex/IR/Complex.h"
+#include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/IR/TypeUtilities.h"
+
+namespace mlir {
+namespace mhlo {
+namespace impl {
+
+// A struct to map MhloBinaryOpTy type to the corresponding floating-point and
+// integer scalar operation types.
+template <typename MhloBinaryOpTy>
+struct MhloToScalarOp {
+  using FOp = void;
+  using IOp = void;
+  using UOp = void;
+  using COp = void;
+};
+
+template <>
+struct MhloToScalarOp<mhlo::AddOp> {
+  using FOp = ::mlir::arith::AddFOp;
+  using IOp = ::mlir::arith::AddIOp;
+  using UOp = ::mlir::arith::AddIOp;
+  using COp = ::mlir::complex::AddOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::AndOp> {
+  using IOp = ::mlir::arith::AndIOp;
+  using UOp = ::mlir::arith::AndIOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::CompareOp> {
+  using FOp = ::mlir::arith::CmpFOp;
+  using IOp = ::mlir::arith::CmpIOp;
+  using UOp = ::mlir::arith::CmpIOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::CeilOp> {
+  using FOp = ::mlir::math::CeilOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::CosOp> {
+  using FOp = ::mlir::math::CosOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::DivOp> {
+  using FOp = ::mlir::arith::DivFOp;
+  using IOp = ::mlir::arith::DivSIOp;
+  using UOp = ::mlir::arith::DivUIOp;
+  using COp = ::mlir::complex::DivOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::ExpOp> {
+  using FOp = ::mlir::math::ExpOp;
+  using COp = ::mlir::complex::ExpOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::Expm1Op> {
+  using FOp = ::mlir::math::ExpM1Op;
+};
+template <>
+struct MhloToScalarOp<mhlo::FloorOp> {
+  using FOp = ::mlir::math::FloorOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::MaxOp> {
+  using FOp = ::mlir::arith::MaxFOp;
+  using IOp = ::mlir::arith::MaxSIOp;
+  using UOp = ::mlir::arith::MaxUIOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::MinOp> {
+  using FOp = ::mlir::arith::MinFOp;
+  using IOp = ::mlir::arith::MinSIOp;
+  using UOp = ::mlir::arith::MinUIOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::LogOp> {
+  using FOp = ::mlir::math::LogOp;
+  using COp = ::mlir::complex::LogOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::Log1pOp> {
+  using FOp = ::mlir::math::Log1pOp;
+  using COp = ::mlir::complex::Log1pOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::MulOp> {
+  using FOp = ::mlir::arith::MulFOp;
+  using IOp = ::mlir::arith::MulIOp;
+  using UOp = ::mlir::arith::MulIOp;
+  using COp = ::mlir::complex::MulOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::OrOp> {
+  using IOp = ::mlir::arith::OrIOp;
+  using UOp = ::mlir::arith::OrIOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::RemOp> {
+  using FOp = ::mlir::arith::RemFOp;
+  using IOp = ::mlir::arith::RemSIOp;
+  using UOp = ::mlir::arith::RemUIOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::RsqrtOp> {
+  using FOp = ::mlir::math::RsqrtOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::SubOp> {
+  using FOp = ::mlir::arith::SubFOp;
+  using IOp = ::mlir::arith::SubIOp;
+  using UOp = ::mlir::arith::SubIOp;
+  using COp = ::mlir::complex::SubOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::SqrtOp> {
+  using FOp = ::mlir::math::SqrtOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::SinOp> {
+  using FOp = ::mlir::math::SinOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::ShiftLeftOp> {
+  using IOp = ::mlir::arith::ShLIOp;
+  using UOp = ::mlir::arith::ShLIOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::ShiftRightArithmeticOp> {
+  using IOp = ::mlir::arith::ShRSIOp;
+  using UOp = ::mlir::arith::ShRSIOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::ShiftRightLogicalOp> {
+  using IOp = ::mlir::arith::ShRUIOp;
+  using UOp = ::mlir::arith::ShRUIOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::Atan2Op> {
+  using FOp = ::mlir::math::Atan2Op;
+};
+template <>
+struct MhloToScalarOp<mhlo::TanhOp> {
+  using FOp = ::mlir::math::TanhOp;
+};
+template <>
+struct MhloToScalarOp<mhlo::XorOp> {
+  using IOp = ::mlir::arith::XOrIOp;
+  using UOp = ::mlir::arith::XOrIOp;
+};
+
+// Alias for the map from MHLO binary op type to STD floating-point op type.
+template <typename MhloOp>
+using ScalarFOp = typename MhloToScalarOp<MhloOp>::FOp;
+// Alias for the map from MHLO binary op type to STD signed integer op type.
+template <typename MhloOp>
+using ScalarIOp = typename MhloToScalarOp<MhloOp>::IOp;
+// Alias for the map from MHLO binary op type to STD unsigned integer op type.
+template <typename MhloOp>
+using ScalarUOp = typename MhloToScalarOp<MhloOp>::UOp;
+// Alias for the map from MHLO binary op type to STD complex op type.
+template <typename MhloOp>
+using ScalarCOp = typename MhloToScalarOp<MhloOp>::COp;
+
+template <typename... Args>
+struct MapMhloOpToScalarOpImpl {
+  Value operator()(Location loc, ArrayRef<Type> result_types,
+                   ArrayRef<Type> arg_types, ValueRange args, OpBuilder* b) {
+    return nullptr;
+  }
+};
+
+template <typename StdScalarOp>
+struct MapMhloOpToScalarOpImpl<StdScalarOp> {
+  Value operator()(Location loc, ArrayRef<Type> result_types,
+                   ArrayRef<Type> arg_types, ValueRange args, OpBuilder* b) {
+    return b->template create<StdScalarOp>(loc, result_types, args, mlir::None);
+  }
+};
+
+template <typename SupportedType, typename StdScalarOp, typename... Args>
+struct MapMhloOpToScalarOpImpl<SupportedType, StdScalarOp, Args...> {
+  Value operator()(Location loc, ArrayRef<Type> result_types,
+                   ArrayRef<Type> arg_types, ValueRange args, OpBuilder* b) {
+    Type element_type = getElementTypeOrSelf(arg_types.front());
+    if (SupportedType{}(element_type)) {
+      return b->template create<StdScalarOp>(loc, result_types, args,
+                                             mlir::None);
+    }
+    return MapMhloOpToScalarOpImpl<Args...>{}(loc, result_types, arg_types,
+                                              args, b);
+  }
+};
+
+template <typename SupportedType, typename... Args>
+struct MapMhloOpToScalarOpImpl<SupportedType, void, Args...> {
+  Value operator()(Location loc, ArrayRef<Type> result_types,
+                   ArrayRef<Type> arg_types, ValueRange args, OpBuilder* b) {
+    return MapMhloOpToScalarOpImpl<Args...>{}(loc, result_types, arg_types,
+                                              args, b);
+  }
+};
+
+struct isAnyIntegerType {
+  bool operator()(Type t) { return t.isa<IntegerType>(); }
+};
+
+struct isSignedIntegerType {
+  bool operator()(Type t) {
+    // Pretend that signless is signed. This will change eventually.
+    return t.isa<IntegerType>() && !t.isUnsignedInteger();
+  }
+};
+
+struct isUnsignedIntegerType {
+  bool operator()(Type t) { return t.isUnsignedInteger(); }
+};
+
+struct isFloatType {
+  bool operator()(Type t) { return t.isa<FloatType>(); }
+};
+
+struct isComplexType {
+  bool operator()(Type t) { return t.isa<ComplexType>(); }
+};
+
+template <template <typename T> class MapTy, typename OpTy,
+          typename PredTy = llvm::is_detected<MapTy, OpTy>>
+struct MapableIf {
+  using type = void;
+};
+template <template <typename T> class MapTy, typename OpTy>
+struct MapableIf<MapTy, OpTy, std::true_type> {
+  using type = MapTy<OpTy>;
+};
+
+// Inserts the computation that corresponds to the body of the loop for lowered
+// MHLO unary/binary op. Returns the value for the result.
+template <typename MhloOpTy>
+inline Value MapMhloOpToStdScalarOp(Location loc, ArrayRef<Type> result_types,
+                                    ArrayRef<Type> arg_types, ValueRange args,
+                                    OpBuilder* b) {
+  using ScalarIOpOrVoid = typename MapableIf<ScalarIOp, MhloOpTy>::type;
+  using ScalarUOpOrVoid = typename MapableIf<ScalarUOp, MhloOpTy>::type;
+  using ScalarFOpOrVoid = typename MapableIf<ScalarFOp, MhloOpTy>::type;
+  using ScalarCOpOrVoid = typename MapableIf<ScalarCOp, MhloOpTy>::type;
+  return MapMhloOpToScalarOpImpl<isSignedIntegerType, ScalarIOpOrVoid,
+                                 isUnsignedIntegerType, ScalarUOpOrVoid,
+                                 isFloatType, ScalarFOpOrVoid, isComplexType,
+                                 ScalarCOpOrVoid>{}(loc, result_types,
+                                                    arg_types, args, b);
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::AbsOp>(Location loc,
+                                                 ArrayRef<Type> result_types,
+                                                 ArrayRef<Type> arg_types,
+                                                 ValueRange args,
+                                                 OpBuilder* b) {
+  Type element_type = getElementTypeOrSelf(arg_types.front());
+  if (element_type.isa<FloatType>()) {
+    return MapMhloOpToScalarOpImpl<isFloatType, ::mlir::math::AbsOp>{}(
+        loc, result_types, arg_types, args, b);
+  }
+  if (element_type.isa<ComplexType>()) {
+    return MapMhloOpToScalarOpImpl<isComplexType, ::mlir::complex::AbsOp>{}(
+        loc, result_types, arg_types, args, b);
+  }
+  if (element_type.isSignlessInteger() || element_type.isSignedInteger()) {
+    // lmhlo.abs(x, result) ->  result = select((x > 0), x, sub(0, x))
+    Value lhs = args[0];
+    auto integer_type = element_type.dyn_cast<IntegerType>();
+
+    Value zero_intval = b->create<::mlir::arith::ConstantIntOp>(
+        loc, 0, integer_type.getWidth());
+    if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
+      zero_intval = b->create<::mlir::SplatOp>(loc, vec_type, zero_intval);
+    }
+    auto lhs_gt_zero = b->create<ScalarIOp<CompareOp>>(
+        loc, arith::CmpIPredicate::sge, lhs, zero_intval);
+    auto neg_val = b->create<ScalarIOp<mhlo::SubOp>>(loc, zero_intval, lhs);
+    return b->create<::mlir::SelectOp>(loc, lhs_gt_zero, lhs, neg_val);
+  }
+  return nullptr;
+}
+
+template <typename PredicateType>
+inline Optional<PredicateType> getCmpPredicate(StringRef, bool) {
+  return llvm::None;
+}
+
+template <>
+inline Optional<arith::CmpFPredicate> getCmpPredicate<arith::CmpFPredicate>(
+    StringRef comparison_direction, bool is_signed) {
+  assert(is_signed && "cannot have an unsigned float!");
+  return llvm::StringSwitch<Optional<arith::CmpFPredicate>>(
+             comparison_direction)
+      .Case("EQ", arith::CmpFPredicate::OEQ)
+      .Case("NE", arith::CmpFPredicate::UNE)
+      .Case("GE", arith::CmpFPredicate::OGE)
+      .Case("GT", arith::CmpFPredicate::OGT)
+      .Case("LE", arith::CmpFPredicate::OLE)
+      .Case("LT", arith::CmpFPredicate::OLT)
+      .Default(llvm::None);
+}
+
+template <>
+inline Optional<arith::CmpIPredicate> getCmpPredicate<arith::CmpIPredicate>(
+    StringRef comparison_direction, bool is_signed) {
+  return llvm::StringSwitch<Optional<arith::CmpIPredicate>>(
+             comparison_direction)
+      .Case("EQ", arith::CmpIPredicate::eq)
+      .Case("NE", arith::CmpIPredicate::ne)
+      .Case("GE",
+            is_signed ? arith::CmpIPredicate::sge : arith::CmpIPredicate::uge)
+      .Case("GT",
+            is_signed ? arith::CmpIPredicate::sgt : arith::CmpIPredicate::ugt)
+      .Case("LE",
+            is_signed ? arith::CmpIPredicate::sle : arith::CmpIPredicate::ule)
+      .Case("LT",
+            is_signed ? arith::CmpIPredicate::slt : arith::CmpIPredicate::ult)
+      .Default(llvm::None);
+}
+
+template <typename CompareOpTy>
+inline Value MapCompareOpToStdScalarOp(Location loc,
+                                       StringRef comparison_direction,
+                                       ArrayRef<Type> result_types,
+                                       ArrayRef<Type> arg_types,
+                                       ValueRange args, OpBuilder* b) {
+  const auto& lhs = args[0];
+  const auto& rhs = args[1];
+  Type element_type = getElementTypeOrSelf(arg_types.front());
+  if (element_type.isa<IntegerType>()) {
+    Optional<arith::CmpIPredicate> predicate =
+        getCmpPredicate<arith::CmpIPredicate>(
+            comparison_direction, !element_type.isUnsignedInteger());
+    assert(predicate.hasValue() && "expected valid comparison direction");
+    return b->create<ScalarIOp<CompareOpTy>>(loc, predicate.getValue(), lhs,
+                                             rhs);
+  }
+  if (element_type.isa<FloatType>()) {
+    Optional<arith::CmpFPredicate> predicate =
+        getCmpPredicate<arith::CmpFPredicate>(comparison_direction,
+                                              /*is_signed=*/true);
+    assert(predicate.hasValue() && "expected valid comparison direction");
+    return b->create<ScalarFOp<CompareOpTy>>(loc, predicate.getValue(), lhs,
+                                             rhs);
+  }
+  if (auto complex_type = element_type.dyn_cast<ComplexType>()) {
+    if (complex_type.getElementType().isa<FloatType>()) {
+      if (comparison_direction == "EQ") {
+        return b->create<complex::EqualOp>(loc, lhs, rhs);
+      }
+      if (comparison_direction == "NE") {
+        return b->create<complex::NotEqualOp>(loc, lhs, rhs);
+      }
+    }
+  }
+  return nullptr;
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::CopyOp>(Location loc,
+                                                  ArrayRef<Type> result_types,
+                                                  ArrayRef<Type> arg_types,
+                                                  ValueRange args,
+                                                  OpBuilder* b) {
+  return args.front();
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::ComplexOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
+    ValueRange args, OpBuilder* b) {
+  return MapMhloOpToScalarOpImpl<complex::CreateOp>{}(loc, result_types,
+                                                      arg_types, args, b);
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::RealOp>(Location loc,
+                                                  ArrayRef<Type> result_types,
+                                                  ArrayRef<Type> arg_types,
+                                                  ValueRange args,
+                                                  OpBuilder* b) {
+  return MapMhloOpToScalarOpImpl<complex::ReOp>{}(loc, result_types, arg_types,
+                                                  args, b);
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::ImagOp>(Location loc,
+                                                  ArrayRef<Type> result_types,
+                                                  ArrayRef<Type> arg_types,
+                                                  ValueRange args,
+                                                  OpBuilder* b) {
+  return MapMhloOpToScalarOpImpl<complex::ImOp>{}(loc, result_types, arg_types,
+                                                  args, b);
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::ConvertOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
+    ValueRange args, OpBuilder* b) {
+  Type sourceType = getElementTypeOrSelf(arg_types.front());
+  Type targetType = getElementTypeOrSelf(result_types.front());
+  Type convertedSourceType = getElementTypeOrSelf(args.front());
+
+  // A boolean value is considered to be unsigned when converting to
+  // floating-point. Otherwise, it will become `-1`.
+  if ((sourceType.isInteger(/*width=*/1) || sourceType.isUnsignedInteger()) &&
+      mlir::arith::UIToFPOp::areCastCompatible(convertedSourceType,
+                                               targetType)) {
+    return b->create<mlir::arith::UIToFPOp>(loc, result_types, args,
+                                            mlir::None);
+  } else if (mlir::arith::SIToFPOp::areCastCompatible(convertedSourceType,
+                                                      targetType)) {
+    return b->create<mlir::arith::SIToFPOp>(loc, result_types, args,
+                                            mlir::None);
+  } else if (sourceType.isa<FloatType>() && targetType.isa<FloatType>()) {
+    FloatType src = sourceType.cast<FloatType>();
+    FloatType res = targetType.cast<FloatType>();
+    if (src.getWidth() > res.getWidth()) {
+      return b->create<mlir::arith::TruncFOp>(loc, result_types, args,
+                                              mlir::None);
+    } else if (src.getWidth() < res.getWidth()) {
+      return b->create<mlir::arith::ExtFOp>(loc, result_types, args,
+                                            mlir::None);
+    }
+    // No conversion is needed for the same width floats
+    return args.front();
+  }
+  if (targetType.isInteger(/*width=*/1)) {
+    // When casting to bool, we need to compare whether the value is equal to
+    // zero.
+    if (sourceType.isSignlessInteger() || sourceType.isUnsignedInteger()) {
+      Value zero_intval = b->create<::mlir::arith::ConstantIntOp>(
+          loc, 0, sourceType.cast<IntegerType>().getWidth());
+      if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
+        zero_intval = b->create<::mlir::SplatOp>(loc, vec_type, zero_intval);
+      }
+      return b->create<mlir::arith::CmpIOp>(loc, arith::CmpIPredicate::ne,
+                                            args.front(), zero_intval);
+    } else if (sourceType.isa<FloatType>()) {
+      Value zero =
+          b->create<arith::ConstantOp>(loc, b->getFloatAttr(sourceType, 0.0));
+      if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
+        zero = b->create<::mlir::SplatOp>(loc, vec_type, zero);
+      }
+      return b->create<mlir::arith::CmpFOp>(loc, arith::CmpFPredicate::UNE,
+                                            args.front(), zero);
+    }
+  }
+  if (sourceType.isa<IntegerType>() && targetType.isa<IntegerType>()) {
+    IntegerType src = sourceType.cast<IntegerType>();
+    IntegerType res = targetType.cast<IntegerType>();
+    if (src.getWidth() > res.getWidth()) {
+      return b->create<mlir::arith::TruncIOp>(loc, result_types, args,
+                                              mlir::None);
+    } else if (src.getWidth() < res.getWidth()) {
+      // Special case boolean values, so they get casted to `1` instead of `-1`.
+      if (src.isUnsignedInteger() || src.getWidth() == 1) {
+        return b->create<mlir::arith::ExtUIOp>(loc, result_types, args,
+                                               mlir::None);
+      }
+      return b->create<mlir::arith::ExtSIOp>(loc, result_types, args,
+                                             mlir::None);
+    }
+    // No conversion is needed for the same width integers
+    return args.front();
+  }
+  if (mlir::arith::FPToSIOp::areCastCompatible(convertedSourceType,
+                                               targetType)) {
+    return b->create<mlir::arith::FPToSIOp>(loc, result_types, args,
+                                            mlir::None);
+  }
+  return nullptr;
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::BitcastConvertOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Type>, ValueRange args,
+    OpBuilder* b) {
+  return b->create<mlir::arith::BitcastOp>(loc, result_types, args);
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::DotOp>(Location loc,
+                                                 ArrayRef<Type> result_types,
+                                                 ArrayRef<Type> arg_types,
+                                                 ValueRange args,
+                                                 OpBuilder* b) {
+  // Dot Op converter from lhlo to affine only accepts float and integer types.
+  const auto& lhs = args[0];
+  const auto& rhs = args[1];
+  const auto& result = args[2];
+  Type element_type = lhs.getType();
+  if (element_type.isa<FloatType>()) {
+    Value float_mul =
+        MapMhloOpToScalarOpImpl<isFloatType, ::mlir::arith::MulFOp>{}(
+            loc, result_types, arg_types, {lhs, rhs}, b);
+    return MapMhloOpToScalarOpImpl<isFloatType, ::mlir::arith::AddFOp>{}(
+        loc, result_types, arg_types, {float_mul, result}, b);
+  }
+  if (element_type.isa<IntegerType>()) {
+    Value int_mul =
+        MapMhloOpToScalarOpImpl<isAnyIntegerType, ::mlir::arith::MulIOp>{}(
+            loc, result_types, arg_types, {lhs, rhs}, b);
+    return MapMhloOpToScalarOpImpl<isAnyIntegerType, ::mlir::arith::AddIOp>{}(
+        loc, result_types, arg_types, {int_mul, result}, b);
+  }
+  return nullptr;
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::IsFiniteOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
+    ValueRange args, OpBuilder* b) {
+  if (args[0].getType().isa<FloatType>()) {
+    auto pos_inf = APFloat::getInf(
+        args[0].getType().cast<FloatType>().getFloatSemantics());
+    auto const_pos_inf = b->create<arith::ConstantOp>(
+        loc, b->getFloatAttr(args[0].getType(), pos_inf));
+    Value abs_x = b->create<::mlir::math::AbsOp>(loc, args[0]);
+    return b->create<::mlir::arith::CmpFOp>(loc, arith::CmpFPredicate::ONE,
+                                            abs_x, const_pos_inf);
+  }
+  return nullptr;
+}
+
+/// Implements the conversion of HLO op to scalar op (to use within region of a
+/// linalg.generic op) for compare-select style operations like min/max.
+template <typename... Args>
+struct CompareSelectOpToStdScalarOp {
+  static Value map(Location loc, StringRef comparison_direction,
+                   ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
+                   ValueRange args, OpBuilder* b) {
+    return nullptr;
+  }
+};
+
+/// Specialization which allows converting to a comparison operation in standard
+/// dialect with a given predicate based on the element type of the operand.
+template <typename SupportedType, typename StdCompareOp, typename Predicate,
+          typename... Args>
+struct CompareSelectOpToStdScalarOp<SupportedType, StdCompareOp, Predicate,
+                                    Args...> {
+  static Value map(Location loc, StringRef comparison_direction,
+                   ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
+                   ValueRange args, OpBuilder* b) {
+    Type element_type = getElementTypeOrSelf(arg_types.front());
+    if (element_type.isa<SupportedType>()) {
+      auto predicate = getCmpPredicate<Predicate>(
+          comparison_direction, !element_type.isUnsignedInteger());
+      assert(predicate.hasValue() && "expected valid comparison direction");
+      auto cmp = b->template create<StdCompareOp>(loc, predicate.getValue(),
+                                                  args[0], args[1]);
+      return b->create<::mlir::SelectOp>(loc, cmp, args[0], args[1]);
+    }
+    return CompareSelectOpToStdScalarOp<Args...>::map(
+        loc, comparison_direction, result_types, arg_types, args, b);
+  }
+};
+
+inline Value MhloAlwaysPropagateNaN(Value v, ValueRange args, Location loc,
+                                    OpBuilder* b) {
+  Type element_type = getElementTypeOrSelf(args.front().getType());
+  if (auto float_type = element_type.dyn_cast<FloatType>()) {
+    Value isnan = b->create<mlir::arith::CmpFOp>(loc, arith::CmpFPredicate::UNO,
+                                                 args[0], args[1]);
+
+    auto nan_apfloat = APFloat::getQNaN(float_type.getFloatSemantics());
+    Value nan =
+        b->create<mlir::arith::ConstantFloatOp>(loc, nan_apfloat, float_type);
+    if (VectorType vec_type = args[0].getType().dyn_cast<VectorType>()) {
+      nan = b->create<::mlir::SplatOp>(loc, vec_type, nan);
+    }
+    v = b->create<mlir::SelectOp>(loc, isnan, nan, v);
+  }
+  return v;
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::LogisticOp>(
+    Location loc, ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
+    ValueRange args, OpBuilder* b) {
+  auto ty = result_types.front().cast<FloatType>();
+  Value one = b->create<arith::ConstantOp>(loc, b->getFloatAttr(ty, 1.0));
+  Value x = args.front();
+  Value neg_x = b->create<arith::NegFOp>(loc, x);
+  Value exp_neg_x = b->create<::mlir::math::ExpOp>(loc, neg_x);
+  Value one_add_exp_neg_x = b->create<arith::AddFOp>(loc, one, exp_neg_x);
+  return b->create<arith::DivFOp>(loc, one, one_add_exp_neg_x);
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::ClampOp>(Location loc,
+                                                   ArrayRef<Type> result_types,
+                                                   ArrayRef<Type> arg_types,
+                                                   ValueRange args,
+                                                   OpBuilder* b) {
+  assert(args.size() == 3 && "expected 3 arguments");
+  Value lb = args[0];
+  Value x = args[1];
+  Value ub = args[2];
+
+  // clamp(lb, x, ub) = max(min(x, ub), lb)
+  Value min_x_ub = MapMhloOpToStdScalarOp<mhlo::MinOp>(loc, result_types,
+                                                       arg_types, {x, ub}, b);
+  return MapMhloOpToStdScalarOp<mhlo::MaxOp>(loc, result_types, arg_types,
+                                             {min_x_ub, lb}, b);
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::NegOp>(Location loc,
+                                                 ArrayRef<Type> result_types,
+                                                 ArrayRef<Type> arg_types,
+                                                 ValueRange args,
+                                                 OpBuilder* b) {
+  Type element_type = getElementTypeOrSelf(args.front().getType());
+  if (element_type.isa<ComplexType, FloatType>()) {
+    return MapMhloOpToScalarOpImpl<isFloatType, ::mlir::arith::NegFOp,
+                                   isComplexType, ::mlir::complex::NegOp>{}(
+        loc, result_types, arg_types, args, b);
+  }
+  if (element_type.isa<IntegerType>()) {
+    // lmhlo.neg(x, result) -> result = sub(0, x)
+    Value lhs = args[0];
+    auto integer_type = element_type.dyn_cast<IntegerType>();
+
+    Value zero_intval = b->create<::mlir::arith::ConstantIntOp>(
+        loc, 0, integer_type.getWidth());
+    if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
+      zero_intval = b->create<::mlir::SplatOp>(loc, vec_type, zero_intval);
+    }
+    return b->create<ScalarIOp<mhlo::SubOp>>(loc, zero_intval, lhs);
+  }
+  return nullptr;
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::NotOp>(Location loc,
+                                                 ArrayRef<Type> result_types,
+                                                 ArrayRef<Type> arg_types,
+                                                 ValueRange args,
+                                                 OpBuilder* b) {
+  Type element_type = getElementTypeOrSelf(args.front().getType());
+  if (auto integer_type = element_type.dyn_cast<IntegerType>()) {
+    // lmhlo.not(x) -> x ^ -1
+    Value all_ones = b->create<::mlir::arith::ConstantIntOp>(
+        loc, -1, integer_type.getWidth());
+    if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
+      all_ones = b->create<::mlir::SplatOp>(loc, vec_type, all_ones);
+    }
+    return b->create<::mlir::arith::XOrIOp>(loc, all_ones, args[0]);
+  }
+  return nullptr;
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::PowOp>(Location loc,
+                                                 ArrayRef<Type> result_types,
+                                                 ArrayRef<Type> arg_types,
+                                                 ValueRange args,
+                                                 OpBuilder* b) {
+  mhlo::PowOp::Adaptor adaptor(args);
+  auto lb = ImplicitLocOpBuilder(loc, *b);
+  // Floating point can use std::powf
+  auto result_type = result_types.front();
+  if (result_type.isa<::mlir::FloatType>())
+    return MapMhloOpToScalarOpImpl<::mlir::math::PowFOp>{}(loc, result_types,
+                                                           arg_types, args, b);
+
+  assert(result_type.isa<::mlir::IntegerType>() &&
+         "only float and integer `pow` is supported right now");
+
+  // Exponentiation by squaring:
+  // https://en.wikipedia.org/wiki/Exponentiation_by_squaring;
+  Value neg_one =
+      lb.create<arith::ConstantOp>(lb.getIntegerAttr(result_type, -1));
+  Value zero = lb.create<arith::ConstantOp>(lb.getIntegerAttr(result_type, 0));
+  Value one = lb.create<arith::ConstantOp>(lb.getIntegerAttr(result_type, 1));
+  Value two = lb.create<arith::ConstantOp>(lb.getIntegerAttr(result_type, 2));
+  Value step = lb.create<arith::ConstantIndexOp>(1);
+  Value lowerBound = lb.create<arith::ConstantIndexOp>(0);
+  // Everything else would overflow for any exponent > 1, as 2^64
+  // is the larget possible exponent for a 64-bit integer, and
+  // that's 1 << 6.
+  Value upperBound = lb.create<arith::ConstantIndexOp>(6);
+  auto original_base = adaptor.lhs();
+  auto original_exponent = adaptor.rhs();
+
+  Value accum =
+      lb.create<scf::ForOp>(
+            lowerBound, upperBound, step,
+            SmallVector<Value>({one, original_base, original_exponent}),
+            [&](OpBuilder& b, Location, Value v, ValueRange iters) {
+              Value accum = iters[0];
+              Value base = iters[1];
+              Value exponent = iters[2];
+
+              Value condition = b.create<arith::CmpIOp>(
+                  loc, arith::CmpIPredicate::eq,
+                  b.create<::mlir::arith::AndIOp>(loc, exponent, one), one);
+              Value multiplied =
+                  b.create<::mlir::arith::MulIOp>(loc, accum, base);
+              accum =
+                  b.create<::mlir::SelectOp>(loc, condition, multiplied, accum);
+              base = b.create<::mlir::arith::MulIOp>(loc, base, base);
+              exponent = b.create<::mlir::arith::ShRUIOp>(loc, exponent, one);
+              b.create<scf::YieldOp>(
+                  loc, SmallVector<Value>({accum, base, exponent}));
+            })
+          .getResult(0);
+
+  Value rhs_is_even = lb.create<arith::CmpIOp>(
+      arith::CmpIPredicate::eq, lb.create<arith::RemSIOp>(adaptor.rhs(), two),
+      zero);
+  Value rhs_is_negative =
+      lb.create<arith::CmpIOp>(arith::CmpIPredicate::slt, adaptor.rhs(), zero);
+  Value lhs_is_one =
+      lb.create<arith::CmpIOp>(arith::CmpIPredicate::eq, adaptor.lhs(), one);
+  Value lhs_is_neg_one = lb.create<arith::CmpIOp>(arith::CmpIPredicate::eq,
+                                                  adaptor.lhs(), neg_one);
+
+  // The accum is correct when the rhs is non-negative. When rhs is
+  // negative, we return 0 for integer, with the exception of lhs values of 1
+  // and -1 which have integer results for negative exponents. Specifically, the
+  // calulation is the following:
+  //
+  // - Return accum if the rhs is not negative.
+  // - Return 1 or -1 depending on the parity of rhs when the lhs is -1.
+  // - Return 1 if lhs is 1.
+  // - Else return 0.
+  Value if_lhs_is_one = lb.create<::mlir::SelectOp>(lhs_is_one, one, zero);
+  Value if_lhs_is_neg_one = lb.create<::mlir::SelectOp>(
+      lhs_is_neg_one, lb.create<::mlir::SelectOp>(rhs_is_even, one, neg_one),
+      if_lhs_is_one);
+  return lb.create<::mlir::SelectOp>(rhs_is_negative, if_lhs_is_neg_one, accum);
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::SelectOp>(Location loc,
+                                                    ArrayRef<Type> result_types,
+                                                    ArrayRef<Type> arg_types,
+                                                    ValueRange args,
+                                                    OpBuilder* b) {
+  return MapMhloOpToScalarOpImpl<::mlir::SelectOp>{}(loc, result_types,
+                                                     arg_types, args, b);
+}
+
+template <>
+inline Value MapMhloOpToStdScalarOp<mhlo::SignOp>(Location loc,
+                                                  ArrayRef<Type> result_types,
+                                                  ArrayRef<Type> arg_types,
+                                                  ValueRange args,
+                                                  OpBuilder* b) {
+  Type element_type = getElementTypeOrSelf(args.front().getType());
+  if (auto float_type = element_type.dyn_cast<FloatType>()) {
+    bool ignored;
+    APFloat zero_apfloat(0.0f);
+    zero_apfloat.convert(float_type.getFloatSemantics(),
+                         APFloat::rmNearestTiesToEven, &ignored);
+    Value zero =
+        b->create<mlir::arith::ConstantFloatOp>(loc, zero_apfloat, float_type);
+    if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
+      zero = b->create<::mlir::SplatOp>(loc, vec_type, zero);
+    }
+    Value ne0_i1 = b->create<::mlir::arith::CmpFOp>(
+        loc, arith::CmpFPredicate::ONE, args[0], zero);
+    Value ne0_float =
+        b->create<::mlir::arith::UIToFPOp>(loc, ne0_i1, zero.getType());
+    Value copy_sign = b->create<::mlir::math::CopySignOp>(loc, result_types,
+                                                          ne0_float, args[0]);
+    auto is_nan = b->create<::mlir::arith::CmpFOp>(
+        loc, arith::CmpFPredicate::UNO, args[0], args[0]);
+    return b->create<::mlir::SelectOp>(loc, is_nan, args[0], copy_sign);
+  } else if (auto integer_type = element_type.dyn_cast<IntegerType>()) {
+    // sign(x) = x == 0 ? 0 : ((x s>> 31) | 1)
+    Value zero = b->create<::mlir::arith::ConstantIntOp>(
+        loc, 0, integer_type.getWidth());
+    Value bitwidth_minus_one = b->create<::mlir::arith::ConstantIntOp>(
+        loc, integer_type.getWidth() - 1, integer_type.getWidth());
+    Value one = b->create<::mlir::arith::ConstantIntOp>(
+        loc, 1, integer_type.getWidth());
+    if (VectorType vec_type = args.front().getType().dyn_cast<VectorType>()) {
+      zero = b->create<::mlir::SplatOp>(loc, vec_type, zero);
+      bitwidth_minus_one =
+          b->create<::mlir::SplatOp>(loc, vec_type, bitwidth_minus_one);
+      one = b->create<::mlir::SplatOp>(loc, vec_type, one);
+    }
+    Value cmp = b->create<::mlir::arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
+                                                 args[0], zero);
+    Value ashr =
+        b->create<::mlir::arith::ShRSIOp>(loc, args[0], bitwidth_minus_one);
+    Value or_op = b->create<::mlir::arith::OrIOp>(loc, ashr, one);
+    return b->create<::mlir::SelectOp>(loc, cmp, zero, or_op);
+  } else if (element_type.isa<ComplexType>()) {
+    return b->create<::mlir::complex::SignOp>(loc, element_type, args.front());
+  }
+  return nullptr;
+}
+
+}  // namespace impl
+
+struct MhloOpToStdScalarOp {
+  // Implementation for HLO ops except mhlo::CompareOp.
+  template <typename MhloOpTy, typename = std::enable_if_t<!std::is_same<
+                                   MhloOpTy, mhlo::CompareOp>::value>>
+  static Value map(MhloOpTy op, ArrayRef<Type> result_types, ValueRange args,
+                   OpBuilder* b) {
+    return impl::MapMhloOpToStdScalarOp<MhloOpTy>(
+        op.getLoc(), result_types, llvm::to_vector<4>(op->getOperandTypes()),
+        args, b);
+  }
+
+  // Implementation for mhlo::CompareOp.
+  template <typename MhloOpTy, typename = std::enable_if_t<std::is_same<
+                                   MhloOpTy, mhlo::CompareOp>::value>>
+  static Value map(mhlo::CompareOp op, ArrayRef<Type> result_types,
+                   ValueRange args, OpBuilder* b) {
+    auto comparison_direction = op.comparison_direction();
+    return impl::MapCompareOpToStdScalarOp<mhlo::CompareOp>(
+        op.getLoc(), comparison_direction, result_types,
+        llvm::to_vector<4>(op->getOperandTypes()), args, b);
+  }
+
+  // Implementation for HLO ops except mhlo::CompareOp.
+  template <typename MhloOpTy, typename = std::enable_if_t<!std::is_same<
+                                   MhloOpTy, mhlo::CompareOp>::value>>
+  static Value map(Location loc, ArrayRef<Type> result_types,
+                   ArrayRef<Type> arg_types, ValueRange args, OpBuilder* b) {
+    return impl::MapMhloOpToStdScalarOp<MhloOpTy>(loc, result_types, arg_types,
+                                                  args, b);
+  }
+
+  // Implementation for lmhlo::CompareOp.
+  template <typename MhloOpTy, typename = std::enable_if_t<std::is_same<
+                                   MhloOpTy, mhlo::CompareOp>::value>>
+  static Value map(Location loc, StringRef comparison_direction,
+                   ArrayRef<Type> result_types, ArrayRef<Type> arg_types,
+                   ValueRange args, OpBuilder* b) {
+    return impl::MapCompareOpToStdScalarOp<mhlo::CompareOp>(
+        loc, comparison_direction, result_types, arg_types, args, b);
+  }
+};
+
+}  // namespace mhlo
+}  // namespace mlir
+
+#endif  // TENSORFLOW_COMPILER_MLIR_HLO_INCLUDE_MLIR_HLO_DIALECT_MHLO_TRANSFORMS_MAP_MHLO_TO_SCALAR_OP_H_
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h
index 472503a03e8173..7a91030f4fc2ea 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h
@@ -108,9 +108,6 @@ namespace lmhlo {
 // Lowers from LHLO dialect to Affine dialect.
 std::unique_ptr<OperationPass<FuncOp>> createLhloLegalizeToAffinePass();
 
-// Lowers from LHLO dialect to Linalg dialect.
-std::unique_ptr<OperationPass<FuncOp>> createLegalizeLhloToLinalgPass();
-
 // Lowers from LHLO dialect to GPU dialect.
 std::unique_ptr<FunctionPass> createLegalizeToGpuPass();
 
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h
index 317b70b82babc7..b6f5399fd2339a 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h
@@ -21,10 +21,12 @@ limitations under the License.
 
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/IR/PatternMatch.h"
-#include "mlir/Transforms/Bufferize.h"
 #include "mlir/Transforms/DialectConversion.h"
 
 namespace mlir {
+namespace bufferization {
+class BufferizeTypeConverter;
+}
 namespace mhlo {
 
 class RemoveSignTypeConverter;
@@ -55,13 +57,13 @@ void PopulateMhloToStdPatterns(OwningRewritePatternList *patterns,
 // Collection of rewrite patterns for lowering all mhlo ops to their
 // lmhlo counterparts.
 void populateDynamicHLOToLHLOConversionPattern(
-    MLIRContext *context, BufferizeTypeConverter *converter,
+    MLIRContext *context, bufferization::BufferizeTypeConverter *converter,
     OwningRewritePatternList *patterns);
 
 // Collection of rewrite patterns for lowering of HLO to LHLO dialect.
-void populateHLOToLHLOConversionPattern(MLIRContext *context,
-                                        BufferizeTypeConverter *converter,
-                                        OwningRewritePatternList *patterns);
+void populateHLOToLHLOConversionPattern(
+    MLIRContext *context, bufferization::BufferizeTypeConverter *converter,
+    OwningRewritePatternList *patterns);
 
 // Collection of rewrite patterns for lowering of HLO to memref dialect.
 // These patterns generally assume that the HLO operation are aliasing their
@@ -69,8 +71,8 @@ void populateHLOToLHLOConversionPattern(MLIRContext *context,
 // inserted when the lowering would otherwise lead to a memref with a
 // non-identity map.
 void populateHLOToMemrefConversionPattern(
-    BufferizeTypeConverter *converter, RemoveSignTypeConverter *sign_converter,
-    OwningRewritePatternList *patterns,
+    bufferization::BufferizeTypeConverter *converter,
+    RemoveSignTypeConverter *sign_converter, OwningRewritePatternList *patterns,
     std::function<bool(Operation *)> enforce_identity_map = [](Operation *) {
       return true;
     });
diff --git a/tensorflow/compiler/mlir/hlo/lib/Analysis/shape_component_analysis.cc b/tensorflow/compiler/mlir/hlo/lib/Analysis/shape_component_analysis.cc
index 9e0b227c56f618..109d4d0003fa68 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Analysis/shape_component_analysis.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Analysis/shape_component_analysis.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "mlir-hlo/Analysis/shape_component_analysis.h"
 
+#include <vector>
+
+#include "llvm/ADT/STLExtras.h"
 #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Shape/IR/Shape.h"
@@ -263,8 +266,11 @@ struct ShapeVisitor {
         ShapeOrValueInfo::getValueInfoOf(op.output_shape()));
   }
   void forwardDynamicReshapeShape(mhlo::DynamicReshapeOp op) {
+    auto ranked_ty = op.getResult().getType().cast<RankedTensorType>();
+    auto shape_dims =
+        lookup(ShapeOrValueInfo::getValueInfoOf(op.output_shape()));
     auto &dims = insert(ShapeOrValueInfo::getShapeInfoOf(op));
-    dims = lookup(ShapeOrValueInfo::getValueInfoOf(op.output_shape()));
+    dimsFromStaticShape(ranked_ty, shape_dims, &dims);
   }
   void backwardReduceShape(Value op) {
     forwards_worklist.push_back(ShapeOrValueInfo::getShapeInfoOf(op));
@@ -361,19 +367,19 @@ struct ShapeVisitor {
     forwards_worklist.push_back(ShapeOrValueInfo::getShapeInfoOf(v));
   }
   void forwardUnknownShape(Value v) {
-    auto &dims = insert(ShapeOrValueInfo::getShapeInfoOf(v));
-    auto type = v.getType().cast<RankedTensorType>();
+    auto ranked_ty = v.getType().dyn_cast<RankedTensorType>();
+    if (!ranked_ty) return;
     auto id = getAffineSymbolExpr(0, v.getContext());
-    for (size_t i = 0, e = type.getRank(); i != e; ++i) {
-      dims.emplace_back();
-      auto &dim = dims.back();
-      if (!type.isDynamicDim(i)) {
-        dim.expr = getAffineConstantExpr(type.getDimSize(i), v.getContext());
-      } else {
-        dim.symbols.push_back({ShapeOrValueInfo::getShapeInfoOf(v), i});
-        dim.expr = id;
-      }
-    }
+    auto &dims = insert(ShapeOrValueInfo::getShapeInfoOf(v));
+    return dimsFromStaticShape(
+        ranked_ty,
+        [&](size_t i) {
+          SymbolicExpr d;
+          d.symbols.push_back({ShapeOrValueInfo::getShapeInfoOf(v), i});
+          d.expr = id;
+          return d;
+        },
+        &dims);
   }
 
   // ===
@@ -386,19 +392,10 @@ struct ShapeVisitor {
     backwards_worklist.push_back(ShapeOrValueInfo::getShapeInfoOf(op.getArg()));
   }
   void forwardShapeOf(shape::ShapeOfOp op) {
-    auto &dims = insert(ShapeOrValueInfo::getValueInfoOf(op));
-    auto type = op.getArg().getType().cast<RankedTensorType>();
+    auto ranked_ty = op.getArg().getType().cast<RankedTensorType>();
     auto arg = lookup(ShapeOrValueInfo::getShapeInfoOf(op.getArg()));
-    for (int64_t i = 0, e = type.getRank(); i != e; ++i) {
-      dims.emplace_back();
-      auto &dim = dims.back();
-      if (!type.isDynamicDim(i)) {
-        dim.expr = getAffineConstantExpr(type.getDimSize(i), op.getContext());
-      } else {
-        dim.symbols = arg[i].symbols;
-        dim.expr = arg[i].expr;
-      }
-    }
+    auto &dims = insert(ShapeOrValueInfo::getValueInfoOf(op));
+    return dimsFromStaticShape(ranked_ty, arg, &dims);
   }
   void backwardNumElements(shape::NumElementsOp op) {
     forwards_worklist.push_back(ShapeOrValueInfo::getValueInfoOf(op));
@@ -460,15 +457,18 @@ struct ShapeVisitor {
   template <typename Op>
   void backwardBinOp(Op op) {
     forwards_worklist.push_back(ShapeOrValueInfo::getValueInfoOf(op));
-    backwards_worklist.append({ShapeOrValueInfo::getValueInfoOf(op.lhs()),
-                               ShapeOrValueInfo::getValueInfoOf(op.rhs())});
+    // TODO(jpienaar): Switch to named accessors when MHLO uses prefixed form.
+    backwards_worklist.append(
+        {ShapeOrValueInfo::getValueInfoOf(op.getOperand(0)),
+         ShapeOrValueInfo::getValueInfoOf(op.getOperand(1))});
   }
   template <typename Op, typename Combiner>
   void forwardBinOp(Op op, Combiner &&combiner) {
     auto &dims = insert(ShapeOrValueInfo::getValueInfoOf(op));
-    auto lhs = lookup(ShapeOrValueInfo::getValueInfoOf(op.lhs()));
-    auto rhs = lookup(ShapeOrValueInfo::getValueInfoOf(op.rhs()));
-    for (int i = 0, e = dim0size(op.getType()); i != e; ++i) {
+    // TODO(jpienaar): Switch to named accessors when MHLO uses prefixed form.
+    auto lhs = lookup(ShapeOrValueInfo::getValueInfoOf(op.getOperand(0)));
+    auto rhs = lookup(ShapeOrValueInfo::getValueInfoOf(op.getOperand(1)));
+    for (int64_t i = 0, e = dim0size(op.getType()); i != e; ++i) {
       dims.emplace_back();
       auto &dim = dims.back();
       dim.symbols.append(lhs[i].symbols);
@@ -525,10 +525,10 @@ struct ShapeVisitor {
     forwards_worklist.push_back(ShapeOrValueInfo::getValueInfoOf(v));
   }
   void forwardConstant(Value v) {
-    auto &dims = insert(ShapeOrValueInfo::getValueInfoOf(v));
     IntegerAttr intAttr;
     DenseIntElementsAttr denseAttr;
     if (matchPattern(v, m_Constant(&denseAttr))) {
+      auto &dims = insert(ShapeOrValueInfo::getValueInfoOf(v));
       for (uint64_t i = 0, e = dim0size(v.getType()); i != e; ++i) {
         dims.emplace_back();
         auto &dim = dims.back();
@@ -536,6 +536,7 @@ struct ShapeVisitor {
             denseAttr.getValues<APInt>()[i].getSExtValue(), v.getContext());
       }
     } else if (matchPattern(v, m_Constant(&intAttr))) {
+      auto &dims = insert(ShapeOrValueInfo::getValueInfoOf(v));
       dims.emplace_back();
       auto &dim = dims.back();
       dim.expr = getAffineConstantExpr(intAttr.getInt(), v.getContext());
@@ -607,6 +608,29 @@ struct ShapeVisitor {
   // Helpers
   // ===
 
+  static void dimsFromStaticShape(
+      RankedTensorType ranked_ty,
+      llvm::function_ref<SymbolicExpr(int64_t)> fallback,
+      std::vector<SymbolicExpr> *merged_dims) {
+    auto *ctx = ranked_ty.getContext();
+    for (int64_t i = 0, e = ranked_ty.getRank(); i != e; ++i) {
+      if (ranked_ty.isDynamicDim(i)) {
+        merged_dims->push_back(fallback(i));
+      } else {
+        merged_dims->emplace_back();
+        auto &d = merged_dims->back();
+        d.expr = getAffineConstantExpr(ranked_ty.getDimSize(i), ctx);
+      }
+    }
+  }
+
+  static void dimsFromStaticShape(RankedTensorType ranked_ty,
+                                  ArrayRef<SymbolicExpr> fallback,
+                                  std::vector<SymbolicExpr> *merged_dims) {
+    return dimsFromStaticShape(
+        ranked_ty, [&](int64_t i) { return fallback[i]; }, merged_dims);
+  }
+
   // Return the size of the first dimension. Returns 1 for scalars.
   static int64_t dim0size(Type type) {
     if (auto rankedType = type.dyn_cast<RankedTensorType>())
@@ -718,15 +742,15 @@ llvm::Optional<Symbol> SymbolicExpr::singleton() const {
 
 void SymbolicExpr::dump(llvm::raw_ostream &os) const {
   expr.print(os);
-  if (!symbols.empty()) {
-    os << " with ";
-    for (auto sym : llvm::enumerate(symbols)) {
-      os << 's' << sym.index() << " = ";
-      if (!sym.value().source.isValueInfo()) os << "shapeof(";
-      sym.value().source.value().print(os);
-      if (!sym.value().source.isValueInfo()) os << ")";
-      os << '[' << sym.value().index << "]; ";
-    }
+  if (!symbols.empty()) os << " with";
+  os << "\n";
+  if (symbols.empty()) return;
+  for (auto sym : llvm::enumerate(symbols)) {
+    os.indent(4);
+    os << 's' << sym.index() << " = ";
+    if (!sym.value().source.isValueInfo()) os << "shapeof(";
+    sym.value().source.value().print(os);
+    if (!sym.value().source.isValueInfo()) os << ")";
+    os << '[' << sym.value().index << "]\n";
   }
-  os << '\n';
 }
diff --git a/tensorflow/compiler/mlir/hlo/lib/Analysis/test_shape_component_analysis.cc b/tensorflow/compiler/mlir/hlo/lib/Analysis/test_shape_component_analysis.cc
index e874e8447f5fa2..234e1c3e44a792 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Analysis/test_shape_component_analysis.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Analysis/test_shape_component_analysis.cc
@@ -21,6 +21,8 @@ limitations under the License.
 
 namespace mlir {
 
+using SymbolicExpr = ShapeComponentAnalysis::SymbolicExpr;
+
 namespace {
 
 struct TestShapeComponentAnalysisPass
@@ -30,21 +32,27 @@ struct TestShapeComponentAnalysisPass
   }
 
   void runOnFunction() override {
+    ShapeComponentAnalysis shape_component;
     llvm::outs() << "Testing : " << getFunction().getName() << '\n';
     // Analyze anything that looks like a shape tensor.
     getFunction().walk([&](Operation* op) {
-      // Only print single results that could be shape values or their elements.
-      if (op->getResultTypes().size() != 1 ||
-          !getElementTypeOrSelf(op->getResultTypes().front()).isIntOrIndex())
-        return;
-
-      ShapeComponentAnalysis shape_component;
+      // Skip ops with more than one result.
+      if (op->getNumResults() != 1) return;
       Value result = op->getResults().front();
-      auto dims = shape_component.GetValueInfo(result);
-      result.print(llvm::outs());
-      llvm::outs() << ":\n";
-      if (dims) {
-        for (const auto& d : *dims) {
+
+      // Dump shape info if any.
+      if (auto shapeInfo = shape_component.GetShapeInfo(result)) {
+        llvm::outs() << "Shape info for " << result << ":\n";
+        for (const SymbolicExpr& d : *shapeInfo) {
+          llvm::outs().indent(2);
+          d.dump(llvm::outs());
+        }
+      }
+
+      // Dump value info if any.
+      if (auto valueInfo = shape_component.GetValueInfo(result)) {
+        llvm::outs() << "Value info for " << result << ":\n";
+        for (const SymbolicExpr& d : *valueInfo) {
           llvm::outs().indent(2);
           d.dump(llvm::outs());
         }
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc
index ca596257159963..51f94fc8554014 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/IR/hlo_ops.cc
@@ -4931,7 +4931,10 @@ OpFoldResult CompareOp::fold(ArrayRef<Attribute> operands) {
   if (!result_ty.hasStaticShape()) return {};
 
   auto direction = comparison_direction();
-  if (lhs() == rhs() && !getElementTypeOrSelf(lhs()).isa<FloatType>()) {
+  auto lhs_ty = getElementTypeOrSelf(lhs());
+  if (lhs() == rhs() && !lhs_ty.isa<FloatType>() &&
+      (!lhs_ty.isa<ComplexType>() ||
+       !lhs_ty.cast<ComplexType>().getElementType().isa<FloatType>())) {
     if (direction == "LE" || direction == "EQ" || direction == "GE") {
       return DenseIntElementsAttr::get(result_ty, {true});
     }
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt
index b072aa091a82c3..ebada53faea723 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/CMakeLists.txt
@@ -155,6 +155,7 @@ add_mlir_library(MhloToStandard
   MLIRhlo_opsIncGen
   MLIRlhlo_opsIncGen
   MLIRMhloLegalizeToStandardIncGen
+  MLIRMhloPassIncGen
   MhloTypeConversion
 
   LINK_COMPONENTS
@@ -206,6 +207,7 @@ add_mlir_library(LmhloPasses
 
   DEPENDS
   MLIRlhlo_opsIncGen
+  MLIRMhloPassIncGen
 
   LINK_COMPONENTS
   Core
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc
index 27cd670d5d8515..4c702578c0dad0 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Shape/IR/Shape.h"
 #include "mlir/Dialect/Shape/Transforms/Passes.h"
@@ -43,7 +44,6 @@ limitations under the License.
 #include "mlir/IR/Operation.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Pass/Pass.h"
-#include "mlir/Transforms/Bufferize.h"
 #include "mlir/Transforms/DialectConversion.h"
 
 namespace mlir {
@@ -450,7 +450,7 @@ struct HloLegalizeToLhlo : public HloLegalizeToLhloPassBase<HloLegalizeToLhlo> {
     target.addDynamicallyLegalOp<mlir::bufferization::ToMemrefOp>(
         [](auto op) { return op->use_empty(); });
 
-    BufferizeTypeConverter converter;
+    bufferization::BufferizeTypeConverter converter;
     auto isMemRefType = [](Type type) { return type.isa<BaseMemRefType>(); };
     target.addDynamicallyLegalOp<FuncOp>([&](FuncOp op) {
       return converter.isSignatureLegal(op.getType()) &&
@@ -489,7 +489,7 @@ struct HloLegalizeToLhlo : public HloLegalizeToLhloPassBase<HloLegalizeToLhlo> {
 
 // Simply lowers all mhlo ops to their lmhlo counterparts.
 void populateDynamicHLOToLHLOConversionPattern(
-    MLIRContext* context, BufferizeTypeConverter* converter,
+    MLIRContext* context, bufferization::BufferizeTypeConverter* converter,
     OwningRewritePatternList* patterns) {
   // clang-format off
   patterns->insert<HloToLhloOpConverter<mhlo::DynamicBroadcastInDimOp>,
@@ -502,9 +502,9 @@ void populateDynamicHLOToLHLOConversionPattern(
   // clang-format on
 }
 
-void populateHLOToLHLOConversionPattern(MLIRContext* context,
-                                        BufferizeTypeConverter* converter,
-                                        OwningRewritePatternList* patterns) {
+void populateHLOToLHLOConversionPattern(
+    MLIRContext* context, bufferization::BufferizeTypeConverter* converter,
+    OwningRewritePatternList* patterns) {
   populateDynamicHLOToLHLOConversionPattern(context, converter, patterns);
 
   // clang-format off
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_memref.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_memref.cc
index 4daaa64f013b3e..82ca0572fff925 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_memref.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_memref.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "mlir-hlo/Dialect/mhlo/transforms/type_conversion.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
+#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
@@ -316,7 +317,7 @@ struct HloLegalizeToMemrefPass
     OwningRewritePatternList patterns(&context);
     ConversionTarget target(context);
 
-    BufferizeTypeConverter converter;
+    bufferization::BufferizeTypeConverter converter;
     RemoveSignTypeConverter sign_converter;
 
     populateHLOToMemrefConversionPattern(&converter, &sign_converter,
@@ -337,8 +338,8 @@ struct HloLegalizeToMemrefPass
 }  // namespace
 
 void populateHLOToMemrefConversionPattern(
-    BufferizeTypeConverter* converter, RemoveSignTypeConverter* sign_converter,
-    OwningRewritePatternList* patterns,
+    bufferization::BufferizeTypeConverter* converter,
+    RemoveSignTypeConverter* sign_converter, OwningRewritePatternList* patterns,
     std::function<bool(Operation*)> enforce_identity_maps) {
   MLIRContext* context = patterns->getContext();
   patterns->insert<HloToMemrefDynamicBroadcastInDimOpConverter>(
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/input_inline_fusion_pass.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/input_inline_fusion_pass.cc
index 4bccb2f648e808..b297724a1a35db 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/input_inline_fusion_pass.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/input_inline_fusion_pass.cc
@@ -235,13 +235,13 @@ bool elemwiseFuseHelper(PatternRewriter& rewriter, Operation* user,
         rewriter.create<LoadOp>(loc, producer_operand, load_op.getIndices()));
   }
   auto inlined_result =
-      HloOpToStdScalarOp::map<LHLO_OpTy>(llvm::cast<LHLO_OpTy>(producer),
-                                         cast<LmhloOp>(producer)
-                                             .getResultBuffer()
-                                             .getType()
-                                             .cast<MemRefType>()
-                                             .getElementType(),
-                                         operand_values, &rewriter);
+      LhloOpToStdScalarOp::map<LHLO_OpTy>(llvm::cast<LHLO_OpTy>(producer),
+                                          cast<LmhloOp>(producer)
+                                              .getResultBuffer()
+                                              .getType()
+                                              .cast<MemRefType>()
+                                              .getElementType(),
+                                          operand_values, &rewriter);
 
   for (LoadOp to_be_replaced : load_ops)
     to_be_replaced.replaceAllUsesWith(inlined_result);
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc
index fa409a03f88950..3961b8c804db27 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/legalize_to_linalg.cc
@@ -20,11 +20,10 @@ limitations under the License.
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/StringSet.h"
-#include "mlir-hlo/Dialect/lhlo/IR/lhlo_ops.h"
 #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
 #include "mlir-hlo/Dialect/mhlo/IR/hlo_ops_base_attrs.h"
 #include "mlir-hlo/Dialect/mhlo/transforms/PassDetail.h"
-#include "mlir-hlo/Dialect/mhlo/transforms/map_lmhlo_to_scalar_op.h"
+#include "mlir-hlo/Dialect/mhlo/transforms/map_mhlo_to_scalar_op.h"
 #include "mlir-hlo/Dialect/mhlo/transforms/rewriters.h"
 #include "mlir-hlo/Dialect/mhlo/transforms/type_conversion.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
@@ -86,25 +85,18 @@ SmallVector<StringRef, 3> GetNParallelLoopsAttrs(unsigned nParallelLoops) {
   return GetParallelAndReductionIterators(nParallelLoops, 0);
 }
 
-template <bool isLHLO = true>
-Value GetResultValue(Operation* op) {
-  return isLHLO ? op->getOperand(op->getNumOperands() - 1) : op->getResult(0);
-}
+Value GetResultValue(Operation* op) { return op->getResult(0); }
 
-template <bool isLHLO = true>
 ShapedType GetHloOpResultType(Operation* op) {
-  return GetResultValue<isLHLO>(op).getType().template cast<ShapedType>();
+  return GetResultValue(op).getType().cast<ShapedType>();
 }
 
-template <bool isLHLO = true>
 bool VerifyHloOpBufferOrTensorSemantics(Operation* op) {
   auto verify_type = [&](Value val) -> bool {
-    return (isLHLO && val.getType().isa<MemRefType>()) ||
-           (!isLHLO && val.getType().isa<RankedTensorType>());
+    return val.getType().isa<RankedTensorType>();
   };
   if (!llvm::all_of(op->getOperands(), verify_type)) return false;
-  return isLHLO ? op->getResults().empty()
-                : llvm::all_of(op->getResults(), verify_type);
+  return llvm::all_of(op->getResults(), verify_type);
 }
 
 Value GetInitTensor(OpBuilder& b, Location loc, ShapedType type,
@@ -454,8 +446,7 @@ class EinsumToLinalgConverter : public OpConversionPattern<mhlo::EinsumOp> {
     }
 
     // Find result type, if on tensors.
-    auto result_ty = this->typeConverter
-                         ->convertType(GetHloOpResultType</*isLHLO=*/false>(op))
+    auto result_ty = this->typeConverter->convertType(GetHloOpResultType(op))
                          .dyn_cast<RankedTensorType>();
 
     // Check result type compatibility.
@@ -611,7 +602,7 @@ bool EinsumToLinalgConverter::CheckBatchHasEqualRank(
   return batch_has_equal_rank;
 }
 
-template <typename OpTy, bool isLHLO = true>
+template <typename OpTy>
 class PointwiseToLinalgConverter : public OpConversionPattern<OpTy> {
  public:
   using OpConversionPattern<OpTy>::OpConversionPattern;
@@ -629,8 +620,6 @@ class PointwiseToLinalgConverter : public OpConversionPattern<OpTy> {
         it != adaptor.getOperands().end() ? *it : adaptor.getOperands().front();
     int64_t nloops = get_rank(max_rank_arg);
 
-    if (isLHLO && nloops == 0) return failure();
-
     // Apply only if all operands are scalar or have the same rank. Some ops,
     // like `mhlo.select`, support implicit broadcasting of scalars.
     if (!llvm::all_of(adaptor.getOperands(), [&](Value v) {
@@ -643,27 +632,22 @@ class PointwiseToLinalgConverter : public OpConversionPattern<OpTy> {
 
     // Find result type, if on tensors.
     Optional<ShapedType> result_ty;
-    if (!isLHLO) {
-      result_ty = this->typeConverter->convertType(op->getResultTypes().front())
-                      .template dyn_cast<ShapedType>();
-
-      // Check result type compatibility.
-      if (!result_ty || !result_ty->hasRank() ||
-          result_ty->getRank() != nloops ||
-          !(result_ty->getElementType().isSignlessIntOrFloat() ||
-            result_ty->getElementType().isa<ComplexType>())) {
-        return rewriter.notifyMatchFailure(
-            op, "mismatched operand/result types or iterator count");
-      }
+    result_ty = this->typeConverter->convertType(op->getResultTypes().front())
+                    .template dyn_cast<ShapedType>();
+
+    // Check result type compatibility.
+    if (!result_ty || !result_ty->hasRank() || result_ty->getRank() != nloops ||
+        !(result_ty->getElementType().isSignlessIntOrFloat() ||
+          result_ty->getElementType().isa<ComplexType>())) {
+      return rewriter.notifyMatchFailure(
+          op, "mismatched operand/result types or iterator count");
     }
 
     // Find input/output values and types.
     auto loc = op.getLoc();
-    ValueRange inputs =
-        isLHLO ? adaptor.getOperands().drop_back() : adaptor.getOperands();
-    Value output = isLHLO ? adaptor.getOperands().back()
-                          : GetInitTensorFor(rewriter, loc, *result_ty, op,
-                                             adaptor.getOperands());
+    ValueRange inputs = adaptor.getOperands();
+    Value output =
+        GetInitTensorFor(rewriter, loc, *result_ty, op, adaptor.getOperands());
 
     // Create indexing maps.
     AffineMap scalar_map = AffineMap::get(nloops, 0, rewriter.getContext());
@@ -678,10 +662,8 @@ class PointwiseToLinalgConverter : public OpConversionPattern<OpTy> {
         loc, result_ty ? *result_ty : TypeRange{}, inputs, output, maps,
         GetNParallelLoopsAttrs(nloops),
         [&](OpBuilder& nested_builder, Location nested_loc, ValueRange args) {
-          // TODO(ravishankarm) : For now use the method in lmhlo namespace.
-          // That method needs to be moved out of there.
           Type inner_result_ty = getElementTypeOrSelf(output);
-          Value inner_result = lmhlo::HloOpToStdScalarOp::map<OpTy>(
+          Value inner_result = mhlo::MhloOpToStdScalarOp::map<OpTy>(
               op, inner_result_ty,
               llvm::to_vector<2>(args.take_front(inputs.size())), &rewriter);
           if (inner_result == nullptr) {
@@ -697,31 +679,30 @@ class PointwiseToLinalgConverter : public OpConversionPattern<OpTy> {
   }
 };
 
-template <typename LhloOp>
-class ScalarPointwiseToStandardConverter : public OpConversionPattern<LhloOp> {
+template <typename MhloOp>
+class ScalarPointwiseToStandardConverter : public OpConversionPattern<MhloOp> {
  public:
-  using OpConversionPattern<LhloOp>::OpConversionPattern;
+  using OpConversionPattern<MhloOp>::OpConversionPattern;
 
   LogicalResult matchAndRewrite(
-      LhloOp lhlo_op, typename LhloOp::Adaptor adaptor,
+      MhloOp mhlo_op, typename MhloOp::Adaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
-    auto loc = lhlo_op.getLoc();
+    auto loc = mhlo_op.getLoc();
     auto arg_type =
-        lhlo_op.getOperand(0).getType().template dyn_cast<ShapedType>();
+        mhlo_op.getOperand(0).getType().template dyn_cast<ShapedType>();
     if (!arg_type || !arg_type.getElementType().isSignlessIntOrFloat() ||
         (arg_type.getRank() != 0)) {
       return failure();
     }
 
     // Create two loads from the input.
-    auto lhs = rewriter.create<memref::LoadOp>(loc, lhlo_op.lhs());
-    auto rhs = rewriter.create<memref::LoadOp>(loc, lhlo_op.rhs());
-    // TODO(ravishankarm) : Move this method out of lmhlo namespace.
-    Value op_result = lmhlo::HloOpToStdScalarOp::map<LhloOp>(
-        lhlo_op, arg_type.getElementType(), llvm::ArrayRef<Value>{lhs, rhs},
+    auto lhs = rewriter.create<memref::LoadOp>(loc, mhlo_op.lhs());
+    auto rhs = rewriter.create<memref::LoadOp>(loc, mhlo_op.rhs());
+    Value op_result = mhlo::MhloOpToStdScalarOp::map<MhloOp>(
+        mhlo_op, arg_type.getElementType(), llvm::ArrayRef<Value>{lhs, rhs},
         &rewriter);
-    rewriter.create<memref::StoreOp>(loc, op_result, lhlo_op.out());
-    rewriter.eraseOp(lhlo_op);
+    rewriter.create<memref::StoreOp>(loc, op_result, mhlo_op.out());
+    rewriter.eraseOp(mhlo_op);
     return success();
   }
 };
@@ -731,7 +712,7 @@ class ScalarPointwiseToStandardConverter : public OpConversionPattern<LhloOp> {
 /// transpose, some reshape, etc.). The derived classes need to provide a method
 /// `getIndexingMaps` that returns AffineMaps for the index maps of the input
 /// and the output.
-template <typename Derived, typename OpTy, bool isLHLO = true>
+template <typename Derived, typename OpTy>
 class DataMovementOpConverter : public OpConversionPattern<OpTy> {
  public:
   using OpConversionPattern<OpTy>::OpConversionPattern;
@@ -739,8 +720,8 @@ class DataMovementOpConverter : public OpConversionPattern<OpTy> {
   LogicalResult matchAndRewrite(
       OpTy op, typename OpTy::Adaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
-    if (!VerifyHloOpBufferOrTensorSemantics<isLHLO>(op)) return failure();
-    auto result_type = GetHloOpResultType<isLHLO>(op);
+    if (!VerifyHloOpBufferOrTensorSemantics(op)) return failure();
+    auto result_type = GetHloOpResultType(op);
     result_type = this->typeConverter->convertType(result_type)
                       .template cast<ShapedType>();
 
@@ -752,12 +733,12 @@ class DataMovementOpConverter : public OpConversionPattern<OpTy> {
     auto loc = op.getLoc();
     auto linalg_op = rewriter.create<linalg::GenericOp>(
         loc,
-        /*resultTensorTypes=*/isLHLO ? ArrayRef<Type>{} : result_type,
+        /*resultTensorTypes=*/result_type,
         /*inputs=*/adaptor.getOperands().front(),
         /*outputBuffers=*/
-        isLHLO ? ValueRange{adaptor.getOperands().back()}
-               : ValueRange{GetInitTensorFor(rewriter, loc, result_type, op,
-                                             adaptor.getOperands())},
+
+        ValueRange{GetInitTensorFor(rewriter, loc, result_type, op,
+                                    adaptor.getOperands())},
         indexing_maps, GetNParallelLoopsAttrs(nloops),
         [&](OpBuilder& nested_builder, Location nested_loc, ValueRange args) {
           nested_builder.create<linalg::YieldOp>(loc, *args.begin());
@@ -769,20 +750,19 @@ class DataMovementOpConverter : public OpConversionPattern<OpTy> {
 };
 
 /// Pattern to convert BroadcastOp to Linalg ops.
-template <typename OpTy, bool isLHLO = true>
+template <typename OpTy>
 class BroadcastConverter
-    : public DataMovementOpConverter<BroadcastConverter<OpTy, isLHLO>, OpTy,
-                                     isLHLO> {
+    : public DataMovementOpConverter<BroadcastConverter<OpTy>, OpTy> {
  public:
-  using DataMovementOpConverter<BroadcastConverter, OpTy,
-                                isLHLO>::DataMovementOpConverter;
+  using DataMovementOpConverter<BroadcastConverter,
+                                OpTy>::DataMovementOpConverter;
 
   static SmallVector<AffineMap, 2> getIndexingMaps(OpTy broadcast_op,
                                                    Builder* b) {
     ShapedType input_type =
         broadcast_op.operand().getType().template cast<ShapedType>();
     unsigned input_rank = input_type.getRank();
-    unsigned nloops = GetHloOpResultType<isLHLO>(broadcast_op).getRank();
+    unsigned nloops = GetHloOpResultType(broadcast_op).getRank();
 
     // BroadcastOp prepends the dimensions in the `broadcast_sizes` attribute to
     // the input's dimensions.
@@ -808,15 +788,15 @@ class BroadcastConverter
 
 class HloBroadcastInDimConverter
     : public DataMovementOpConverter<HloBroadcastInDimConverter,
-                                     mhlo::BroadcastInDimOp, false> {
+                                     mhlo::BroadcastInDimOp> {
  public:
-  using DataMovementOpConverter<HloBroadcastInDimConverter,
-                                mhlo::BroadcastInDimOp,
-                                false>::DataMovementOpConverter;
+  using DataMovementOpConverter<
+      HloBroadcastInDimConverter,
+      mhlo::BroadcastInDimOp>::DataMovementOpConverter;
 
   static SmallVector<AffineMap, 2> getIndexingMaps(
       mhlo::BroadcastInDimOp broadcast_op, Builder* b) {
-    auto result_type = GetHloOpResultType<false>(broadcast_op);
+    auto result_type = GetHloOpResultType(broadcast_op);
     auto operand_type =
         broadcast_op.operand().getType().template cast<ShapedType>();
     unsigned nloops = result_type.getRank();
@@ -907,173 +887,14 @@ class HloDynamicBroadcastInDimConverter
   }
 };
 
-class LhloBroadcastInDimConverter
-    : public OpConversionPattern<lmhlo::BroadcastInDimOp> {
- public:
-  using OpConversionPattern<lmhlo::BroadcastInDimOp>::OpConversionPattern;
-
-  LogicalResult matchAndRewrite(
-      lmhlo::BroadcastInDimOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter& rewriter) const final {
-    auto result_type = adaptor.output().getType().cast<MemRefType>();
-    auto result_shape = result_type.getShape();
-
-    auto operand_and_dims = InsertReshapeIfNecessary(op, adaptor, rewriter);
-
-    Value operand = std::get<0>(operand_and_dims);
-    auto broadcast_dims = std::get<1>(operand_and_dims);
-
-    auto loc = op.getLoc();
-    auto nloops = result_type.getRank();
-    auto operand_type = operand.getType().cast<MemRefType>();
-
-    // For a degenerate case, i.e. broadcasting with expansion of
-    // memref<1xELEMENT_TYPE>, the operand is not passed to `linalg.generic`.
-    // Instead the value is loaded and used directly in `linalg.yield`.
-    if (operand_type.getRank() == 1 &&
-        operand_type.getDimSize(0) <
-            result_type.getDimSize(broadcast_dims.front())) {
-      Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
-      Value val = rewriter.create<memref::LoadOp>(loc, operand,
-                                                  llvm::makeArrayRef({zero}));
-      rewriter.create<linalg::GenericOp>(
-          loc, /*inputs=*/ValueRange{},
-          /*outputBuffers=*/ValueRange{adaptor.output()},
-          llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)),
-          GetNParallelLoopsAttrs(nloops),
-          [&](OpBuilder& nested_builder, Location nested_loc, ValueRange args) {
-            nested_builder.create<linalg::YieldOp>(loc, val);
-          },
-          PruneAttributeList(op));
-
-    } else {
-      auto indexing_maps = getIndexingMaps(op, broadcast_dims, result_shape,
-                                           operand_type, &rewriter);
-      rewriter.create<linalg::GenericOp>(
-          loc, /*inputs=*/ValueRange{operand},
-          /*outputBuffers=*/ValueRange{adaptor.output()}, indexing_maps,
-          GetNParallelLoopsAttrs(nloops),
-          [&](OpBuilder& nested_builder, Location nested_loc, ValueRange args) {
-            nested_builder.create<linalg::YieldOp>(loc, *args.begin());
-          },
-          PruneAttributeList(op));
-    }
-    rewriter.replaceOp(op, llvm::None);
-    return success();
-  }
-
-  // Inserts 'linalg.reshape' if there is a size-1 dim expansion.
-  std::pair<Value, SmallVector<int64_t, 2>> InsertReshapeIfNecessary(
-      lmhlo::BroadcastInDimOp op, OpAdaptor adaptor,
-      ConversionPatternRewriter& rewriter) const {
-    Value operand = adaptor.operand();
-    auto operand_type = adaptor.operand().getType().cast<MemRefType>();
-    auto operand_shape = operand_type.getShape();
-
-    Value result = adaptor.output();
-    auto result_type = result.getType().cast<MemRefType>();
-    auto result_shape = result_type.getShape();
-
-    SmallVector<int64_t, 2> operand_strides;
-    int64_t operand_offset;
-    if (failed(getStridesAndOffset(operand_type, operand_strides,
-                                   operand_offset))) {
-      op.emitOpError() << "Failed to get offset and strides.";
-    }
-
-    SmallVector<int64_t, 2> new_shape, new_strides, broadcast_dims;
-    SmallVector<ReassociationIndices, 4> collapsed_dims_list;
-    ReassociationIndices collapsed_dims;
-    for (const auto& item :
-         enumerate(op.broadcast_dimensions().getValues<APInt>())) {
-      size_t index = item.index();
-      int dim = item.value().getSExtValue();
-
-      collapsed_dims.push_back(index);
-
-      bool expansion_needed =
-          operand_shape[index] == 1 && result_shape[dim] != 1;
-      if (expansion_needed) {
-        continue;
-      }
-      new_shape.push_back(operand_shape[index]);
-      new_strides.push_back(operand_strides[index]);
-      broadcast_dims.push_back(dim);
-
-      collapsed_dims_list.push_back(collapsed_dims);
-      collapsed_dims.clear();
-    }
-    // If `collapsed_dims_list` is empty, then the memref has shape [1, ..., 1]
-    // and all dimensions need expansion. Such memref will be reshaped to a 1D
-    // memref with a single element. New shape and strides needs to be updated
-    // accordingly.
-    if (collapsed_dims_list.empty()) {
-      collapsed_dims_list.push_back({});
-      new_shape.push_back(1);
-      new_strides.push_back(1);
-      broadcast_dims.push_back(0);
-    }
-    for (const auto& dims : collapsed_dims) {
-      collapsed_dims_list.back().push_back(dims);
-    }
-
-    // `linalg.collapse_shape` is inserted only if necessary, i.e. when the rank
-    // can be reduced.
-    if (new_shape.size() < operand_shape.size()) {
-      auto new_memref_type = MemRefType::get(
-          new_shape, operand_type.getElementType(),
-          makeStridedLinearLayoutMap(new_strides, operand_offset,
-                                     rewriter.getContext()));
-      operand = rewriter.create<memref::CollapseShapeOp>(
-          op.getLoc(), new_memref_type, adaptor.operand(), collapsed_dims_list);
-    }
-    return std::make_pair(operand, broadcast_dims);
-  }
-
-  SmallVector<AffineMap, 2> getIndexingMaps(lmhlo::BroadcastInDimOp op,
-                                            ArrayRef<int64_t> broadcast_dims,
-                                            ArrayRef<int64_t> result_shape,
-                                            MemRefType operand_type,
-                                            Builder* b) const {
-    unsigned nloops = result_shape.size();
-
-    // The input is a scalar, i.e. this is a scalar broadcast op.
-    if (operand_type.getRank() == 0) {
-      return {AffineMap::get(nloops, /*symbolCount=*/0, b->getContext()),
-              b->getMultiDimIdentityMap(nloops)};
-    }
-
-    auto operand_shape = operand_type.getShape();
-    SmallVector<AffineExpr, 4> dim_exprs;
-    dim_exprs.reserve(nloops);
-
-    for (const auto& broadcast_dim : llvm::enumerate(broadcast_dims)) {
-      int size = broadcast_dim.value();
-      bool expansion_needed =
-          operand_shape[broadcast_dim.index()] == 1 && result_shape[size] != 1;
-      if (expansion_needed) {
-        op.emitOpError(
-            "BroadcastInDimOp lowering to Linalg does not support size-1 "
-            "dimensions expansion.");
-      }
-      dim_exprs.push_back(b->getAffineDimExpr(size));
-    }
-    return {
-        AffineMap::get(nloops, /*symbolCount=*/0, dim_exprs, b->getContext()),
-        b->getMultiDimIdentityMap(nloops)};
-  }
-};
-
-template <typename OpTy, bool isLHLO = true>
+template <typename OpTy>
 class TransposeConverter
-    : public DataMovementOpConverter<TransposeConverter<OpTy, isLHLO>, OpTy,
-                                     isLHLO> {
+    : public DataMovementOpConverter<TransposeConverter<OpTy>, OpTy> {
  public:
-  using DataMovementOpConverter<TransposeConverter<OpTy, isLHLO>, OpTy,
-                                isLHLO>::DataMovementOpConverter;
+  using DataMovementOpConverter<TransposeConverter<OpTy>,
+                                OpTy>::DataMovementOpConverter;
   static SmallVector<AffineMap, 2> getIndexingMaps(OpTy op, Builder* b) {
-    auto result_type =
-        GetHloOpResultType<isLHLO>(op).template cast<ShapedType>();
+    auto result_type = GetHloOpResultType(op).template cast<ShapedType>();
     auto nloops = result_type.getRank();
     SmallVector<AffineExpr, 2> input_exprs;
     input_exprs.resize(result_type.getRank());
@@ -1089,150 +910,68 @@ class TransposeConverter
 
 // Converts reshape ops that can be proven to be either a collapse of dimensions
 // or expansion of dimensions of the operand.
-template <typename OpTy, bool isLHLO = true>
-class ReshapeOpConverter : public OpConversionPattern<OpTy> {
+class ReshapeOpConverter : public OpConversionPattern<mhlo::ReshapeOp> {
  public:
-  using OpConversionPattern<OpTy>::OpConversionPattern;
+  using OpConversionPattern::OpConversionPattern;
 
   LogicalResult matchAndRewrite(
-      OpTy reshape_op, typename OpTy::Adaptor adaptor,
+      mhlo::ReshapeOp reshape_op, mhlo::ReshapeOp::Adaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
-    if (!VerifyHloOpBufferOrTensorSemantics<isLHLO>(reshape_op))
-      return failure();
-    ShapedType operand_type =
-        adaptor.operand().getType().template cast<ShapedType>();
-    ShapedType result_type = GetHloOpResultType<isLHLO>(reshape_op);
+    if (!VerifyHloOpBufferOrTensorSemantics(reshape_op)) return failure();
+    auto operand_type = adaptor.operand().getType().cast<ShapedType>();
+    auto result_type = reshape_op.getType().cast<ShapedType>();
 
     if (!result_type.hasStaticShape()) return failure();
 
-    result_type = this->typeConverter->convertType(result_type)
-                      .template cast<ShapedType>();
+    result_type = typeConverter->convertType(result_type).cast<ShapedType>();
 
-    // Compute the reassociation maps for the linalg operation.
-    ArrayRef<int64_t> src_shape =
-        (operand_type.getRank() > result_type.getRank()
-             ? operand_type.getShape()
-             : result_type.getShape());
-    ArrayRef<int64_t> dst_shape =
-        (operand_type.getRank() > result_type.getRank()
-             ? result_type.getShape()
-             : operand_type.getShape());
-    unsigned curr_src_dim = 0, curr_dst_dim = 0;
-    SmallVector<ReassociationExprs, 4> reassociation_map(dst_shape.size());
-
-    // First scan all dimensions in the source shapes to see whether we have a
-    // perfect case where consecutive dimensions in source are collapsed. For
-    // such case we can just generate one single linalg.reshape.
-    bool is_collapsing_source = operand_type.hasStaticShape();
-    while (is_collapsing_source && curr_src_dim < src_shape.size() &&
-           curr_dst_dim < dst_shape.size()) {
-      int64_t dst_size = dst_shape[curr_dst_dim];
-      int64_t src_size = src_shape[curr_src_dim];
-      while (src_size < dst_size && curr_src_dim < src_shape.size()) {
-        reassociation_map[curr_dst_dim].push_back(
-            rewriter.getAffineDimExpr(curr_src_dim++));
-        src_size *= src_shape[curr_src_dim];
-      }
-      if (src_size == dst_size) {
-        reassociation_map[curr_dst_dim].push_back(
-            rewriter.getAffineDimExpr(curr_src_dim++));
-        // If the next dim in dst_shape is not 1, treat subsequent dims in
-        // src_shape which are 1 to be collapsed.
-        if (curr_dst_dim == dst_shape.size() - 1 ||
-            dst_shape[curr_dst_dim + 1] != 1) {
-          while (curr_src_dim < src_shape.size() &&
-                 src_shape[curr_src_dim] == 1) {
-            reassociation_map[curr_dst_dim].push_back(
-                rewriter.getAffineDimExpr(curr_src_dim++));
-          }
-        }
-      } else {
-        is_collapsing_source = false;
-        break;
-      }
-      curr_dst_dim++;
-    }
-    // Rank 0 can always use the direct lowering.
-    if (!src_shape.empty() && !dst_shape.empty() &&
-        (curr_src_dim != src_shape.size() || curr_dst_dim != dst_shape.size()))
-      is_collapsing_source = false;
-
-    // Otherwise, we need to first reduce all source dimensions into one and
-    // then expand to the destination dimensions.
-    if (!is_collapsing_source) {
-      auto get_identity_exprs = [&rewriter](int n) {
-        SmallVector<AffineExpr, 4> exprs;
-        for (int i = 0; i < n; ++i)
-          exprs.push_back(rewriter.getAffineDimExpr(i));
-        return exprs;
-      };
-      Location loc = reshape_op.getLoc();
-      int64_t total_elems = result_type.getNumElements();
-      auto elem_type = operand_type.getElementType();
-      SmallVector<ReassociationExprs, 4> collapsing_map = {
-          // Use operand_type here because we need to collapse all operands
-          // dimensions.
-          get_identity_exprs(operand_type.getShape().size())};
-      SmallVector<ReassociationExprs, 4> expanding_map = {
-          // Use result_type here because we need to expand to all result
-          // dimensions.
-          get_identity_exprs(result_type.getShape().size())};
-
-      if (isLHLO) {
-        auto collapsed_type = MemRefType::get({total_elems}, elem_type);
-        Value collapsed_op = rewriter.create<memref::CollapseShapeOp>(
-            loc, collapsed_type, adaptor.getOperands()[0], collapsing_map);
-        Value reshape_buffer = rewriter.create<memref::ExpandShapeOp>(
-            loc, result_type, collapsed_op, expanding_map);
-        rewriter.replaceOpWithNewOp<linalg::CopyOp>(reshape_op, reshape_buffer,
-                                                    adaptor.getOperands()[1]);
+    // Compute the reassociation maps for the linalg operation. This will
+    // succeed if the reshape can be done with a single expand_shape or
+    // collapse_shape.
+    if (Optional<SmallVector<ReassociationIndices>> reassociation_map =
+            getReassociationIndicesForReshape(operand_type, result_type)) {
+      if (result_type.getRank() < operand_type.getRank()) {
+        rewriter.replaceOpWithNewOp<linalg::TensorCollapseShapeOp>(
+            reshape_op, result_type, adaptor.operand(), *reassociation_map);
       } else {
-        Value collapsed_op = rewriter.create<linalg::TensorCollapseShapeOp>(
-            loc, adaptor.operand(), collapsing_map);
-        // Cast to a known static type if the input has dynamic dimensions.
-        auto collapsed_type = RankedTensorType::get({total_elems}, elem_type);
-        collapsed_op =
-            rewriter.create<tensor::CastOp>(loc, collapsed_type, collapsed_op);
         rewriter.replaceOpWithNewOp<linalg::TensorExpandShapeOp>(
-            reshape_op, result_type, collapsed_op, expanding_map);
+            reshape_op, result_type, adaptor.operand(), *reassociation_map);
       }
       return success();
     }
 
-    bool isCollapsing = result_type.getRank() < adaptor.getOperands()[0]
-                                                    .getType()
-                                                    .template cast<ShapedType>()
-                                                    .getRank();
-    if (isLHLO) {
-      Value reshape_buffer =
-          isCollapsing ? rewriter
-                             .create<memref::CollapseShapeOp>(
-                                 reshape_op.getLoc(), result_type,
-                                 adaptor.getOperands()[0], reassociation_map)
-                             .getResult()
-                       : rewriter
-                             .create<memref::ExpandShapeOp>(
-                                 reshape_op.getLoc(), result_type,
-                                 adaptor.getOperands()[0], reassociation_map)
-                             .getResult();
-      rewriter.replaceOpWithNewOp<linalg::CopyOp>(reshape_op, reshape_buffer,
-                                                  adaptor.getOperands()[1]);
-    } else {
-      if (isCollapsing) {
-        rewriter.replaceOpWithNewOp<linalg::TensorCollapseShapeOp>(
-            reshape_op, result_type, adaptor.getOperands()[0],
-            reassociation_map);
-      } else {
-        rewriter.replaceOpWithNewOp<linalg::TensorExpandShapeOp>(
-            reshape_op, result_type, adaptor.getOperands()[0],
-            reassociation_map);
-      }
-    }
+    // Otherwise, we need to first reduce all source dimensions into one and
+    // then expand to the destination dimensions.
+    auto get_identity_exprs = [&rewriter](int n) {
+      SmallVector<AffineExpr, 4> exprs;
+      for (int i = 0; i < n; ++i) exprs.push_back(rewriter.getAffineDimExpr(i));
+      return exprs;
+    };
+    Location loc = reshape_op.getLoc();
+    int64_t total_elems = result_type.getNumElements();
+    auto elem_type = operand_type.getElementType();
+    SmallVector<ReassociationExprs, 4> collapsing_map = {
+        // Use operand_type here because we need to collapse all operands
+        // dimensions.
+        get_identity_exprs(operand_type.getRank())};
+    SmallVector<ReassociationExprs, 4> expanding_map = {
+        // Use result_type here because we need to expand to all result
+        // dimensions.
+        get_identity_exprs(result_type.getRank())};
+
+    Value collapsed_op = rewriter.create<linalg::TensorCollapseShapeOp>(
+        loc, adaptor.operand(), collapsing_map);
+    // Cast to a known static type if the input has dynamic dimensions.
+    auto collapsed_type = RankedTensorType::get({total_elems}, elem_type);
+    collapsed_op =
+        rewriter.create<tensor::CastOp>(loc, collapsed_type, collapsed_op);
+    rewriter.replaceOpWithNewOp<linalg::TensorExpandShapeOp>(
+        reshape_op, result_type, collapsed_op, expanding_map);
     return success();
   }
 };
 
-template <typename OpTy, bool isLHLO = true>
+template <typename OpTy>
 class IotaConverter : public OpConversionPattern<OpTy> {
  public:
   using OpConversionPattern<OpTy>::OpConversionPattern;
@@ -1240,7 +979,7 @@ class IotaConverter : public OpConversionPattern<OpTy> {
   LogicalResult matchAndRewrite(
       OpTy iota_op, typename OpTy::Adaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
-    ShapedType result_shaped_type = GetHloOpResultType<isLHLO>(iota_op);
+    ShapedType result_shaped_type = GetHloOpResultType(iota_op);
     if (!result_shaped_type) return failure();
     result_shaped_type = this->typeConverter->convertType(result_shaped_type)
                              .template dyn_cast<ShapedType>();
@@ -1255,12 +994,12 @@ class IotaConverter : public OpConversionPattern<OpTy> {
     auto linalg_op = rewriter.create<linalg::GenericOp>(
         loc,
         /*resultTensorTypes=*/
-        isLHLO ? ArrayRef<Type>{} : ArrayRef<Type>{result_shaped_type},
+        ArrayRef<Type>{result_shaped_type},
         /*inputs=*/ValueRange{},
         /*outputBuffers=*/
-        isLHLO ? ValueRange{adaptor.getOperands().back()}
-               : ValueRange{GetInitTensorFor(rewriter, loc, result_shaped_type,
-                                             iota_op, adaptor.getOperands())},
+
+        ValueRange{GetInitTensorFor(rewriter, loc, result_shaped_type, iota_op,
+                                    adaptor.getOperands())},
         llvm::makeArrayRef(rewriter.getMultiDimIdentityMap(nloops)),
         GetNParallelLoopsAttrs(nloops),
         [&](OpBuilder& nested_builder, Location nested_loc, ValueRange args) {
@@ -1277,10 +1016,7 @@ class IotaConverter : public OpConversionPattern<OpTy> {
           nested_builder.create<linalg::YieldOp>(nested_loc, cast_op);
         },
         PruneAttributeList(iota_op));
-    if (isLHLO)
-      rewriter.replaceOp(iota_op, llvm::None);
-    else
-      rewriter.replaceOp(iota_op, linalg_op.result_tensors());
+    rewriter.replaceOp(iota_op, linalg_op.result_tensors());
     return success();
   }
 };
@@ -1375,25 +1111,6 @@ struct ConcatenateConverter : public OpConversionPattern<mhlo::ConcatenateOp> {
   }
 };
 
-class ConstConverterBuffer : public OpConversionPattern<lmhlo::ConstOp> {
- public:
-  using OpConversionPattern::OpConversionPattern;
-
-  LogicalResult matchAndRewrite(
-      lmhlo::ConstOp const_op, OpAdaptor adaptor,
-      ConversionPatternRewriter& rewriter) const final {
-    Location loc = const_op.getLoc();
-    auto value_attr = const_op.value().cast<DenseElementsAttr>();
-    if (value_attr.getType().getRank() != 0) return failure();
-    Value std_scalar_const = rewriter.create<mlir::arith::ConstantOp>(
-        loc, value_attr.getValues<Attribute>()[0]);
-    rewriter.create<mlir::AffineStoreOp>(loc, std_scalar_const,
-                                         const_op.getOperand(), llvm::None);
-    rewriter.eraseOp(const_op);
-    return success();
-  }
-};
-
 class ConstConverterTensor : public OpConversionPattern<mhlo::ConstOp> {
  public:
   using OpConversionPattern::OpConversionPattern;
@@ -1414,125 +1131,15 @@ class ConstConverterTensor : public OpConversionPattern<mhlo::ConstOp> {
   }
 };
 
-class ReduceConverter : public OpConversionPattern<lmhlo::ReduceOp> {
- public:
-  using OpConversionPattern<lmhlo::ReduceOp>::OpConversionPattern;
-
-  LogicalResult matchAndRewrite(
-      lmhlo::ReduceOp reduce_op, OpAdaptor adaptor,
-      ConversionPatternRewriter& rewriter) const final {
-    auto loc = reduce_op.getLoc();
-    auto operand_shape =
-        adaptor.inputs()[0].getType().template dyn_cast<ShapedType>();
-    if (!operand_shape || !operand_shape.hasRank()) {
-      return rewriter.notifyMatchFailure(reduce_op, "expects known-rank args");
-    }
-
-    // First fill the output buffer with the init value.
-    for (auto it : llvm::zip(adaptor.init_values(), adaptor.out())) {
-      Value init_value = rewriter.create<memref::LoadOp>(loc, std::get<0>(it));
-      rewriter.create<linalg::FillOp>(loc, init_value, std::get<1>(it));
-    }
-
-    DenseIntElementsAttr dimensions_attr = reduce_op.dimensions();
-    SmallVector<int, 4> reduction_dims;
-    for (const auto& dim : dimensions_attr.getValues<APInt>()) {
-      reduction_dims.push_back(dim.getSExtValue());
-    }
-
-    SmallVector<AffineExpr, 2> src_exprs;
-    SmallVector<AffineExpr, 2> dst_exprs;
-    SmallVector<StringRef, 4> types;
-    for (int i = 0, rank = operand_shape.getRank(); i != rank; ++i) {
-      bool is_reduced = llvm::is_contained(reduction_dims, i);
-      types.push_back(is_reduced ? getReductionIteratorTypeName()
-                                 : getParallelIteratorTypeName());
-
-      src_exprs.push_back(mlir::getAffineDimExpr(i, rewriter.getContext()));
-      if (!is_reduced) {
-        dst_exprs.push_back(mlir::getAffineDimExpr(i, rewriter.getContext()));
-      }
-    }
-    SmallVector<ArrayRef<AffineExpr>, 4> affine_maps;
-    affine_maps.append(adaptor.inputs().size(), makeArrayRef(src_exprs));
-    affine_maps.append(adaptor.out().size(), makeArrayRef(dst_exprs));
-    auto maps = AffineMap::inferFromExprList(affine_maps);
-
-    auto linalg_op = rewriter.create<linalg::GenericOp>(
-        loc, /*resultTensorTypes=*/ArrayRef<Type>{},
-        /*inputs=*/adaptor.inputs(), /*outputBuffers=*/adaptor.out(), maps,
-        types, /*bodyBuild=*/nullptr, PruneAttributeList(reduce_op));
-    rewriter.inlineRegionBefore(reduce_op.body(), linalg_op.region(),
-                                linalg_op.region().end());
-    {
-      OpBuilder::InsertionGuard region_guard(rewriter);
-      Block* block = linalg_op.getBody();
-      rewriter.setInsertionPoint(&block->front());
-
-      // The incoming region is operating on buffers, while linalg.generic
-      // expects scalar SSA values. Add some allocs around the original op to
-      // make it compatible.
-      SmallVector<MemRefType, 4> mem_argv_tys;
-      SmallVector<Value, 4> alloc_values;
-      for (auto ty : block->getArgumentTypes()) {
-        mem_argv_tys.push_back(ty.cast<MemRefType>());
-        alloc_values.push_back(
-            rewriter.create<memref::AllocaOp>(loc, mem_argv_tys.back()));
-      }
-      size_t num_inputs =
-          adaptor.inputs().size() + adaptor.init_values().size();
-
-      // Now turn the existing signature
-      //   (memref<X>, memref<X>, memref<X>) -> ()
-      // into
-      //   (X, X) -> X
-      TypeConverter::SignatureConversion signature_converter(
-          alloc_values.size());
-      for (auto it : llvm::enumerate(alloc_values)) {
-        signature_converter.remapInput(it.index(), it.value());
-      }
-      for (auto ty : makeArrayRef(mem_argv_tys).take_front(num_inputs)) {
-        signature_converter.addInputs(ty.getElementType());
-      }
-
-      Block* entry_block = rewriter.applySignatureConversion(
-          &linalg_op.region(), signature_converter);
-
-      // Store the arguments into the newly allocated buffers.
-      rewriter.setInsertionPointAfter(alloc_values.back().getDefiningOp());
-      for (auto it :
-           enumerate(makeArrayRef(alloc_values).take_front(num_inputs))) {
-        rewriter.create<memref::StoreOp>(
-            loc, entry_block->getArgument(it.index()), it.value());
-      }
-      rewriter.replaceOp(entry_block->getTerminator(), {});
-
-      // Load & yield the result.
-      rewriter.setInsertionPointToEnd(entry_block);
-      auto output_values = makeArrayRef(alloc_values).slice(num_inputs);
-      SmallVector<Value, 4> load_results;
-      for (auto it : output_values) {
-        load_results.push_back(rewriter.create<memref::LoadOp>(loc, it));
-      }
-      rewriter.create<linalg::YieldOp>(loc, load_results);
-    }
-
-    rewriter.replaceOp(reduce_op, linalg_op.getOperation()->getResults());
-    return success();
-  }
-};
-
 // TODO(b/156787842): Support the lowering for dynamic shapes.
-template <typename OpTy, bool isLHLO = true>
+template <typename OpTy>
 class ReverseConverter
-    : public DataMovementOpConverter<ReverseConverter<OpTy, isLHLO>, OpTy,
-                                     isLHLO> {
+    : public DataMovementOpConverter<ReverseConverter<OpTy>, OpTy> {
  public:
-  using DataMovementOpConverter<ReverseConverter<OpTy, isLHLO>, OpTy,
-                                isLHLO>::DataMovementOpConverter;
+  using DataMovementOpConverter<ReverseConverter<OpTy>,
+                                OpTy>::DataMovementOpConverter;
   static SmallVector<AffineMap, 2> getIndexingMaps(OpTy op, Builder* b) {
-    auto result_type =
-        GetHloOpResultType<isLHLO>(op).template cast<ShapedType>();
+    auto result_type = GetHloOpResultType(op).template cast<ShapedType>();
     auto nloops = result_type.getRank();
     SmallVector<AffineExpr, 2> input_exprs;
     input_exprs.reserve(nloops);
@@ -1550,7 +1157,7 @@ class ReverseConverter
   }
 };
 
-template <typename OpTy, bool isLHLO = true>
+template <typename OpTy>
 class SliceConverter : public OpConversionPattern<OpTy> {
  public:
   using OpConversionPattern<OpTy>::OpConversionPattern;
@@ -1558,7 +1165,6 @@ class SliceConverter : public OpConversionPattern<OpTy> {
   LogicalResult matchAndRewrite(
       OpTy slice_op, typename OpTy::Adaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
-    auto loc = slice_op.getLoc();
     auto arg_type =
         adaptor.getOperands()[0].getType().template dyn_cast<ShapedType>();
     if (!arg_type || !arg_type.hasRank()) {
@@ -1579,15 +1185,8 @@ class SliceConverter : public OpConversionPattern<OpTy> {
           rewriter.getI64IntegerAttr((limit - 1 - start) / stride + 1));
       strides.push_back(rewriter.getI64IntegerAttr(stride));
     }
-    if (isLHLO) {
-      auto linalg_op = rewriter.create<memref::SubViewOp>(
-          loc, adaptor.getOperands()[0], offsets, sizes, strides);
-      rewriter.create<linalg::CopyOp>(loc, linalg_op, adaptor.getOperands()[1]);
-      rewriter.eraseOp(slice_op);
-    } else {
-      rewriter.replaceOpWithNewOp<tensor::ExtractSliceOp>(
-          slice_op, adaptor.getOperands()[0], offsets, sizes, strides);
-    }
+    rewriter.replaceOpWithNewOp<tensor::ExtractSliceOp>(
+        slice_op, adaptor.getOperands()[0], offsets, sizes, strides);
     return success();
   }
 };
@@ -1634,11 +1233,7 @@ class DynamicSliceConverter : public OpConversionPattern<mhlo::DynamicSliceOp> {
           loc, ub,
           rewriter.create<arith::ConstantOp>(
               loc, rewriter.getIntegerAttr(start_index.getType(), size)));
-      // TODO(hanchung): This is a workaround to use the method because only
-      // lmhlo version is defined. The implementation in
-      // map_lmhlo_to_scalar_op.h requires to pass a mhlo op. It will convert it
-      // to an lmhlo op and call the lmhlo implementation.
-      start_index = lmhlo::HloOpToStdScalarOp::map<lmhlo::ClampOp>(
+      start_index = mhlo::MhloOpToStdScalarOp::map<mhlo::ClampOp>(
           loc, start_index.getType(),
           ArrayRef<Type>{start_index.getType(), start_index.getType(),
                          start_index.getType()},
@@ -1709,11 +1304,7 @@ class DynamicUpdateSliceConverter
           loc, rewriter.getIntegerAttr(start_index_type,
                                        operand_type.getDimSize(en.index()) -
                                            update_type.getDimSize(en.index())));
-      // TODO(hanchung): This is a workaround to use the method because only
-      // lmhlo version is defined. The implementation in
-      // map_lmhlo_to_scalar_op.h requires to pass a mhlo op. It will convert it
-      // to an lmhlo op and call the lmhlo implementation.
-      start_index = lmhlo::HloOpToStdScalarOp::map<lmhlo::ClampOp>(
+      start_index = mhlo::MhloOpToStdScalarOp::map<mhlo::ClampOp>(
           loc, start_index_type,
           ArrayRef<Type>{start_index_type, start_index_type, start_index_type},
           ArrayRef<Value>{zero, start_index, ub}, &rewriter);
@@ -1794,7 +1385,7 @@ class DotOpOnTensorsConversion : public OpConversionPattern<mhlo::DotOp> {
   LogicalResult matchAndRewrite(
       mhlo::DotOp op, mhlo::DotOp::Adaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
-    if (!VerifyHloOpBufferOrTensorSemantics</*isLHLO=*/false>(op)) {
+    if (!VerifyHloOpBufferOrTensorSemantics(op)) {
       return failure();
     }
     if (GetDotOperationType(op) != op_type) return failure();
@@ -1835,7 +1426,7 @@ class DotGeneralOpOnTensorsConversion
   LogicalResult matchAndRewrite(
       mhlo::DotGeneralOp op, OpAdaptor adaptor,
       ConversionPatternRewriter& rewriter) const final {
-    if (!VerifyHloOpBufferOrTensorSemantics</*isLHLO=*/false>(op)) {
+    if (!VerifyHloOpBufferOrTensorSemantics(op)) {
       return failure();
     }
 
@@ -1902,7 +1493,7 @@ struct ReduceRegionXLAOpConversion : public OpConversionPattern<OpTy> {
         })) {
       return failure();
     }
-    Value result = lmhlo::HloOpToStdScalarOp::map<OpTy>(
+    Value result = mhlo::MhloOpToStdScalarOp::map<OpTy>(
         op, getElementTypeOrSelf(op.getType()), adaptor.getOperands(),
         &rewriter);
     rewriter.replaceOp(op, result);
@@ -2845,67 +2436,6 @@ struct ScatterUpdateOnTensorsConversion
   }
 };
 
-void populateLHLOToLinalgConversionPattern(MLIRContext* context,
-                                           TypeConverter& typeConverter,
-                                           OwningRewritePatternList* patterns) {
-  // clang-format off
-  patterns->insert<BroadcastConverter<lmhlo::BroadcastOp>,
-                   ConstConverterBuffer,
-                   IotaConverter<lmhlo::IotaOp>,
-                   LhloBroadcastInDimConverter,
-                   PointwiseToLinalgConverter<lmhlo::AbsOp>,
-                   PointwiseToLinalgConverter<lmhlo::AddOp>,
-                   PointwiseToLinalgConverter<lmhlo::AndOp>,
-                   PointwiseToLinalgConverter<lmhlo::Atan2Op>,
-                   PointwiseToLinalgConverter<lmhlo::BitcastConvertOp>,
-                   PointwiseToLinalgConverter<lmhlo::CeilOp>,
-                   PointwiseToLinalgConverter<lmhlo::ClampOp>,
-                   PointwiseToLinalgConverter<lmhlo::CompareOp>,
-                   PointwiseToLinalgConverter<lmhlo::ComplexOp>,
-                   PointwiseToLinalgConverter<lmhlo::ConvertOp>,
-                   // TODO(ataei): Remove this pattern, CopyOp is folded away.
-                   PointwiseToLinalgConverter<lmhlo::CopyOp>,
-                   PointwiseToLinalgConverter<lmhlo::CosOp>,
-                   PointwiseToLinalgConverter<lmhlo::DivOp>,
-                   PointwiseToLinalgConverter<lmhlo::ExpOp>,
-                   PointwiseToLinalgConverter<lmhlo::Expm1Op>,
-                   PointwiseToLinalgConverter<lmhlo::FloorOp>,
-                   PointwiseToLinalgConverter<lmhlo::ImagOp>,
-                   PointwiseToLinalgConverter<lmhlo::IsFiniteOp>,
-                   PointwiseToLinalgConverter<lmhlo::LogOp>,
-                   PointwiseToLinalgConverter<lmhlo::LogisticOp>,
-                   PointwiseToLinalgConverter<lmhlo::Log1pOp>,
-                   PointwiseToLinalgConverter<lmhlo::MaxOp>,
-                   PointwiseToLinalgConverter<lmhlo::MinOp>,
-                   PointwiseToLinalgConverter<lmhlo::MulOp>,
-                   PointwiseToLinalgConverter<lmhlo::NegOp>,
-                   PointwiseToLinalgConverter<lmhlo::NotOp>,
-                   PointwiseToLinalgConverter<lmhlo::OrOp>,
-                   PointwiseToLinalgConverter<lmhlo::PowOp>,
-                   PointwiseToLinalgConverter<lmhlo::RealOp>,
-                   PointwiseToLinalgConverter<lmhlo::RemOp>,
-                   PointwiseToLinalgConverter<lmhlo::RsqrtOp>,
-                   PointwiseToLinalgConverter<lmhlo::SelectOp>,
-                   PointwiseToLinalgConverter<lmhlo::ShiftLeftOp>,
-                   PointwiseToLinalgConverter<lmhlo::ShiftRightArithmeticOp>,
-                   PointwiseToLinalgConverter<lmhlo::ShiftRightLogicalOp>,
-                   PointwiseToLinalgConverter<lmhlo::SignOp>,
-                   PointwiseToLinalgConverter<lmhlo::SinOp>,
-                   PointwiseToLinalgConverter<lmhlo::SqrtOp>,
-                   PointwiseToLinalgConverter<lmhlo::SubOp>,
-                   PointwiseToLinalgConverter<lmhlo::TanhOp>,
-                   PointwiseToLinalgConverter<lmhlo::XorOp>,
-                   ReduceConverter,
-                   ReshapeOpConverter<lmhlo::ReshapeOp>,
-                   ReverseConverter<lmhlo::ReverseOp>,
-                   ScalarPointwiseToStandardConverter<lmhlo::AddOp>,
-                   ScalarPointwiseToStandardConverter<lmhlo::MaxOp>,
-                   SliceConverter<lmhlo::SliceOp>,
-                   TransposeConverter<lmhlo::TransposeOp>
-                  >(typeConverter, context);
-  // clang-format on
-}
-
 struct ComputeReshapeShapeConversion
     : public OpConversionPattern<mhlo::ComputeReshapeShapeOp> {
   using OpConversionPattern<mhlo::ComputeReshapeShapeOp>::OpConversionPattern;
@@ -3057,50 +2587,6 @@ struct CstrReshapableConversion
   }
 };
 
-// Converts LHLO ops to Linalg generic.
-// Sample result for lmhlo::AddOp.
-//
-// "lmhlo.add"(%arg1, %arg2, %out) :
-//      (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
-//
-// will be converted to
-//
-// #map0 = (d0, d1) -> (d0, d1)
-// "linalg.generic"(%arg1, %arg2, %out) ( {
-//   ^bb0(%arg4: f32, %arg5: f32):
-//     %0 = arith.addf %arg4, %arg5 : f32
-//     "linalg.yield"(%0) : (f32) -> ()
-// }) {
-//     indexing_maps = [#map0, #map0, #map0],
-//     iterator_types = ["parallel", "parallel"],
-// } : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
-struct LhloLegalizeToLinalgPass
-    : public lmhlo::LhloLegalizeToLinalgPassBase<LhloLegalizeToLinalgPass> {
-  void getDependentDialects(DialectRegistry& registry) const override {
-    registry
-        .insert<AffineDialect, complex::ComplexDialect, linalg::LinalgDialect,
-                math::MathDialect, memref::MemRefDialect>();
-  }
-
-  void runOnFunction() override {
-    OwningRewritePatternList patterns(&getContext());
-    ConversionTarget target(getContext());
-    target.addLegalDialect<arith::ArithmeticDialect, complex::ComplexDialect,
-                           linalg::LinalgDialect, math::MathDialect,
-                           memref::MemRefDialect, StandardOpsDialect,
-                           AffineDialect>();
-    target.addLegalOp<UnrealizedConversionCastOp>();
-
-    mhlo::RemoveSignTypeConverter type_converter;
-    auto func = getFunction();
-    populateLHLOToLinalgConversionPattern(func.getContext(), type_converter,
-                                          &patterns);
-    if (failed(applyPartialConversion(func, target, std::move(patterns)))) {
-      signalPassFailure();
-    }
-  }
-};
-
 struct HloLegalizeToLinalgPass
     : public mhlo::HloLegalizeToLinalgPassBase<HloLegalizeToLinalgPass> {
   void getDependentDialects(DialectRegistry& registry) const override {
@@ -3131,12 +2617,6 @@ struct HloLegalizeToLinalgPass
 
 }  // namespace
 
-namespace lmhlo {
-std::unique_ptr<OperationPass<FuncOp>> createLegalizeLhloToLinalgPass() {
-  return std::make_unique<LhloLegalizeToLinalgPass>();
-}
-}  // namespace lmhlo
-
 namespace mhlo {
 
 void populateHLOToLinalgConversionPattern(MLIRContext* context,
@@ -3144,60 +2624,60 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context,
                                           OwningRewritePatternList* patterns) {
   // clang-format off
   patterns->insert<
-      BroadcastConverter<mhlo::BroadcastOp, false>, ConcatenateConverter,
+      BroadcastConverter<mhlo::BroadcastOp>, ConcatenateConverter,
       ConstConverterTensor, HloDynamicBroadcastInDimConverter,
-      HloBroadcastInDimConverter, IotaConverter<mhlo::IotaOp, false>,
+      HloBroadcastInDimConverter, IotaConverter<mhlo::IotaOp>,
       EinsumToLinalgConverter,
-      IotaConverter<mhlo::DynamicIotaOp, false>,
-      PointwiseToLinalgConverter<mhlo::AbsOp, false>,
-      PointwiseToLinalgConverter<mhlo::AddOp, false>,
-      PointwiseToLinalgConverter<mhlo::AndOp, false>,
-      PointwiseToLinalgConverter<mhlo::Atan2Op, false>,
-      PointwiseToLinalgConverter<mhlo::BitcastConvertOp, false>,
-      PointwiseToLinalgConverter<mhlo::CeilOp, false>,
-      PointwiseToLinalgConverter<mhlo::ClampOp, false>,
-      PointwiseToLinalgConverter<mhlo::CompareOp, false>,
-      PointwiseToLinalgConverter<mhlo::ComplexOp, false>,
-      PointwiseToLinalgConverter<mhlo::ConvertOp, false>,
-      PointwiseToLinalgConverter<mhlo::CopyOp, false>,
-      PointwiseToLinalgConverter<mhlo::CosOp, false>,
-      PointwiseToLinalgConverter<mhlo::DivOp, false>,
-      PointwiseToLinalgConverter<mhlo::ExpOp, false>,
-      PointwiseToLinalgConverter<mhlo::Expm1Op, false>,
-      PointwiseToLinalgConverter<mhlo::FloorOp, false>,
-      PointwiseToLinalgConverter<mhlo::ImagOp, false>,
-      PointwiseToLinalgConverter<mhlo::IsFiniteOp, false>,
-      PointwiseToLinalgConverter<mhlo::LogOp, false>,
-      PointwiseToLinalgConverter<mhlo::LogisticOp, false>,
-      PointwiseToLinalgConverter<mhlo::Log1pOp, false>,
-      PointwiseToLinalgConverter<mhlo::MaxOp, false>,
-      PointwiseToLinalgConverter<mhlo::MinOp, false>,
-      PointwiseToLinalgConverter<mhlo::MulOp, false>,
-      PointwiseToLinalgConverter<mhlo::NegOp, false>,
-      PointwiseToLinalgConverter<mhlo::NotOp, false>,
-      PointwiseToLinalgConverter<mhlo::OrOp, false>,
-      PointwiseToLinalgConverter<mhlo::PowOp, false>,
-      PointwiseToLinalgConverter<mhlo::RealOp, false>,
-      PointwiseToLinalgConverter<mhlo::RemOp, false>,
-      PointwiseToLinalgConverter<mhlo::RsqrtOp, false>,
-      PointwiseToLinalgConverter<mhlo::SelectOp, false>,
-      PointwiseToLinalgConverter<mhlo::ShiftLeftOp, false>,
-      PointwiseToLinalgConverter<mhlo::ShiftRightArithmeticOp, false>,
-      PointwiseToLinalgConverter<mhlo::ShiftRightLogicalOp, false>,
-      PointwiseToLinalgConverter<mhlo::SignOp, false>,
-      PointwiseToLinalgConverter<mhlo::SinOp, false>,
-      PointwiseToLinalgConverter<mhlo::SqrtOp, false>,
-      PointwiseToLinalgConverter<mhlo::SubOp, false>,
-      PointwiseToLinalgConverter<mhlo::TanhOp, false>,
-      PointwiseToLinalgConverter<mhlo::XorOp, false>,
-      ReshapeOpConverter<mhlo::ReshapeOp, false>,
-      ReverseConverter<mhlo::ReverseOp, false>,
-      SliceConverter<mhlo::SliceOp, false>,
+      IotaConverter<mhlo::DynamicIotaOp>,
+      PointwiseToLinalgConverter<mhlo::AbsOp>,
+      PointwiseToLinalgConverter<mhlo::AddOp>,
+      PointwiseToLinalgConverter<mhlo::AndOp>,
+      PointwiseToLinalgConverter<mhlo::Atan2Op>,
+      PointwiseToLinalgConverter<mhlo::BitcastConvertOp>,
+      PointwiseToLinalgConverter<mhlo::CeilOp>,
+      PointwiseToLinalgConverter<mhlo::ClampOp>,
+      PointwiseToLinalgConverter<mhlo::CompareOp>,
+      PointwiseToLinalgConverter<mhlo::ComplexOp>,
+      PointwiseToLinalgConverter<mhlo::ConvertOp>,
+      PointwiseToLinalgConverter<mhlo::CopyOp>,
+      PointwiseToLinalgConverter<mhlo::CosOp>,
+      PointwiseToLinalgConverter<mhlo::DivOp>,
+      PointwiseToLinalgConverter<mhlo::ExpOp>,
+      PointwiseToLinalgConverter<mhlo::Expm1Op>,
+      PointwiseToLinalgConverter<mhlo::FloorOp>,
+      PointwiseToLinalgConverter<mhlo::ImagOp>,
+      PointwiseToLinalgConverter<mhlo::IsFiniteOp>,
+      PointwiseToLinalgConverter<mhlo::LogOp>,
+      PointwiseToLinalgConverter<mhlo::LogisticOp>,
+      PointwiseToLinalgConverter<mhlo::Log1pOp>,
+      PointwiseToLinalgConverter<mhlo::MaxOp>,
+      PointwiseToLinalgConverter<mhlo::MinOp>,
+      PointwiseToLinalgConverter<mhlo::MulOp>,
+      PointwiseToLinalgConverter<mhlo::NegOp>,
+      PointwiseToLinalgConverter<mhlo::NotOp>,
+      PointwiseToLinalgConverter<mhlo::OrOp>,
+      PointwiseToLinalgConverter<mhlo::PowOp>,
+      PointwiseToLinalgConverter<mhlo::RealOp>,
+      PointwiseToLinalgConverter<mhlo::RemOp>,
+      PointwiseToLinalgConverter<mhlo::RsqrtOp>,
+      PointwiseToLinalgConverter<mhlo::SelectOp>,
+      PointwiseToLinalgConverter<mhlo::ShiftLeftOp>,
+      PointwiseToLinalgConverter<mhlo::ShiftRightArithmeticOp>,
+      PointwiseToLinalgConverter<mhlo::ShiftRightLogicalOp>,
+      PointwiseToLinalgConverter<mhlo::SignOp>,
+      PointwiseToLinalgConverter<mhlo::SinOp>,
+      PointwiseToLinalgConverter<mhlo::SqrtOp>,
+      PointwiseToLinalgConverter<mhlo::SubOp>,
+      PointwiseToLinalgConverter<mhlo::TanhOp>,
+      PointwiseToLinalgConverter<mhlo::XorOp>,
+      ReshapeOpConverter,
+      ReverseConverter<mhlo::ReverseOp>,
+      SliceConverter<mhlo::SliceOp>,
       ComputeReshapeShapeConversion,
       CstrReshapableConversion,
       DynamicSliceConverter,
       DynamicUpdateSliceConverter,
-      TransposeConverter<mhlo::TransposeOp, false>,
+      TransposeConverter<mhlo::TransposeOp>,
       DotOpOnTensorsConversion<DotOperationType::kMatrixMatrix,
                                linalg::MatmulOp>,
       DotOpOnTensorsConversion<DotOperationType::kMatrixVector,
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_roots_to_loops.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_roots_to_loops.cc
index c8fb5d9ae2d735..94a6dddc7ecbaf 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_roots_to_loops.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_roots_to_loops.cc
@@ -66,7 +66,7 @@ LogicalResult elemwiseLowerHelper(
         loc, &b, operand_memref, multidim_index, b.saveInsertionPoint());
     operand_values.push_back(operand_data);
   }
-  auto res = HloOpToStdScalarOp::map<LHLO_OpTy>(
+  auto res = LhloOpToStdScalarOp::map<LHLO_OpTy>(
       llvm::cast<LHLO_OpTy>(op),
       result_memref.getType().cast<MemRefType>().getElementType(),
       operand_values, &b);
diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc
index ab2865cc5814e4..ab20177dcc5bcb 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/lhlo_legalize_to_affine.cc
@@ -81,7 +81,7 @@ struct DotOpConverter : public OpRewritePattern<DotOp> {
       auto r = builder.create<AffineLoadOp>(loc, rhs, rhs_indices);
       auto result =
           rewriter.create<AffineLoadOp>(loc, op.output(), result_indices);
-      Value op_result = lmhlo::HloOpToStdScalarOp::map<DotOp>(
+      Value op_result = lmhlo::LhloOpToStdScalarOp::map<DotOp>(
           op, element_type, {l, r, result}, &builder);
       map_status = success(op_result != nullptr);
       if (failed(map_status)) return;
@@ -483,7 +483,7 @@ struct BinaryOpConverter : public OpRewritePattern<LhloOpTy> {
                             ValueRange induction_vars) {
       auto l = builder.create<AffineLoadOp>(loc, lhs, induction_vars);
       auto r = builder.create<AffineLoadOp>(loc, rhs, induction_vars);
-      Value op_result = lmhlo::HloOpToStdScalarOp::map<LhloOpTy>(
+      Value op_result = lmhlo::LhloOpToStdScalarOp::map<LhloOpTy>(
           op, element_type, {l, r}, &builder);
       map_status = success(op_result != nullptr);
       if (failed(map_status)) return;
@@ -517,7 +517,7 @@ struct UnaryOpConverter : public OpRewritePattern<LhloOpTy> {
                             ValueRange induction_vars) {
       Value loadInput =
           builder.create<AffineLoadOp>(loc, input, induction_vars);
-      Value opResult = lmhlo::HloOpToStdScalarOp::map<LhloOpTy>(
+      Value opResult = lmhlo::LhloOpToStdScalarOp::map<LhloOpTy>(
           op, elementType, {loadInput}, &builder);
       map_status = success(opResult != nullptr);
       if (failed(map_status)) return;
diff --git a/tensorflow/compiler/mlir/hlo/lib/Transforms/reshape_simplifier.cc b/tensorflow/compiler/mlir/hlo/lib/Transforms/reshape_simplifier.cc
index 34bbed04d9bc4e..373e389ec0ae1b 100644
--- a/tensorflow/compiler/mlir/hlo/lib/Transforms/reshape_simplifier.cc
+++ b/tensorflow/compiler/mlir/hlo/lib/Transforms/reshape_simplifier.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <algorithm>
 
 #include "mlir-hlo/Analysis/shape_component_analysis.h"
+#include "mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
 #include "mlir-hlo/Transforms/PassDetail.h"
 #include "mlir-hlo/Transforms/passes.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
@@ -27,6 +28,10 @@ limitations under the License.
 
 namespace mlir {
 
+using ShapeOrValueInfo = ShapeComponentAnalysis::ShapeOrValueInfo;
+using Symbol = ShapeComponentAnalysis::Symbol;
+using SymbolicExpr = ShapeComponentAnalysis::SymbolicExpr;
+
 namespace {
 
 // Returns true if `reshape` only adds `1` dimensions.
@@ -99,6 +104,46 @@ struct RemoveComputeReshapeShape final
   }
 };
 
+bool IsSimpleProduct(
+    AffineExpr expr,
+    llvm::function_ref<void(AffineConstantExpr)> cbkConstantFactor,
+    llvm::function_ref<void(AffineSymbolExpr)> cbkSymbolicFactor) {
+  auto binExpr = expr.dyn_cast<AffineBinaryOpExpr>();
+  if (binExpr && binExpr.getKind() == AffineExprKind::Mul) {
+    return IsSimpleProduct(binExpr.getLHS(), cbkConstantFactor,
+                           cbkSymbolicFactor) &&
+           IsSimpleProduct(binExpr.getRHS(), cbkConstantFactor,
+                           cbkSymbolicFactor);
+  }
+  if (auto symExpr = expr.dyn_cast<AffineSymbolExpr>()) {
+    cbkSymbolicFactor(symExpr);
+    return true;
+  }
+  if (auto constExpr = expr.dyn_cast<AffineConstantExpr>()) {
+    cbkConstantFactor(constExpr);
+    return true;
+  }
+  return false;
+}
+
+bool IsSimpleProduct(const SymbolicExpr &symbolicExpr,
+                     llvm::function_ref<void(int64_t)> cbkConstantFactor,
+                     llvm::function_ref<void(Symbol)> cbkSymbolicFactor) {
+  return IsSimpleProduct(
+      symbolicExpr.expr,
+      [&](AffineConstantExpr cexpr) { cbkConstantFactor(cexpr.getValue()); },
+      [&](AffineSymbolExpr sexpr) {
+        cbkSymbolicFactor(symbolicExpr.symbols[sexpr.getPosition()]);
+      });
+}
+
+bool IsSimpleProduct(const SymbolicExpr &symbolicExpr, int64_t *concreteProduct,
+                     SmallVectorImpl<Symbol> *symbolicFactors) {
+  return IsSimpleProduct(
+      symbolicExpr, [&](int64_t c) { *concreteProduct *= c; },
+      [&](Symbol s) { symbolicFactors->push_back(s); });
+}
+
 struct RemoveRedundantCstrReshapable final
     : public OpRewritePattern<mhlo::CstrReshapableOp> {
   RemoveRedundantCstrReshapable(MLIRContext *ctx) : OpRewritePattern(ctx) {}
@@ -132,11 +177,10 @@ struct RemoveRedundantCstrReshapable final
 
     // We can only handle simple products with constants and symbols. Find all
     // the factors based on the number of elements.
-    SmallVector<AffineSymbolExpr> remainingSymbolicFactorsNumElems;
     int64_t concreteProductNumElems = 1;
-    if (!IsSimpleProduct(numElements.expr, &concreteProductNumElems,
-                         &remainingSymbolicFactorsNumElems,
-                         /*ignore_negative=*/false)) {
+    SmallVector<Symbol> remainingSymbolicFactorsNumElems;
+    if (!IsSimpleProduct(numElements, &concreteProductNumElems,
+                         &remainingSymbolicFactorsNumElems)) {
       return failure();
     }
     assert(concreteProductNumElems >= 1 &&
@@ -149,29 +193,20 @@ struct RemoveRedundantCstrReshapable final
     //     factor, i.e. if the symbolic factors based on the dynamic shape are
     //     not a subset of the factors based on the number of elements.
     int64_t concreteProductDynShape = 1;
-    for (auto d : *dynShapeDims) {
-      SmallVector<AffineSymbolExpr> partialSymbolicFactorsDynShape;
-      if (!IsSimpleProduct(d.expr, &concreteProductDynShape,
-                           &partialSymbolicFactorsDynShape,
-                           /*ignore_negative=*/true)) {
+    for (auto dim : *dynShapeDims) {
+      SmallVector<Symbol> partialSymbolicFactorsDynShape;
+      if (!IsSimpleProduct(
+              dim,
+              [&](int64_t c) {
+                if (c != -1) concreteProductDynShape *= c;
+              },
+              [&](Symbol s) { partialSymbolicFactorsDynShape.push_back(s); })) {
         return failure();
       }
-      for (const AffineSymbolExpr &symExpr : partialSymbolicFactorsDynShape) {
-        auto symDynShape = d.symbols[symExpr.getPosition()];
-        bool isFactorInBothProducts = false;
-        for (int i = 0; i < remainingSymbolicFactorsNumElems.size(); ++i) {
-          auto symNumElements =
-              numElements
-                  .symbols[remainingSymbolicFactorsNumElems[i].getPosition()];
-          if (symDynShape == symNumElements) {
-            remainingSymbolicFactorsNumElems[i] =
-                remainingSymbolicFactorsNumElems.back();
-            remainingSymbolicFactorsNumElems.pop_back();
-            isFactorInBothProducts = true;
-            break;
-          }
-        }
-        if (!isFactorInBothProducts) return failure();
+      for (const Symbol &symDynShape : partialSymbolicFactorsDynShape) {
+        auto it = llvm::find(remainingSymbolicFactorsNumElems, symDynShape);
+        if (it == remainingSymbolicFactorsNumElems.end()) return failure();
+        remainingSymbolicFactorsNumElems.erase(it);
       }
     }
     assert(concreteProductDynShape >= 1 &&
@@ -192,27 +227,87 @@ struct RemoveRedundantCstrReshapable final
     rewriter.replaceOpWithNewOp<shape::ConstWitnessOp>(op, isReshapable);
     return success();
   }
-  bool IsSimpleProduct(AffineExpr expr, int64_t *concreteProduct,
-                       SmallVectorImpl<AffineSymbolExpr> *symbolicFactors,
-                       bool ignore_minus_one) const {
-    auto binExpr = expr.dyn_cast<AffineBinaryOpExpr>();
-    if (binExpr && binExpr.getKind() == AffineExprKind::Mul) {
-      return IsSimpleProduct(binExpr.getLHS(), concreteProduct, symbolicFactors,
-                             ignore_minus_one) &&
-             IsSimpleProduct(binExpr.getRHS(), concreteProduct, symbolicFactors,
-                             ignore_minus_one);
-    }
-    if (auto symExpr = expr.dyn_cast<AffineSymbolExpr>()) {
-      symbolicFactors->push_back(symExpr);
-      return true;
-    }
-    if (auto constExpr = expr.dyn_cast<AffineConstantExpr>()) {
-      int64_t c = constExpr.getValue();
-      if (c == -1 && ignore_minus_one) return true;
-      *concreteProduct *= c;
-      return true;
+};
+
+struct TurnDynamicReshapeIntoCollapseShape final
+    : public OpRewritePattern<mhlo::DynamicReshapeOp> {
+  TurnDynamicReshapeIntoCollapseShape(MLIRContext *ctx)
+      : OpRewritePattern(ctx) {}
+  LogicalResult matchAndRewrite(mhlo::DynamicReshapeOp op,
+                                PatternRewriter &rewriter) const override {
+    // Require sucessful shape analysis for operand and shape.
+    ShapeComponentAnalysis shapeComponentAnalysis;
+    auto argShapeInfo = shapeComponentAnalysis.GetShapeInfo(op.operand());
+    if (!argShapeInfo) return failure();
+    auto shapeInfo = shapeComponentAnalysis.GetValueInfo(op.output_shape());
+    if (!shapeInfo) return failure();
+
+    // The next dimension of the operand shape to look at.
+    int i = 0;
+
+    // For each dimension of the target shape, consume the matching dimensions
+    // of the operand shape and build the reassociation map on the fly.
+    SmallVector<ReassociationIndices> reassociation_map;
+    for (const auto &shapeDim : *shapeInfo) {
+      reassociation_map.push_back({});
+
+      // Find the concrete/symbolic factors for the current dimension of the
+      // target shape.
+      int64_t remainingConcreteProductShapeDim = 1;
+      SmallVector<Symbol> remainingSymbolicFactorsShapeDim;
+      if (!IsSimpleProduct(shapeDim, &remainingConcreteProductShapeDim,
+                           &remainingSymbolicFactorsShapeDim)) {
+        return failure();
+      }
+
+      // Consume (and collapse) as many of the operand dimensions as needed to
+      // match the target dimension. This is monotonic.
+      while (remainingConcreteProductShapeDim != 1 ||
+             !remainingSymbolicFactorsShapeDim.empty()) {
+        // Fail if there are no more operand dimensions to consume.
+        if (i >= argShapeInfo->size()) return failure();
+
+        // Find the concrete/symbolic factors for the next dimension of the
+        // operand shape.
+        int64_t concreteProductArgShapeDim = 1;
+        SmallVector<Symbol> symbolicFactorsArgShapeDim;
+        if (!IsSimpleProduct((*argShapeInfo)[i], &concreteProductArgShapeDim,
+                             &symbolicFactorsArgShapeDim)) {
+          return failure();
+        }
+
+        // Eliminate the common concrete factors. Fail if we cannot consume a
+        // concrete factor of the operand shape.
+        if (remainingConcreteProductShapeDim % concreteProductArgShapeDim != 0)
+          return failure();
+        remainingConcreteProductShapeDim /= concreteProductArgShapeDim;
+
+        // Eliminate the common symbolic factors. Fail if we cannot consume a
+        // symbolic factor of the operand shape.
+        for (const Symbol &symArgShapeDim : symbolicFactorsArgShapeDim) {
+          auto it =
+              llvm::find(remainingSymbolicFactorsShapeDim, symArgShapeDim);
+          if (it == remainingSymbolicFactorsShapeDim.end()) return failure();
+          remainingSymbolicFactorsShapeDim.erase(it);
+        }
+
+        // If all the concrete/symbolic factors were consumable, collapse this
+        // dimension (and continue if needed).
+        reassociation_map.back().push_back(i++);
+      }
+
+      // Consume trailing 1 dimensions.
+      while (i < argShapeInfo->size() && (*argShapeInfo)[i].isConstant(1))
+        reassociation_map.back().push_back(i++);
     }
-    return false;
+
+    // Fail if not all of the operand shape could be consumed.
+    if (i < argShapeInfo->size()) return failure();
+
+    // Replace reshape op with its equivalent collapse shape op.
+    rewriter.replaceOpWithNewOp<linalg::TensorCollapseShapeOp>(
+        op, op.operand(), reassociation_map);
+    return success();
   }
 };
 
@@ -236,7 +331,8 @@ void ReshapeSimplifierPass::runOnFunction() {
   patterns.insert<
       ReshapeToExpandShape,
       RemoveComputeReshapeShape,
-      RemoveRedundantCstrReshapable>(ctx);
+      RemoveRedundantCstrReshapable,
+      TurnDynamicReshapeIntoCollapseShape>(ctx);
   // clang-format on
   shape::AssumingOp::getCanonicalizationPatterns(patterns, ctx);
 
diff --git a/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir b/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir
index aac6d089916740..2a406c6f1908af 100644
--- a/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/canonicalize.mlir
@@ -814,6 +814,15 @@ func @dont_fold_compare_same_eq_float(%arg0: tensor<f16>) -> tensor<i1> {
   return %0 : tensor<i1>
 }
 
+// Address NaN != NaN for complex types.
+// CHECK-LABEL: dont_fold_compare_same_eq_complex
+func @dont_fold_compare_same_eq_complex(%arg0: tensor<complex<f32>>) -> tensor<i1> {
+  // CHECK: %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} : (tensor<complex<f32>>, tensor<complex<f32>>) -> tensor<i1>
+  %0 = "mhlo.compare"(%arg0, %arg0) {comparison_direction = "EQ"} : (tensor<complex<f32>>, tensor<complex<f32>>) -> tensor<i1>
+  return %0 : tensor<i1>
+}
+
+
 // CHECK-LABEL: fold_compare_false_eq
 func @fold_compare_false_eq() -> tensor<i1> {
   %0 = mhlo.constant dense<0> : tensor<i32>
diff --git a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir
index f85397439f8a53..7491f9d1614642 100644
--- a/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/hlo-legalize-to-linalg.mlir
@@ -706,7 +706,7 @@ func @reshape_3D_2D(%arg0: tensor<12x1x42xi32>) -> tensor<12x42xi32> {
   %0 = "mhlo.reshape"(%arg0) : (tensor<12x1x42xi32>) -> tensor<12x42xi32>
   return %0 : tensor<12x42xi32>
 }
-// CHECK: linalg.tensor_collapse_shape %{{.*}} {{\[}}[0, 1], [2]]
+// CHECK: linalg.tensor_collapse_shape %{{.*}} {{\[}}[0], [1, 2]]
 
 // -----
 
@@ -724,7 +724,7 @@ func @reshape_2D_4D(%arg0: tensor<12x42xi32>) -> tensor<12x1x42x1xi32> {
   %0 = "mhlo.reshape"(%arg0) : (tensor<12x42xi32>) -> tensor<12x1x42x1xi32>
   return %0 : tensor<12x1x42x1xi32>
 }
-// CHECK: linalg.tensor_expand_shape %{{.*}} {{\[}}[0, 1], [2, 3]]
+// CHECK: linalg.tensor_expand_shape %{{.*}} {{\[}}[0], [1, 2, 3]]
 
 // -----
 
diff --git a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir
deleted file mode 100644
index 87b57da49951cb..00000000000000
--- a/tensorflow/compiler/mlir/hlo/tests/lhlo-legalize-to-linalg.mlir
+++ /dev/null
@@ -1,1263 +0,0 @@
-// RUN: mlir-hlo-opt %s -lhlo-legalize-to-linalg -split-input-file | FILECHECK_OPTS="" FileCheck %s
-
-// CHECK: #map = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-LABEL: func @element_wise
-func @element_wise(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>,
-                   %result: memref<2x2xf32>) {
-  "lmhlo.power"(%lhs, %rhs, %result)
-      : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: f32, %[[RHS_IN:.*]]: f32, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.powf %[[LHS_IN]], %[[RHS_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK: #map = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-LABEL: func @element_wise
-func @element_wise(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>,
-                   %result: memref<2x2xf32>) {
-  "lmhlo.add"(%lhs, %rhs, %result)
-      : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: f32, %[[RHS_IN:.*]]: f32, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.addf %[[LHS_IN]], %[[RHS_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @element_wise_with_dynamic_shape
-func @element_wise_with_dynamic_shape(%lhs: memref<?x?xf32>,
-                                       %rhs: memref<?x?xf32>,
-                                       %result: memref<?x?xf32>) {
-  "lmhlo.add"(%lhs, %rhs, %result)
-      : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: f32, %[[RHS_IN:.*]]: f32, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.addf %[[LHS_IN]], %[[RHS_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @element_wise_scalar
-func @element_wise_scalar(%lhs: memref<f32>, %rhs: memref<f32>,
-                          %result: memref<f32>) {
-  "lmhlo.add"(%lhs, %rhs, %result)
-      : (memref<f32>, memref<f32>, memref<f32>) -> ()
-  return
-}
-// CHECK: %[[LHS:.*]] = memref.load
-// CHECK: %[[RHS:.*]] = memref.load
-// CHECK: %[[RES:.*]] = arith.addf %[[LHS]], %[[RHS]]
-// CHECK: memref.store %[[RES]]
-// CHECK-NEXT: return
-
-// -----
-
-// CHECK-LABEL: func @minf
-func @minf(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>,
-           %result: memref<2x2xf32>) {
-  "lmhlo.minimum"(%lhs, %rhs, %result)
-      : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: f32, %[[RHS_IN:.*]]: f32, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.minf %[[LHS_IN]], %[[RHS_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @maxi
-func @maxi(%lhs: memref<2x2xi32>, %rhs: memref<2x2xi32>,
-           %result: memref<2x2xi32>) {
-  "lmhlo.maximum"(%lhs, %rhs, %result)
-      : (memref<2x2xi32>, memref<2x2xi32>, memref<2x2xi32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: i32, %[[RHS_IN:.*]]: i32, %[[RESULT_OUT:.*]]: i32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.maxsi %[[LHS_IN]], %[[RHS_IN]] : i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i32
-
-// -----
-
-// CHECK-LABEL: func @maxu
-func @maxu(%lhs: memref<2x2xui32>, %rhs: memref<2x2xui32>,
-           %result: memref<2x2xui32>) {
-  "lmhlo.maximum"(%lhs, %rhs, %result)
-      : (memref<2x2xui32>, memref<2x2xui32>, memref<2x2xui32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: i32, %[[RHS_IN:.*]]: i32, %[[RESULT_OUT:.*]]: i32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.maxui %[[LHS_IN]], %[[RHS_IN]] : i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i32
-
-// -----
-
-// CHECK-LABEL: func @and
-func @and(%lhs: memref<2x2xi32>, %rhs: memref<2x2xi32>,
-          %result: memref<2x2xi32>) {
-  "lmhlo.and"(%lhs, %rhs, %result)
-      : (memref<2x2xi32>, memref<2x2xi32>, memref<2x2xi32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: i32, %[[RHS_IN:.*]]: i32, %[[RESULT_OUT:.*]]: i32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.andi %[[LHS_IN]], %[[RHS_IN]] : i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i32
-
-// -----
-
-// CHECK-LABEL: func @exp
-func @exp(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.exponential"(%input, %result)
-      : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.exp %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @complex_exp
-func @complex_exp(%input: memref<2x2xcomplex<f32>>,
-                  %result: memref<2x2xcomplex<f32>>) {
-  "lmhlo.exponential"(%input, %result)
-      : (memref<2x2xcomplex<f32>>, memref<2x2xcomplex<f32>>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: complex<f32>, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = complex.exp %[[OPERAND_IN]] : complex<f32>
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : complex<f32>
-
-// -----
-
-// CHECK-LABEL: func @log
-func @log(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.log"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.log %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @complex_log
-func @complex_log(%input: memref<2x2xcomplex<f32>>,
-                  %result: memref<2x2xcomplex<f32>>) {
-  "lmhlo.log"(%input, %result) : (memref<2x2xcomplex<f32>>,
-                                  memref<2x2xcomplex<f32>>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: complex<f32>, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = complex.log %[[OPERAND_IN]] : complex<f32>
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : complex<f32>
-
-// -----
-
-// CHECK-LABEL: func @log1p
-func @log1p(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.log_plus_one"(%input, %result) : (memref<2x2xf32>,
-                                           memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.log1p %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @complex_log1p
-func @complex_log1p(%input: memref<2x2xcomplex<f32>>,
-                    %result: memref<2x2xcomplex<f32>>) {
-  "lmhlo.log_plus_one"(%input, %result) : (memref<2x2xcomplex<f32>>,
-                                           memref<2x2xcomplex<f32>>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: complex<f32>, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = complex.log1p %[[OPERAND_IN]] : complex<f32>
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : complex<f32>
-
-// -----
-
-// CHECK-LABEL: func @copy
-func @copy(%in: memref<2x4x8xf32>, %out: memref<2x4x8xf32>) {
-  "lmhlo.copy"(%in, %out) : (memref<2x4x8xf32>, memref<2x4x8xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   linalg.yield %[[OPERAND_IN]] : f32
-
-// -----
-
-// CHECK-LABEL: func @is_finite
-func @is_finite(%input: memref<2x2xf32>, %result: memref<2x2xi1>) {
-  "lmhlo.is_finite"(%input, %result) : (memref<2x2xf32>, memref<2x2xi1>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[POS_INF:.+]] = arith.constant 0x7F800000 : f32
-// CHECK-NEXT:   %[[ABS_X:.+]] = math.abs %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   %[[RESULT:.+]] = arith.cmpf one, %[[ABS_X]], %[[POS_INF]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i1
-
-// -----
-
-// CHECK-LABEL: func @float_cmp
-func @float_cmp(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>,
-                %result: memref<2x2xi1>) {
-  "lmhlo.compare"(%lhs, %rhs, %result) {comparison_direction = "EQ"}
-      : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xi1>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: f32, %[[RHS_IN:.*]]: f32, %[[RESULT_OUT:.*]]: i1):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.cmpf oeq, %[[LHS_IN]], %[[RHS_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i1
-
-// -----
-
-// CHECK-LABEL: func @int_cmp
-func @int_cmp(%lhs: memref<2x2xi32>, %rhs: memref<2x2xi32>,
-              %result: memref<2x2xi1>) {
-  "lmhlo.compare"(%lhs, %rhs, %result) {comparison_direction = "LT"}
-      : (memref<2x2xi32>, memref<2x2xi32>, memref<2x2xi1>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: i32, %[[RHS_IN:.*]]: i32, %[[RESULT_OUT:.*]]: i1):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.cmpi slt, %[[LHS_IN]], %[[RHS_IN]] : i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i1
-
-// -----
-
-// CHECK-LABEL: func @complex_cmp_eq
-func @complex_cmp_eq(%lhs: memref<2xcomplex<f32>>, %rhs: memref<2xcomplex<f32>>,
-                     %result: memref<2xi1>) {
-  "lmhlo.compare"(%lhs, %rhs, %result) {comparison_direction = "EQ"}
-      : (memref<2xcomplex<f32>>, memref<2xcomplex<f32>>, memref<2xi1>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: complex<f32>, %[[RHS_IN:.*]]: complex<f32>, %[[RESULT_OUT:.*]]: i1):
-// CHECK-NEXT:   %[[RESULT:.*]] = complex.eq %[[LHS_IN]], %[[RHS_IN]] : complex<f32>
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i1
-
-// -----
-
-// CHECK-LABEL: func @complex_cmp_neq
-func @complex_cmp_neq(%lhs: memref<2xcomplex<f64>>, %rhs: memref<2xcomplex<f64>>,
-                      %result: memref<2xi1>) {
-  "lmhlo.compare"(%lhs, %rhs, %result) {comparison_direction = "NE"}
-      : (memref<2xcomplex<f64>>, memref<2xcomplex<f64>>, memref<2xi1>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: complex<f64>, %[[RHS_IN:.*]]: complex<f64>, %[[RESULT_OUT:.*]]: i1):
-// CHECK-NEXT:   %[[RESULT:.*]] = complex.neq %[[LHS_IN]], %[[RHS_IN]] : complex<f64>
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i1
-
-// -----
-
-// CHECK-LABEL: func @complex_divide
-func @complex_divide(%lhs: memref<2xcomplex<f64>>, %rhs: memref<2xcomplex<f64>>,
-                     %result: memref<2xcomplex<f64>>) {
-  "lmhlo.divide"(%lhs, %rhs, %result)
-      : (memref<2xcomplex<f64>>, memref<2xcomplex<f64>>, memref<2xcomplex<f64>>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: complex<f64>, %[[RHS_IN:.*]]: complex<f64>, %[[RESULT_OUT:.*]]: complex<f64>):
-// CHECK-NEXT:   %[[RESULT:.*]] = complex.div %[[LHS_IN]], %[[RHS_IN]] : complex<f64>
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : complex<f64>
-
-// -----
-
-// CHECK-LABEL: func @complex_multiply
-func @complex_multiply(%lhs: memref<2xcomplex<f64>>, %rhs: memref<2xcomplex<f64>>,
-                       %result: memref<2xcomplex<f64>>) {
-  "lmhlo.multiply"(%lhs, %rhs, %result)
-      : (memref<2xcomplex<f64>>, memref<2xcomplex<f64>>, memref<2xcomplex<f64>>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: complex<f64>, %[[RHS_IN:.*]]: complex<f64>, %[[RESULT_OUT:.*]]: complex<f64>):
-// CHECK-NEXT:   %[[RESULT:.*]] = complex.mul %[[LHS_IN]], %[[RHS_IN]] : complex<f64>
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : complex<f64>
-
-// -----
-
-// CHECK-LABEL: func @select
-func @select(%pred: memref<2x2xi1>, %lhs: memref<2x2xf32>,
-             %rhs: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.select"(%pred, %lhs, %rhs, %result)
-    : (memref<2x2xi1>, memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[PRED_IN:.*]]: i1, %[[LHS_IN:.*]]: f32, %[[RHS_IN:.*]]: f32, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = select %[[PRED_IN]], %[[LHS_IN]], %[[RHS_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-LABEL: func @iota
-func @iota(%out: memref<7x10xf32>) {
-  "lmhlo.iota"(%out) {iota_dimension = 1 : i64} : (memref<7x10xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-SAME: indexing_maps = [#[[RESULT_MAP]]]
-// CHECK-NEXT: ^bb0(%[[RESULT:.*]]: f32):
-// CHECK-NEXT:   %[[D1:.+]] = linalg.index 1
-// CHECK-NEXT:   %[[INT_CAST:.*]] = arith.index_cast %[[D1]] : index to i32
-// CHECK-NEXT:   %[[FLOAT_CAST:.*]] = arith.sitofp %[[INT_CAST]] : i32 to f32
-// CHECK-NEXT:   linalg.yield %[[FLOAT_CAST]] : f32
-
-// -----
-
-// CHECK-DAG: #[[OPERAND_MAP:.+]] = affine_map<(d0, d1, d2) -> ()>
-// CHECK-DAG: #[[RESULT_MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-// CHECK-LABEL: func @broadcast_scalar
-func @broadcast_scalar(%operand: memref<f32>, %result: memref<4x2x1xf32>) {
-  "lmhlo.broadcast"(%operand, %result) {
-    broadcast_sizes = dense<[4, 2, 1]> : tensor<3xi64>
-  } : (memref<f32>, memref<4x2x1xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-SAME: indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]]
-// CHECK-NEXT: ^bb0(%[[OPERAND:.+]]: f32, %{{.+}}: f32):
-// CHECK-NEXT:   linalg.yield %[[OPERAND]] : f32
-
-// -----
-
-// CHECK-DAG: #[[OPERAND_MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d3, d4, d5)>
-// CHECK-DAG: #[[RESULT_MAP:.+]] = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3, d4, d5)>
-// CHECK-LABEL: func @broadcast
-func @broadcast(%operand: memref<4x?x16xf32>,
-                %result: memref<4x2x1x4x?x16xf32>) {
-  "lmhlo.broadcast"(%operand, %result) {
-    broadcast_sizes = dense<[4, 2, 1]> : tensor<3xi64>
-  } : (memref<4x?x16xf32>, memref<4x2x1x4x?x16xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-SAME: indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]]
-// CHECK-NEXT: ^bb0(%[[OPERAND:.+]]: f32, %{{.+}}: f32):
-// CHECK-NEXT:   linalg.yield %[[OPERAND]] : f32
-
-// -----
-
-// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d4, d0, d2)>
-// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)>
-// CHECK-LABEL: func @dynamic_broadcast_in_dim
-func @dynamic_broadcast_in_dim(%operand: memref<?x?x?xf32>,
-                               %result: memref<?x?x?x?x?xf32>) {
-  "lmhlo.broadcast_in_dim"(%operand, %result) {
-    broadcast_dimensions = dense<[4,0,2]> : tensor<3xi64>
-  } : (memref<?x?x?xf32>, memref<?x?x?x?x?xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-SAME: indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]]
-// CHECK-NEXT: ^bb0(%[[OPERAND:.*]]: f32, %[[RESULT:.*]]: f32):
-// CHECK-NEXT:   linalg.yield %[[OPERAND]] : f32
-
-// -----
-
-// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0)>
-// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-LABEL: func @static_broadcast_in_dim_no_expansion
-func @static_broadcast_in_dim_no_expansion(%operand: memref<5xf32>,
-                                           %result: memref<5x10xf32>) {
-  "lmhlo.broadcast_in_dim"(%operand, %result) {
-    broadcast_dimensions = dense<[0]> : tensor<1xi64>
-  } : (memref<5xf32>, memref<5x10xf32>) -> ()
-  return
-}
-// CHECK-NOT: linalg.{{.*}}shape
-// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]]
-// CHECK-NEXT: ^bb0(%[[OPERAND:.*]]: f32, %[[RESULT:.*]]: f32):
-// CHECK-NEXT:   linalg.yield %[[OPERAND]] : f32
-
-// -----
-
-// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0)>
-// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
-// CHECK-LABEL: func @static_broadcast_in_dim_expansion
-func @static_broadcast_in_dim_expansion(%operand: memref<1x5xf32>,
-                                        %result: memref<5x10x100xf32>) {
-  "lmhlo.broadcast_in_dim"(%operand, %result) {
-    broadcast_dimensions = dense<[2, 0]> : tensor<2xi64>
-  } : (memref<1x5xf32>, memref<5x10x100xf32>) -> ()
-  return
-}
-// CHECK: %[[RESHAPED_ARG:.*]] = memref.collapse_shape %{{.*}} {{\[}}[0, 1]]
-// CHECK-SAME:                   memref<1x5xf32> into memref<5xf32>
-// CHECK: linalg.generic {{{.*}}indexing_maps =
-// CHECK-SAME:       [#[[OPERAND_MAP]], #[[RESULT_MAP]]]
-// CHECK-SAME:   ins(%[[RESHAPED_ARG]] :
-// CHECK-NEXT: ^bb0(%[[OPERAND:.*]]: f32, %[[RESULT:.*]]: f32):
-// CHECK-NEXT:   linalg.yield %[[OPERAND]] : f32
-
-// -----
-
-// CHECK-DAG: #[[RESULT_MAP_0:.*]] = affine_map<(d0, d1) -> ()>
-// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-LABEL: func @static_broadcast_in_dim_scalar
-func @static_broadcast_in_dim_scalar(%operand: memref<f32>,
-                                     %result: memref<5x10xf32>) {
-  "lmhlo.broadcast_in_dim"(%operand, %result) {
-    broadcast_dimensions = dense<[]> : tensor<0xi64>
-  } : (memref<f32>, memref<5x10xf32>) -> ()
-  return
-}
-// CHECK-NOT: linalg.{{.*}}shape
-// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[RESULT_MAP_0]], #[[RESULT_MAP]]]
-// CHECK-NEXT: ^bb0(%[[CONST:.*]]: f32, %[[RESULT:.*]]: f32):
-// CHECK-NEXT:   linalg.yield %[[CONST]] : f32
-
-// -----
-
-// CHECK-DAG: #[[OPERAND_MAP:.+]] = affine_map<(d0, d1) -> (d0)>
-// CHECK-DAG: #[[RESULT_MAP:.+]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-LABEL: func @static_broadcast_in_dim_with_one_to_one
-func @static_broadcast_in_dim_with_one_to_one(%operand: memref<1xf32>,
-                                              %result: memref<1x5xf32>) {
-  "lmhlo.broadcast_in_dim"(%operand, %result) {
-    broadcast_dimensions = dense<[0]> : tensor<1xi64>
-  } : (memref<1xf32>, memref<1x5xf32>) -> ()
-  return
-}
-// CHECK-NOT: linalg.{{.*}}shape
-// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]]
-// CHECK-NEXT: ^bb0(%[[OPERAND:.+]]: f32, %{{.+}}: f32):
-// CHECK-NEXT:   linalg.yield %[[OPERAND]] : f32
-
-// -----
-
-// CHECK-DAG: #[[RESULT_MAP:.+]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-LABEL: func @static_broadcast_in_dim_with_one_to_many
-func @static_broadcast_in_dim_with_one_to_many(%operand: memref<1xf32>,
-                                               %result: memref<5x5xf32>) {
-  "lmhlo.broadcast_in_dim"(%operand, %result) {
-    broadcast_dimensions = dense<[1]> : tensor<1xi64>
-  } : (memref<1xf32>, memref<5x5xf32>) -> ()
-  return
-}
-// CHECK-NOT: linalg.{{.*}}shape
-// CHECK: %[[C0:.*]] = arith.constant 0 : index
-// CHECK: %[[VALUE:.*]] = memref.load %{{.*}}[[C0]]
-// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[RESULT_MAP]]]
-// CHECK-NEXT: ^bb0(%{{.+}}: f32):
-// CHECK-NEXT:   linalg.yield %[[VALUE]] : f32
-
-// -----
-
-// CHECK-LABEL: func @constant
-func @constant(%value: memref<i32>) {
-  "lmhlo.constant"(%value) {
-    value = dense<10> : tensor<i32>
-  } : (memref<i32>) -> ()
-  return
-}
-// CHECK: %[[CONSTANT:.*]] = arith.constant 10 : i32
-// CHECK: affine.store %[[CONSTANT]], %{{.*}}[] : memref<i32>
-
-// -----
-
-// CHECK-LABEL: func @absf
-func @absf(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.abs"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.abs %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @complex_abs
-func @complex_abs(%input: memref<2x2xcomplex<f32>>, %result: memref<2x2xf32>) {
-  "lmhlo.abs"(%input, %result)
-      : (memref<2x2xcomplex<f32>>, memref<2x2xf32>) -> ()
-  return
-}
-
-// CHECK:      linalg.generic
-// CHECK-NEXT: ^bb0(%[[CPLX_IN:.*]]: complex<f32>, %[[ABS_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[ABS:.*]] = complex.abs %[[CPLX_IN:.*]] : complex<f32>
-// CHECK-NEXT:   linalg.yield %[[ABS]] : f32
-
-// -----
-
-// CHECK-LABEL: func @absi
-func @absi(%input: memref<2x2xi32>,
-          %result: memref<2x2xi32>) {
-  "lmhlo.abs"(%input, %result) : (memref<2x2xi32>, memref<2x2xi32>) -> ()
-  return
-}
-
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[L0:.*]] = arith.constant 0 : i32
-// CHECK-NEXT:   %[[L1:.*]] = arith.cmpi sge, %[[OPERAND_IN]], %[[L0]] : i32
-// CHECK-NEXT:   %[[L2:.*]] = arith.subi %[[L0]], %[[OPERAND_IN]] : i32
-// CHECK-NEXT:   %[[RESULT:.*]] = select %[[L1]], %[[OPERAND_IN]], %[[L2]] : i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i32
-
-// -----
-
-// CHECK-LABEL: func @ceil
-func @ceil(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.ceil"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.ceil %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @bitcast_convert
-func @bitcast_convert(%input: memref<2x2xi32>, %result: memref<2x2xf32>) {
-  "lmhlo.bitcast_convert"(%input, %result) : (memref<2x2xi32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.bitcast %[[OPERAND_IN]] : i32 to f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @convert_i1_to_f32
-func @convert_i1_to_f32(%input: memref<2x2xi1>, %result: memref<2x2xf32>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xi1>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i1, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.uitofp %[[OPERAND_IN]] : i1 to f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @convert_ui8_to_f32
-func @convert_ui8_to_f32(%input: memref<2x2xui8>, %result: memref<2x2xf32>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xui8>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: unrealized_conversion_cast %arg0 : memref<2x2xui8> to memref<2x2xi8>
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i8, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.uitofp %[[OPERAND_IN]] : i8 to f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @convert_i1_to_i32
-func @convert_i1_to_i32(%input: memref<2x2xi1>, %result: memref<2x2xi32>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xi1>, memref<2x2xi32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i1, %[[RESULT_OUT:.*]]: i32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.extui %[[OPERAND_IN]] : i1 to i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i32
-
-// -----
-
-// CHECK-LABEL: func @convert_i32_to_f32
-func @convert_i32_to_f32(%input: memref<2x2xi32>, %result: memref<2x2xf32>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xi32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.sitofp %[[OPERAND_IN]] : i32 to f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @convert_ui32_to_f32
-func @convert_ui32_to_f32(%input: memref<2x2xui32>, %result: memref<2x2xf32>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xui32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: unrealized_conversion_cast %arg0 : memref<2x2xui32> to memref<2x2xi32>
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.uitofp %[[OPERAND_IN]] : i32 to f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @convert_i16_to_i32
-func @convert_i16_to_i32(%input: memref<2x2xi16>,
-          %result: memref<2x2xi32>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xi16>, memref<2x2xi32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i16, %[[RESULT_OUT:.*]]: i32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.extsi %[[OPERAND_IN]] : i16 to i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i32
-
-// -----
-
-// CHECK-LABEL: func @convert_ui16_to_i32
-func @convert_ui16_to_i32(%input: memref<2x2xui16>,
-          %result: memref<2x2xi32>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xui16>, memref<2x2xi32>) -> ()
-  return
-}
-// CHECK: unrealized_conversion_cast %arg0 : memref<2x2xui16> to memref<2x2xi16>
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i16, %[[RESULT_OUT:.*]]: i32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.extui %[[OPERAND_IN]] : i16 to i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i32
-
-// -----
-
-// CHECK-LABEL: func @convert_i32_to_i16
-func @convert_i32_to_i16(%input: memref<2x2xi32>, %result: memref<2x2xi16>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xi32>, memref<2x2xi16>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32, %[[RESULT_OUT:.*]]: i16):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.trunci %[[OPERAND_IN]] : i32 to i16
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i16
-
-// -----
-
-// CHECK-LABEL: func @convert_f32_to_f64
-func @convert_f32_to_f64(%input: memref<2x2xf32>, %result: memref<2x2xf64>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xf32>, memref<2x2xf64>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]: f64):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.extf %[[OPERAND_IN]] : f32 to f64
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f64
-
-// -----
-
-// CHECK-LABEL: func @convert_f64_to_f32
-func @convert_f64_to_f32(%input: memref<2x2xf64>, %result: memref<2x2xf32>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xf64>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f64, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.truncf %[[OPERAND_IN]] : f64 to f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @convert_i32_to_i32
-func @convert_i32_to_i32(%input: memref<2x2xi32>, %result: memref<2x2xi32>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xi32>, memref<2x2xi32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32, %[[RESULT_OUT:.*]]: i32):
-// CHECK-NEXT: linalg.yield %[[OPERAND_IN]] : i32
-
-// -----
-
-// CHECK-LABEL: func @convert_f32_to_f32
-func @convert_f32_to_f32(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.convert"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]: f32):
-// CHECK-NEXT: linalg.yield %[[OPERAND_IN]] : f32
-
-// -----
-
-// CHECK-LABEL: func @convert_i32_to_i1
-func @convert_i32_to_i1(%input: memref<2x2xi32>, %result: memref<2x2xi1>) {
-  "lmhlo.convert"(%input, %result)
-      : (memref<2x2xi32>, memref<2x2xi1>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32, %[[RESULT_OUT:.*]]: i1):
-// CHECK-NEXT:   %[[ZERO:.*]] = arith.constant 0 : i32
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.cmpi ne, %[[OPERAND_IN]], %[[ZERO]] : i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i1
-
-// -----
-
-// CHECK-LABEL: func @convert_ui32_to_i1
-func @convert_ui32_to_i1(%input: memref<2x2xui32>, %result: memref<2x2xi1>) {
-  "lmhlo.convert"(%input, %result)
-      : (memref<2x2xui32>, memref<2x2xi1>) -> ()
-  return
-}
-// CHECK: unrealized_conversion_cast %arg0 : memref<2x2xui32> to memref<2x2xi32>
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32, %[[RESULT_OUT:.*]]: i1):
-// CHECK-NEXT:   %[[ZERO:.*]] = arith.constant 0 : i32
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.cmpi ne, %[[OPERAND_IN]], %[[ZERO]] : i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i1
-
-// -----
-
-// CHECK-LABEL: func @convert_f32_to_i1
-func @convert_f32_to_i1(%input: memref<2x2xf32>, %result: memref<2x2xi1>) {
-  "lmhlo.convert"(%input, %result)
-      : (memref<2x2xf32>, memref<2x2xi1>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]: i1):
-// CHECK-NEXT:   %[[ZERO:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.cmpf une, %[[OPERAND_IN]], %[[ZERO]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i1
-
-// -----
-
-// CHECK-LABEL: func @convert_f32_to_i32
-func @convert_f32_to_i32(%input: memref<2x2xf32>, %result: memref<2x2xi32>) {
-  "lmhlo.convert"(%input, %result)
-      : (memref<2x2xf32>, memref<2x2xi32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]: i32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.fptosi %[[OPERAND_IN]] : f32 to i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i32
-
-// -----
-
-// CHECK-LABEL: func @cos
-func @cos(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.cosine"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.cos %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @sin
-func @sin(%input: memref<2x2xf32>,
-          %result: memref<2x2xf32>) {
-  "lmhlo.sine"(%input, %result)
-      : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.sin %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @floor
-func @floor(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.floor"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.floor %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @negf
-func @negf(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.negate"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.negf %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @complex_neg
-func @complex_neg(%input: memref<2x2xcomplex<f32>>,
-                  %result: memref<2x2xcomplex<f32>>) {
-  "lmhlo.negate"(%input, %result) : (memref<2x2xcomplex<f32>>,
-                                     memref<2x2xcomplex<f32>>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: complex<f32>, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = complex.neg %[[OPERAND_IN]] : complex<f32>
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : complex<f32>
-
-// -----
-
-// -----
-
-// CHECK-LABEL: func @negi
-func @negi(%input: memref<2x2xi32>, %result: memref<2x2xi32>) {
-  "lmhlo.negate"(%input, %result) : (memref<2x2xi32>, memref<2x2xi32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[L0:.*]] = arith.constant 0 : i32
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.subi %[[L0]], %[[OPERAND_IN]] : i32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i32
-
-// -----
-
-// CHECK-LABEL: func @not
-func @not(%input: memref<2x2xi64>, %result: memref<2x2xi64>) {
-  "lmhlo.not"(%input, %result) : (memref<2x2xi64>, memref<2x2xi64>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i64, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[N1:.*]] = arith.constant -1 : i64
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.xori %[[N1]], %[[OPERAND_IN]] : i64
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i64
-
-// -----
-
-// CHECK-LABEL: func @rem
-func @remainder(%lhs: memref<2x2xf32>, %rhs: memref<2x2xf32>,
-                %result: memref<2x2xf32>) {
-  "lmhlo.remainder"(%lhs, %rhs, %result)
-      : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[LHS_IN:.*]]: f32, %[[RHS_IN:.*]]: f32, %[[RESULT:.*]]: f32):
-// CHECK-NEXT:   %[[RESULT:.*]] = arith.remf %[[LHS_IN]], %[[RHS_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @rsqrt
-func @rsqrt(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.rsqrt"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.rsqrt %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @sign
-func @sign(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.sign"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[CST_0:.*]] = arith.constant 0.000000e+00 : f32
-// CHECK-NEXT:   %[[NE_0:.*]] = arith.cmpf one, %[[OPERAND_IN]], %[[CST_0]] : f32
-// CHECK-NEXT:   %[[NE_0_FLOAT:.*]] = arith.uitofp %[[NE_0]] : i1 to f32
-// CHECK-NEXT:   %[[SIGN:.*]] = math.copysign %[[NE_0_FLOAT]], %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   %[[CMP:.*]] = arith.cmpf uno, %[[OPERAND_IN]], %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   %[[RESULT:.*]] = select %[[CMP]], %[[OPERAND_IN]], %[[SIGN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @sign_bf16
-func @sign_bf16(%input: memref<2x2xbf16>, %result: memref<2x2xbf16>) {
-  "lmhlo.sign"(%input, %result) : (memref<2x2xbf16>, memref<2x2xbf16>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: bf16, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[CST_0:.*]] = arith.constant 0.000000e+00 : bf16
-// CHECK-NEXT:   %[[NE_0:.*]] = arith.cmpf one, %[[OPERAND_IN]], %[[CST_0]] : bf16
-// CHECK-NEXT:   %[[NE_0_FLOAT:.*]] = arith.uitofp %[[NE_0]] : i1 to bf16
-// CHECK-NEXT:   %[[SIGN:.*]] = math.copysign %[[NE_0_FLOAT]], %[[OPERAND_IN]] : bf16
-// CHECK-NEXT:   %[[CMP:.*]] = arith.cmpf uno, %[[OPERAND_IN]], %[[OPERAND_IN]] : bf16
-// CHECK-NEXT:   %[[RESULT:.*]] = select %[[CMP]], %[[OPERAND_IN]], %[[SIGN]] : bf16
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : bf16
-
-// -----
-
-// CHECK-LABEL: func @sign_i16
-func @sign_i16(%input: memref<2x2xi16>, %result: memref<2x2xi16>) {
-  "lmhlo.sign"(%input, %result) : (memref<2x2xi16>, memref<2x2xi16>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: i16, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[C0:.*]] = arith.constant 0 : i16
-// CHECK-NEXT:   %[[C15:.*]] = arith.constant 15 : i16
-// CHECK-NEXT:   %[[C1:.*]] = arith.constant 1 : i16
-// CHECK-NEXT:   %[[CMP:.*]] = arith.cmpi eq, %[[OPERAND_IN]], %[[C0]] : i16
-// CHECK-NEXT:   %[[ASHR:.*]] = arith.shrsi %[[OPERAND_IN]], %[[C15]] : i16
-// CHECK-NEXT:   %[[OR:.*]] = arith.ori %[[ASHR]], %[[C1]] : i16
-// CHECK-NEXT:   %[[RESULT:.*]] = select %[[CMP]], %[[C0]], %[[OR]] : i16
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : i16
-
-// -----
-
-// CHECK-LABEL: func @sign_complex
-func @sign_complex(%input: memref<2x2xcomplex<f32>>,
-                   %result: memref<2x2xcomplex<f32>>) {
-  "lmhlo.sign"(%input, %result) : (memref<2x2xcomplex<f32>>,
-                                   memref<2x2xcomplex<f32>>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: complex<f32>, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = complex.sign %[[OPERAND_IN]] : complex<f32>
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : complex<f32>
-
-// -----
-
-// CHECK-LABEL: func @sqrt
-func @sqrt(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.sqrt"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.sqrt %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @tanh
-func @tanh(%input: memref<2x2xf32>, %result: memref<2x2xf32>) {
-  "lmhlo.tanh"(%input, %result) : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic
-// CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32, %[[RESULT_OUT:.*]]):
-// CHECK-NEXT:   %[[RESULT:.*]] = math.tanh %[[OPERAND_IN]] : f32
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : f32
-
-// -----
-
-// CHECK-LABEL: func @complex
-func @complex(%real: memref<2x2xf32>,
-              %imag: memref<2x2xf32>,
-              %cplx: memref<2x2xcomplex<f32>>) {
-  "lmhlo.complex"(%real, %imag, %cplx)
-      : (memref<2x2xf32>, memref<2x2xf32>, memref<2x2xcomplex<f32>>) -> ()
-  return
-}
-// CHECK:      linalg.generic
-// CHECK-NEXT: ^bb0(%[[RE:.*]]: f32, %[[IM:.*]]: f32, %[[CP:.*]]: complex<f32>):
-// CHECK-NEXT:   %[[RESULT:.*]] = complex.create %[[RE]], %[[IM]] : complex<f32>
-// CHECK-NEXT:   linalg.yield %[[RESULT]] : complex<f32>
-
-// -----
-
-// CHECK-LABEL: func @real
-func @real(%cplx: memref<2x2xcomplex<f32>>,
-           %real: memref<2x2xf32>) {
-  "lmhlo.real"(%cplx, %real)
-      : (memref<2x2xcomplex<f32>>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK:      linalg.generic
-// CHECK-NEXT: ^bb0(%[[CPLX_IN:.*]]: complex<f32>, %[[REAL_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[REAL:.*]] = complex.re %[[CPLX_IN:.*]] : complex<f32>
-// CHECK-NEXT:   linalg.yield %[[REAL]] : f32
-
-// -----
-
-// CHECK-LABEL: func @imag
-func @imag(%cplx: memref<2x2xcomplex<f32>>,
-           %imag: memref<2x2xf32>) {
-  "lmhlo.imag"(%cplx, %imag)
-      : (memref<2x2xcomplex<f32>>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK:      linalg.generic
-// CHECK-NEXT: ^bb0(%[[CPLX_IN:.*]]: complex<f32>, %[[IMAG_OUT:.*]]: f32):
-// CHECK-NEXT:   %[[IMAG:.*]] = complex.im %[[CPLX_IN:.*]] : complex<f32>
-// CHECK-NEXT:   linalg.yield %[[IMAG]] : f32
-
-// -----
-
-// CHECK: func @slice(%[[IN:.*]]: memref<?x?xf32>, %[[OUT:.*]]: memref<?x?xf32>)
-func @slice(%operand: memref<?x?xf32>, %result: memref<?x?xf32>) {
-  "lmhlo.slice"(%operand, %result) {
-    start_indices = dense<[0,1]> : tensor<2xi64>,
-    limit_indices = dense<[2,3]> : tensor<2xi64>,
-    strides = dense<[1,1]> : tensor<2xi64>
-  } : (memref<?x?xf32>, memref<?x?xf32>) -> ()
-  return
-}
-// CHECK: %[[RESULT:.*]] = memref.subview %[[IN]][0, 1] [2, 2] [1, 1] : memref<?x?xf32> to memref<2x2xf32, #{{.*}}>
-// CHECK: linalg.copy(%[[RESULT]], %[[OUT]])
-
-// -----
-
-// CHECK: func @slice_with_strides(%[[IN:.*]]: memref<?xf32>, %[[OUT:.*]]: memref<?xf32>)
-func @slice_with_strides(%operand: memref<?xf32>, %result: memref<?xf32>) {
-  "lmhlo.slice"(%operand, %result) {
-    limit_indices = dense<12> : tensor<1xi64>,
-    start_indices = dense<0> : tensor<1xi64>,
-    strides = dense<2> : tensor<1xi64>
-  } : (memref<?xf32>, memref<?xf32>) -> ()
-  return
-}
-// CHECK: %[[RESULT:.*]] = memref.subview %[[IN]][0] [6] [2] : memref<?xf32> to memref<6xf32, #{{.*}}>
-// CHECK: linalg.copy(%[[RESULT]], %[[OUT]])
-
-// -----
-
-// CHECK-LABEL: func @reshape_3D_2D
-func @reshape_3D_2D(%arg0: memref<12x1x42xi32>, %arg1 : memref<12x42xi32>) {
-  "lmhlo.reshape"(%arg0, %arg1)
-    : (memref<12x1x42xi32>, memref<12x42xi32>) -> ()
-  return
-}
-// CHECK: memref.collapse_shape %{{.*}} {{\[}}[0, 1], [2]]
-// CHECK-NEXT: linalg.copy
-
-// -----
-
-// CHECK-LABEL: func @reshape_4D_2D
-func @reshape_4D_2D(%arg0: memref<12x42x1x1xi32>, %arg1 : memref<12x42xi32>) {
-  "lmhlo.reshape"(%arg0, %arg1)
-    : (memref<12x42x1x1xi32>, memref<12x42xi32>) -> ()
-  return
-}
-// CHECK: memref.collapse_shape %{{.*}} {{\[}}[0], [1, 2, 3]]
-// CHECK-NEXT: linalg.copy
-
-// -----
-
-// CHECK-LABEL: func @reshape_2D_4D
-func @reshape_2D_4D(%arg0: memref<12x42xi32>, %arg1 : memref<12x1x42x1xi32>) {
-  "lmhlo.reshape"(%arg0, %arg1)
-    : (memref<12x42xi32>, memref<12x1x42x1xi32>) -> ()
-  return
-}
-// CHECK: memref.expand_shape %{{.*}} {{\[}}[0, 1], [2, 3]]
-// CHECK-NEXT: linalg.copy
-
-// -----
-
-// CHECK-LABEL: func @reshape_3D_4D
-func @reshape_3D_4D(%arg0: memref<1x49x16xf32>, %arg1: memref<1x784x1x1xf32>) {
-  "lmhlo.reshape"(%arg0, %arg1)
-   : (memref<1x49x16xf32>, memref<1x784x1x1xf32>) -> ()
-  return
-}
-// CHECK: memref.collapse_shape %{{.*}} {{\[}}[0, 1, 2]]
-// CHECK: memref.expand_shape %{{.*}} {{\[}}[0, 1, 2, 3]]
-// CHECK: linalg.copy
-
-// -----
-
-// CHECK-LABEL: func @reshape_4D_3D
-func @reshape_4D_3D(%arg0: memref<1x8x10x3xf32>, %arg1: memref<1x240x1xf32>) {
-  "lmhlo.reshape"(%arg0, %arg1)
-   : (memref<1x8x10x3xf32>, memref<1x240x1xf32>) -> ()
-  return
-}
-// CHECK: memref.collapse_shape %{{.*}} {{\[}}[0, 1, 2, 3]]
-// CHECK: memref.expand_shape %{{.*}} {{\[}}[0, 1, 2]]
-// CHECK: linalg.copy
-
-// -----
-
-// CHECK-LABEL: func @reshape1_4D_4D
-func @reshape1_4D_4D(%arg0: memref<4x512x1x1xi32>,
-                     %arg1: memref<1x4x1x512xi32>) {
-  "lmhlo.reshape"(%arg0, %arg1)
-   : (memref<4x512x1x1xi32>, memref<1x4x1x512xi32>) -> ()
-  return
-}
-// CHECK: memref.collapse_shape %{{.*}} {{\[}}[0, 1, 2, 3]]
-// CHECK: memref.expand_shape %{{.*}} {{\[}}[0, 1, 2, 3]]
-
-// -----
-
-// CHECK-LABEL: func @reshape2_4D_4D
-func @reshape2_4D_4D(%arg0: memref<4x1x1x1024xi32>,
-                     %arg1: memref<4x1024x1x1xi32>) {
-  "lmhlo.reshape"(%arg0, %arg1)
-   : (memref<4x1x1x1024xi32>, memref<4x1024x1x1xi32>) -> ()
-  return
-}
-// CHECK: memref.collapse_shape %{{.*}} {{\[}}[0, 1, 2, 3]]
-// CHECK: memref.expand_shape %{{.*}} {{\[}}[0, 1, 2, 3]]
-
-// -----
-
-// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)>
-// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-LABEL: func @reverse
-func @reverse(%arg0: memref<2x3xf32>, %arg1: memref<2x3xf32>) {
-  "lmhlo.reverse"(%arg0, %arg1) {
-    dimensions = dense<1> : tensor<1xi64>
-  } : (memref<2x3xf32>, memref<2x3xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]]
-
-// -----
-
-// CHECK-DAG: #[[TRANSPOSE_INPUT_MAP:.*]] = affine_map<(d0, d1) -> (d1, d0)>
-// CHECK-DAG: #[[TRANSPOSE_OUTPUT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-LABEL: func @transpose
-func @transpose(%arg0: memref<2x2xf32>, %arg1: memref<2x2xf32>) {
-  "lmhlo.transpose"(%arg0, %arg1) {
-    permutation = dense<[1, 0]> : tensor<2xi64>
-  } : (memref<2x2xf32>, memref<2x2xf32>) -> ()
-  return
-}
-// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[TRANSPOSE_INPUT_MAP]], #[[TRANSPOSE_OUTPUT_MAP]]]
-
-// -----
-
-// CHECK-DAG: #[[REDUCE_INPUT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-DAG: #[[REDUCE_OUTPUT_MAP:.*]] = affine_map<(d0, d1) -> (d0)>
-// CHECK-LABEL: func @reduce_add
-func @reduce_add(%arg: memref<100x10xf32>,
-             %init: memref<f32>,
-             %result: memref<100xf32>) {
-  "lmhlo.reduce"(%arg, %init, %result) ( {
-    ^bb0(%lhs: memref<f32>, %rhs: memref<f32>, %res: memref<f32>):
-      "lmhlo.add"(%lhs, %rhs, %res)
-        : (memref<f32>, memref<f32>, memref<f32>) -> ()
-      "lmhlo.terminator"() : () -> ()
-    } ) {dimensions = dense<[1]> : tensor<1xi64>}
-      : (memref<100x10xf32>, memref<f32>, memref<100xf32>) -> ()
-  return
-}
-// CHECK: %[[INIT_VAL:.*]] = memref.load %arg1[] : memref<f32>
-// CHECK: linalg.fill(%[[INIT_VAL]], %arg2)
-// CHECK: linalg.generic {
-// CHECK-SAME: indexing_maps = [#[[REDUCE_INPUT_MAP]], #[[REDUCE_OUTPUT_MAP]]],
-// CHECK-SAME: iterator_types = ["parallel", "reduction"]}
-// CHECK-SAME: ins(%arg0 : memref<100x10xf32>) outs(%arg2 : memref<100xf32>) {
-// CHECK: memref.alloca
-// CHECK-NEXT: memref.alloca
-// CHECK-NEXT: memref.alloca
-// CHECK-NEXT: memref.store
-// CHECK-NEXT: memref.store
-// CHECK-NEXT: memref.load
-// CHECK-NEXT: memref.load
-// CHECK-NEXT: addf
-// CHECK-NEXT: memref.store
-// CHECK-NEXT: memref.load
-// CHECK-NEXT: linalg.yield
-// CHECK-NEXT: }
-
-// -----
-
-// CHECK-DAG: #[[REDUCE_INPUT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-DAG: #[[REDUCE_OUTPUT_MAP:.*]] = affine_map<(d0, d1) -> (d0)>
-// CHECK-LABEL: func @reduce_maximum
-func @reduce_maximum(%arg: memref<100x10xf32>,
-             %init: memref<f32>,
-             %result: memref<100xf32>) {
-  "lmhlo.reduce"(%arg, %init, %result) ( {
-    ^bb0(%lhs: memref<f32>, %rhs: memref<f32>, %res: memref<f32>):
-      "lmhlo.maximum"(%lhs, %rhs, %res)
-        : (memref<f32>, memref<f32>, memref<f32>) -> ()
-      "lmhlo.terminator"() : () -> ()
-    } ) {dimensions = dense<[1]> : tensor<1xi64>}
-      : (memref<100x10xf32>, memref<f32>, memref<100xf32>) -> ()
-  return
-}
-// CHECK: %[[INIT_VAL:.*]] = memref.load %arg1[] : memref<f32>
-// CHECK: linalg.fill(%[[INIT_VAL]], %arg2)
-// CHECK: linalg.generic {
-// CHECK-SAME: indexing_maps = [#[[REDUCE_INPUT_MAP]], #[[REDUCE_OUTPUT_MAP]]],
-// CHECK-SAME: iterator_types = ["parallel", "reduction"]}
-// CHECK-SAME: ins(%arg0 : memref<100x10xf32>) outs(%arg2 : memref<100xf32>) {
-// CHECK: memref.alloca
-// CHECK-NEXT: memref.alloca
-// CHECK-NEXT: memref.alloca
-// CHECK-NEXT: memref.store
-// CHECK-NEXT: memref.store
-// CHECK-NEXT: memref.load
-// CHECK-NEXT: memref.load
-// CHECK: arith.maxf
-// CHECK: memref.store
-// CHECK-NEXT: memref.load
-// CHECK-NEXT: linalg.yield
-// CHECK-NEXT: }
-
-// -----
-
-// CHECK-DAG: #[[REDUCE_INPUT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)>
-// CHECK-DAG: #[[REDUCE_OUTPUT_MAP:.*]] = affine_map<(d0, d1) -> (d0)>
-// CHECK-LABEL: func @reduce_multiple_operand
-module  {
-  func @reduce_multiple_operand(%arg0: memref<1x8xf32>, %arg1: memref<1x8xi32>,
-                                %arg2: memref<f32>, %arg3: memref<i32>,
-                                %arg4: memref<1xf32>, %arg5: memref<1xi32>) {
-    "lmhlo.reduce"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) ( {
-    ^bb0(%arg6: memref<f32>, %arg7: memref<i32>, %arg8: memref<f32>,
-         %arg9: memref<i32>, %arg10: memref<f32>, %arg11: memref<i32>):
-      "lmhlo.add"(%arg6, %arg8, %arg10) : (memref<f32>, memref<f32>, memref<f32>) -> ()
-      "lmhlo.add"(%arg7, %arg9, %arg11) : (memref<i32>, memref<i32>, memref<i32>) -> ()
-      "lmhlo.terminator"() : () -> ()
-    }) {dimensions = dense<1> : tensor<1xi64>} 
-       : (memref<1x8xf32>, memref<1x8xi32>, memref<f32>, memref<i32>, memref<1xf32>, memref<1xi32>) -> ()
-    return
-  }
-}
-// CHECK: %[[INIT_VAL0:.*]] = memref.load %arg2[] : memref<f32>
-// CHECK: linalg.fill(%[[INIT_VAL0]], %arg4) : f32, memref<1xf32> 
-// CHECK: %[[INIT_VAL1:.*]] = memref.load %arg3[] : memref<i32>
-// CHECK: linalg.fill(%[[INIT_VAL1]], %arg5) : i32, memref<1xi32> 
-// CHECK: linalg.generic {
-// CHECK-SAME: indexing_maps = [#[[REDUCE_INPUT_MAP]], #[[REDUCE_INPUT_MAP]], #[[REDUCE_OUTPUT_MAP]], #[[REDUCE_OUTPUT_MAP]]],
-// CHECK-SAME: iterator_types = ["parallel", "reduction"]}
-// CHECK-SAME: ins(%arg0, %arg1 : memref<1x8xf32>, memref<1x8xi32>) outs(%arg4, %arg5 : memref<1xf32>, memref<1xi32>) {
-// CHECK: %[[lhsf:[0-9]+]] = memref.alloca() : memref<f32>
-// CHECK: %[[lhsi:[0-9]+]] = memref.alloca() : memref<i32>
-// CHECK: %[[rhsf:[0-9]+]] = memref.alloca() : memref<f32>
-// CHECK: %[[rhsi:[0-9]+]] = memref.alloca() : memref<i32>
-// CHECK: %[[outf:[0-9]+]] = memref.alloca() : memref<f32>
-// CHECK: %[[outi:[0-9]+]] = memref.alloca() : memref<i32>
-// CHECK: memref.store %arg[[#%u,idx:]], %[[lhsf]][] : memref<f32>
-// CHECK: memref.store %arg[[#idx+1]], %[[lhsi]][] : memref<i32>
-// CHECK: memref.store %arg[[#idx+2]], %[[rhsf]][] : memref<f32>
-// CHECK: memref.store %arg[[#idx+3]], %[[rhsi]][] : memref<i32>
-// CHECK: %[[lhsf_tmp:[0-9]+]] = memref.load %[[lhsf]][] : memref<f32>
-// CHECK: %[[rhsf_tmp:[0-9]+]] = memref.load %[[rhsf]][] : memref<f32>
-// CHECK: %[[outf_tmp:[0-9]+]] = arith.addf %[[lhsf_tmp]], %[[rhsf_tmp]] : f32
-// CHECK: memref.store %[[outf_tmp]], %[[outf]][] : memref<f32>
-// CHECK: %[[lhsi_tmp:[0-9]+]] = memref.load %[[lhsi]][] : memref<i32>
-// CHECK: %[[rhsi_tmp:[0-9]+]] = memref.load %[[rhsi]][] : memref<i32>
-// CHECK: %[[outi_tmp:[0-9]+]] = arith.addi %[[lhsi_tmp]], %[[rhsi_tmp]] : i32
-// CHECK: memref.store %[[outi_tmp]], %[[outi]][] : memref<i32>
-// CHECK: %[[scalarf:[0-9]+]] = memref.load %[[outf]][] : memref<f32>
-// CHECK: %[[scalari:[0-9]+]] = memref.load %[[outi]][] : memref<i32>
-// CHECK: linalg.yield %[[scalarf]], %[[scalari]] : f32, i32
diff --git a/tensorflow/compiler/mlir/hlo/tests/reshape-simplifier.mlir b/tensorflow/compiler/mlir/hlo/tests/reshape-simplifier.mlir
index c4f4bec6436c4a..ce741b801daa2b 100644
--- a/tensorflow/compiler/mlir/hlo/tests/reshape-simplifier.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/reshape-simplifier.mlir
@@ -168,6 +168,31 @@ func @redundant_cstr_reshapable(%arg0 : tensor<?x8x?x64xf32>)
 
 // -----
 
+// CHECK-LABEL: @dynamic_reshape_to_collapse_shape
+// CHECK-SAME: %[[ARG:.*]]: tensor<1x4x?x64x?x8x1x1xf32>
+func @dynamic_reshape_to_collapse_shape(%arg0 : tensor<1x4x?x64x?x8x1x1xf32>)
+    -> tensor<?x?x8xf32> {
+  // CHECK: %[[RESULT:.*]] = linalg.tensor_collapse_shape %[[ARG]] {{\[}}[0, 1, 2], [3, 4], [5, 6, 7]{{\]}}
+  // CHECK: return %[[RESULT]]
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
+  %c4_i32 = arith.constant 4 : i32
+  %c8_i32 = arith.constant 8 : i32
+  %c64_i32 = arith.constant 64 : i32
+  %d2 = tensor.dim %arg0, %c2 : tensor<1x4x?x64x?x8x1x1xf32>
+  %d4 = tensor.dim %arg0, %c4 : tensor<1x4x?x64x?x8x1x1xf32>
+  %d2_i32 = arith.index_cast %d2 : index to i32
+  %d4_i32 = arith.index_cast %d4 : index to i32
+  %s0 = arith.muli %c4_i32, %d2_i32 : i32
+  %s1 = arith.muli %c64_i32, %d4_i32 : i32
+  %shape = tensor.from_elements %s0, %s1, %c8_i32 : tensor<3xi32>
+  %result = "mhlo.dynamic_reshape"(%arg0, %shape)
+      : (tensor<1x4x?x64x?x8x1x1xf32>, tensor<3xi32>) -> tensor<?x?x8xf32>
+  return %result : tensor<?x?x8xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @reshape_integration(
 // CHECK-SAME:      %arg0: tensor<512x512xf32>,
 // CHECK-SAME:      %arg1: tensor<?x8x?x64xf32>,
@@ -216,7 +241,7 @@ func @reshape_integration(%arg0: tensor<512x512xf32>, %arg1: tensor<?x8x?x64xf32
   %19 = shape.assuming %18 -> (tensor<?x512xf32>) {
     // CHECK-NOT: compute_reshape_shape
     %20 = mhlo.compute_reshape_shape %17, %15 : index, tensor<2xi32> -> tensor<2xi32>
-    // CHECK: "mhlo.dynamic_reshape"
+    // CHECK: linalg.tensor_collapse_shape
     %21 = "mhlo.dynamic_reshape"(%6, %20) : (tensor<?x?x64x8xf32>, tensor<2xi32>) -> tensor<?x512xf32>
     // CHECK-NOT: assuming_yield
     shape.assuming_yield %21 : tensor<?x512xf32>
diff --git a/tensorflow/compiler/mlir/hlo/tests/shape-component-analysis.mlir b/tensorflow/compiler/mlir/hlo/tests/shape-component-analysis.mlir
index 9222f694c94658..fcbf6a62d3722f 100644
--- a/tensorflow/compiler/mlir/hlo/tests/shape-component-analysis.mlir
+++ b/tensorflow/compiler/mlir/hlo/tests/shape-component-analysis.mlir
@@ -9,14 +9,24 @@ func @assuming(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2 : !shape.wi
   %2 = shape.shape_of %0#1 : tensor<?x?xf32> -> tensor<2xindex>
   %3 = arith.index_cast %1 : tensor<2xindex> to tensor<2xi32>
   %4 = arith.index_cast %2 : tensor<2xindex> to tensor<2xi32>
+  // CHECK:      Value info for %5 = mhlo.add %3, %4 : tensor<2xi32>
+  // CHECK-NEXT:   s0 + s1 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
+  // CHECK-NEXT:     s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[0]
+  // CHECK-NEXT:   s0 + s1 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]
+  // CHECK-NEXT:     s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[1]
   %5 = mhlo.add %3, %4 : tensor<2xi32>
-// CHECK: %5 = mhlo.add %3, %4 : tensor<2xi32>:
-// CHECK-NEXT: s0 + s1 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]; s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[0];
-// CHECK-NEXT: s0 + s1 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]; s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[1];
+  // CHECK:      Value info for %6 = mhlo.multiply %5, %4 : tensor<2xi32>
+  // CHECK-NEXT:   (s0 + s1) * s2 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
+  // CHECK-NEXT:     s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[0]
+  // CHECK-NEXT:     s2 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[0]
+  // CHECK-NEXT:   (s0 + s1) * s2 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]
+  // CHECK-NEXT:     s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[1]
+  // CHECK-NEXT:     s2 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[1]
   %6 = mhlo.multiply %5, %4 : tensor<2xi32>
-// CHECK: %6 = mhlo.multiply %5, %4 : tensor<2xi32>:
-// CHECK-NEXT: (s0 + s1) * s2 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]; s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[0]; s2 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[0];
-// CHECK-NEXT: (s0 + s1) * s2 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]; s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[1]; s2 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[1];
   return %6 : tensor<2xi32>
 }
 
@@ -24,17 +34,19 @@ func @assuming(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2 : !shape.wi
 
 // CHECK-LABEL: Testing : num_elements
 func @num_elements(%arg0: tensor<?x8x?x64xf32>) -> index {
+  // CHECK:      Value info for %0 = shape.shape_of %arg0 : tensor<?x8x?x64xf32> -> tensor<4xindex>
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x8x?x64xf32>' at index: 0)[0]
+  // CHECK-NEXT:   8
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x8x?x64xf32>' at index: 0)[2]
+  // CHECK-NEXT:   64
   %0 = shape.shape_of %arg0 : tensor<?x8x?x64xf32> -> tensor<4xindex>
-// CHECK-NEXT: %0 = shape.shape_of %arg0 : tensor<?x8x?x64xf32> -> tensor<4xindex>:
-// CHECK-NEXT:   s0 with s0 = shapeof(<block argument> of type 'tensor<?x8x?x64xf32>' at index: 0)[0];
-// CHECK-NEXT:   8
-// CHECK-NEXT:   s0 with s0 = shapeof(<block argument> of type 'tensor<?x8x?x64xf32>' at index: 0)[2];
-// CHECK-NEXT:   64
+  // CHECK:      Value info for %1 = shape.num_elements %0 : tensor<4xindex> -> index:
+  // CHECK-NEXT:   (s0 * s1) * 512 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x8x?x64xf32>' at index: 0)[0]
+  // CHECK-NEXT:     s1 = shapeof(<block argument> of type 'tensor<?x8x?x64xf32>' at index: 0)[2]
   %1 = shape.num_elements %0 : tensor<4xindex> -> index
-// CHECK: %1 = shape.num_elements %0 : tensor<4xindex> -> index:
-// CHECK-NEXT:   (s0 * s1) * 512 with
-// CHECK-SAME:   s0 = shapeof(<block argument> of type 'tensor<?x8x?x64xf32>' at index: 0)[0];
-// CHECK-SAME:   s1 = shapeof(<block argument> of type 'tensor<?x8x?x64xf32>' at index: 0)[2];
   return %1 : index
 }
 
@@ -44,10 +56,12 @@ func @num_elements(%arg0: tensor<?x8x?x64xf32>) -> index {
 func @dynamic_broadcast_in_dim(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<2xindex> {
   %0 = shape.shape_of %arg0 : tensor<?x?xf32> -> tensor<2xindex>
   %1 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<?x?xf32>, tensor<2xindex>) -> tensor<?x?xf32>
+  // CHECK:      Value info for %2 = shape.shape_of %1 : tensor<?x?xf32> -> tensor<2xindex>
+  // CHECK-NEXT: s0 with
+  // CHECK-NEXT:   s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
+  // CHECK-NEXT: s0 with
+  // CHECK-NEXT:   s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]
   %2 = shape.shape_of %1 : tensor<?x?xf32> -> tensor<2xindex>
-// CHECK: %2 = shape.shape_of %1 : tensor<?x?xf32> -> tensor<2xindex>:
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0];
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1];
   return %2 : tensor<2xindex>
 }
 
@@ -57,10 +71,12 @@ func @dynamic_broadcast_in_dim(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -
 func @dynamic_reshape(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>) -> tensor<2xindex> {
   %0 = shape.shape_of %arg0 : tensor<?x?xf32> -> tensor<2xindex>
   %1 = "mhlo.dynamic_reshape"(%arg0, %0) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<?x?xf32>, tensor<2xindex>) -> tensor<?x?xf32>
+  // CHECK:      Value info for %2 = shape.shape_of %1 : tensor<?x?xf32> -> tensor<2xindex>
+  // CHECK-NEXT: s0 with
+  // CHECK-NEXT:   s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
+  // CHECK-NEXT: s0 with
+  // CHECK-NEXT:   s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]
   %2 = shape.shape_of %1 : tensor<?x?xf32> -> tensor<2xindex>
-// CHECK: %2 = shape.shape_of %1 : tensor<?x?xf32> -> tensor<2xindex>:
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0];
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1];
   return %2 : tensor<2xindex>
 }
 
@@ -73,10 +89,12 @@ func @reduce(%arg0: tensor<?x?x?xf32>, %arg1: tensor<f32>) -> tensor<2xindex> {
     %26 = mhlo.add %a, %b : tensor<f32>
     "mhlo.return"(%26) : (tensor<f32>) -> ()
   }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<?x?x?xf32>, tensor<f32>) -> tensor<?x?xf32>
+  // CHECK:      Value info for %1 = shape.shape_of %0 : tensor<?x?xf32> -> tensor<2xindex>
+  // CHECK-NEXT: s0 with
+  // CHECK-NEXT:   s0 = shapeof(<block argument> of type 'tensor<?x?x?xf32>' at index: 0)[0]
+  // CHECK-NEXT: s0 with
+  // CHECK-NEXT:   s0 = shapeof(<block argument> of type 'tensor<?x?x?xf32>' at index: 0)[2]
   %1 = shape.shape_of %0 : tensor<?x?xf32> -> tensor<2xindex>
-// CHECK: %1 = shape.shape_of %0 : tensor<?x?xf32> -> tensor<2xindex>:
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?x?xf32>' at index: 0)[0];
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?x?xf32>' at index: 0)[2];
   return %1 : tensor<2xindex>
 }
 
@@ -85,10 +103,12 @@ func @reduce(%arg0: tensor<?x?x?xf32>, %arg1: tensor<f32>) -> tensor<2xindex> {
 // CHECK-LABEL: Testing : transpose
 func @transpose(%arg0: tensor<?x?xf32>) -> tensor<2xindex> {
   %0 = "mhlo.transpose"(%arg0) {permutation = dense<[1, 0]> : tensor<2xi64>} : (tensor<?x?xf32>) -> tensor<?x?xf32>
+  // CHECK:      Value info for %1 = shape.shape_of %0 : tensor<?x?xf32> -> tensor<2xindex>
+  // CHECK-NEXT: s0 with
+  // CHECK-NEXT:   s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]
+  // CHECK-NEXT: s0 with
+  // CHECK-NEXT:   s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
   %1 = shape.shape_of %0 : tensor<?x?xf32> -> tensor<2xindex>
-// CHECK: %1 = shape.shape_of %0 : tensor<?x?xf32> -> tensor<2xindex>:
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1];
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0];
   return %1 : tensor<2xindex>
 }
 
@@ -97,10 +117,12 @@ func @transpose(%arg0: tensor<?x?xf32>) -> tensor<2xindex> {
 // CHECK-LABEL: Testing : select
 func @select(%arg0: tensor<i1>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>) -> tensor<2xindex> {
   %0 = "mhlo.select"(%arg0, %arg1, %arg2)  : (tensor<i1>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
+  // CHECK:      Value info for %1 = shape.shape_of %0 : tensor<?x?xf32> -> tensor<2xindex>
+  // CHECK-NEXT: s0 with
+  // CHECK-NEXT:   s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[0]
+  // CHECK-NEXT: s0 with
+  // CHECK-NEXT:   s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[1]
   %1 = shape.shape_of %0 : tensor<?x?xf32> -> tensor<2xindex>
-// CHECK: %1 = shape.shape_of %0 : tensor<?x?xf32> -> tensor<2xindex>:
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[0];
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[1];
   return %1 : tensor<2xindex>
 }
 
@@ -111,9 +133,11 @@ func @dim(%arg0: tensor<?x?xf32>) -> tensor<2xindex> {
   %c0 = arith.constant 0 : index
   %d0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
   %t = tensor.from_elements %d0, %d0 : tensor<2xindex>
-// CHECK: tensor.from_elements %0, %0 : tensor<2xindex>:
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0];
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0];
+  // CHECK:      Value info for %1 = tensor.from_elements %0, %0 : tensor<2xindex>
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
   return %t : tensor<2xindex>
 }
 
@@ -124,10 +148,12 @@ func @extract(%arg0: tensor<?x?xf32>) -> tensor<2xindex> {
   %shape = shape.shape_of %arg0 : tensor<?x?xf32> -> tensor<2xindex>
   %c1 = arith.constant 1 : index
   %d0 = tensor.extract %shape[%c1] : tensor<2xindex>
+  // CHECK:      Value info for %2 = tensor.from_elements %1, %1 : tensor<2xindex>
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]
   %t = tensor.from_elements %d0, %d0 : tensor<2xindex>
-// CHECK: tensor.from_elements %1, %1 : tensor<2xindex>:
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1];
-// CHECK-NEXT: s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1];
   return %t : tensor<2xindex>
 }
 
@@ -144,15 +170,40 @@ func @symbolic_constraint(
   %1 = shape.shape_of %arg1 : tensor<?x?xf32> -> tensor<2xindex>
   %2 = arith.index_cast %0 : tensor<2xindex> to tensor<2xi32>
   %3 = arith.index_cast %1 : tensor<2xindex> to tensor<2xi32>
+  // CHECK:      Value info for %4 = mhlo.add %2, %3 : tensor<2xi32>:
+  // CHECK-NEXT:   s0 + s1 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
+  // CHECK-NEXT:     s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[0]
+  // CHECK-NEXT:   s0 + s1 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]
+  // CHECK-NEXT:     s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]
   %4 = mhlo.add %2, %3 : tensor<2xi32>
-// CHECK: %4 = mhlo.add %2, %3 : tensor<2xi32>:
-// CHECK-NEXT: s0 + s1 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]; s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[0]
-// CHECK-NEXT: s0 + s1 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[1]; s1 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 1)[1]
   return %4 : tensor<2xi32>
 }
 
 // -----
 
+// CHECK-LABEL: Testing : dynamic_reshape
+func @dynamic_reshape(%arg0: tensor<?x8x?x64xf32>, %arg1: tensor<4xi32>)
+    -> tensor<?x8x?x64xf32> {
+  %0 = shape.shape_of %arg0 : tensor<?x8x?x64xf32> -> tensor<4xindex>
+  %1 = shape.num_elements %0 : tensor<4xindex> -> index
+  %2 = mhlo.compute_reshape_shape %1, %arg1 : index, tensor<4xi32>
+      -> tensor<4xi32>
+  // CHECK:      Shape info for %3 = "mhlo.dynamic_reshape"(%arg0, %2) : (tensor<?x8x?x64xf32>, tensor<4xi32>) -> tensor<?x8x?x64xf32>
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = %2 = mhlo.compute_reshape_shape %1, %arg1 : index, tensor<4xi32> -> tensor<4xi32>[0]
+  // CHECK-NEXT:   8
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = %2 = mhlo.compute_reshape_shape %1, %arg1 : index, tensor<4xi32> -> tensor<4xi32>[2]
+  // CHECK-NEXT:   64
+  %3 = "mhlo.dynamic_reshape"(%arg0, %2)
+      : (tensor<?x8x?x64xf32>, tensor<4xi32>) -> tensor<?x8x?x64xf32>
+  return %3 : tensor<?x8x?x64xf32>
+}
+
+// -----
+
 // Larger examples.
 
 // CHECK-LABEL: Testing : softmax
@@ -167,31 +218,37 @@ func @softmax(%arg0: tensor<?x?xf32>) -> tensor<?x?xf32> {
   }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<?x?xf32>, tensor<f32>) -> tensor<?xf32>
   %4 = "mhlo.convert"(%3) : (tensor<?xf32>) -> tensor<?xf32>
   %cst = arith.constant dense<1> : tensor<1xi32>
+  // CHECK:      Value info for %5 = shape.shape_of
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
   %5 = shape.shape_of %4 : tensor<?xf32> -> tensor<1xindex>
-// CHECK: %5 = shape.shape_of
-// CHECK-NEXT:   s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
   %c1 = arith.constant 1 : index
   %c0 = arith.constant 0 : index
+  // CHECK:      Value info for %6 = tensor.extract
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
   %6 = tensor.extract %5[%c0] : tensor<1xindex>
-// CHECK: %6 = tensor.extract
-// CHECK-NEXT:   s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
+  // CHECK:      Value info for %7 = tensor.from_elements
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
+  // CHECK-NEXT:   1
   %7 = tensor.from_elements %6, %c1 : tensor<2xindex>
-// CHECK: %7 = tensor.from_elements
-// CHECK-NEXT:   s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
-// CHECK-NEXT:   1
   %8 = "mhlo.dynamic_reshape"(%4, %7) : (tensor<?xf32>, tensor<2xindex>) -> tensor<?x1xf32>
   %9 = shape.shape_of %arg0 : tensor<?x?xf32> -> tensor<2xindex>
   %10 = shape.shape_of %8 : tensor<?x1xf32> -> tensor<2xindex>
   %11 = shape.cstr_broadcastable %9, %10 : tensor<2xindex>, tensor<2xindex>
   %12 = shape.assuming %11 -> (tensor<?x?xf32>) {
+    // CHECK:      Value info for %26 = shape.shape_of %arg0 : tensor<?x?xf32> -> tensor<2xindex>:
+    // CHECK-NEXT:   s0 with
+    // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
+    // CHECK-NEXT:   s0 with
+    // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]
     %26 = shape.shape_of %arg0 : tensor<?x?xf32> -> tensor<2xindex>
-// CHECK: %26 = shape.shape_of %arg0 : tensor<?x?xf32> -> tensor<2xindex>:
-// CHECK-NEXT:  s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
-// CHECK-NEXT:  s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[1]
+    // CHECK:      Value info for %27 = shape.shape_of
+    // CHECK-NEXT:   s0 with
+    // CHECK-NEXT:     s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
+    // CHECK-NEXT:   1
     %27 = shape.shape_of %8 : tensor<?x1xf32> -> tensor<2xindex>
-// CHECK: %27 = shape.shape_of
-// CHECK-NEXT:  s0 with s0 = shapeof(<block argument> of type 'tensor<?x?xf32>' at index: 0)[0]
-// CHECK-NEXT:  1
     %28 = shape.broadcast %26, %27 : tensor<2xindex>, tensor<2xindex> -> tensor<2xindex>
     %29 = "mhlo.dynamic_broadcast_in_dim"(%arg0, %28) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<?x?xf32>, tensor<2xindex>) -> tensor<?x?xf32>
     %30 = "mhlo.dynamic_broadcast_in_dim"(%8, %28) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor<?x1xf32>, tensor<2xindex>) -> tensor<?x?xf32>
@@ -248,16 +305,20 @@ func @reshape_integration(%arg0: tensor<512x512xf32>, %arg1: tensor<?x8x?x64xf32
   %12 = "mhlo.reshape"(%11) : (tensor<1xi32>) -> tensor<i32>
   %13 = mhlo.multiply %10, %12 : tensor<i32>
   %14 = "mhlo.reshape"(%13) : (tensor<i32>) -> tensor<1xi32>
+  // CHECK:      Value info for %15 = "mhlo.concatenate"(%14, %0) {dimension = 0 : i64} : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32>
+  // CHECK-NEXT:   s0 * s1 with
+  // CHECK-NEXT:     s0 = <block argument> of type 'tensor<4xi32>' at index: 2[0]
+  // CHECK-NEXT:     s1 = <block argument> of type 'tensor<4xi32>' at index: 2[2]
+  // CHECK-NEXT:   512
   %15 = "mhlo.concatenate"(%14, %0) {dimension = 0 : i64} : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32>
-// CHECK: %15 = "mhlo.concatenate"(%14, %0) {dimension = 0 : i64} : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32>:
-// CHECK-NEXT: s0 * s1 with s0 = <block argument> of type 'tensor<4xi32>' at index: 2[0]; s1 = <block argument> of type 'tensor<4xi32>' at index: 2[2];
-// CHECK-NEXT: 512
+  // CHECK:      Value info for %16 = shape.shape_of %6 : tensor<?x?x64x8xf32> -> tensor<4xindex>:
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = <block argument> of type 'tensor<4xi32>' at index: 2[0]
+  // CHECK-NEXT:   s0 with
+  // CHECK-NEXT:     s0 = <block argument> of type 'tensor<4xi32>' at index: 2[2]
+  // CHECK-NEXT:   64
+  // CHECK-NEXT:   8
   %16 = shape.shape_of %6 : tensor<?x?x64x8xf32> -> tensor<4xindex>
-// CHECK: %16 = shape.shape_of %6 : tensor<?x?x64x8xf32> -> tensor<4xindex>:
-// CHECK-NEXT: s0 with s0 = <block argument> of type 'tensor<4xi32>' at index: 2[0];
-// CHECK-NEXT: s0 with s0 = <block argument> of type 'tensor<4xi32>' at index: 2[2];
-// CHECK-NEXT: 64
-// CHECK-NEXT: 8
   %17 = shape.num_elements %16 : tensor<4xindex> -> index
   %18 = mhlo.cstr_reshapable %17, %15 : index, tensor<2xi32>
   %19 = shape.assuming %18 -> (tensor<?x512xf32>) {
diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc
index 62225a979deb8f..ab4624937e9b69 100644
--- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc
+++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc
@@ -890,6 +890,35 @@ LogicalResult Verify(GatherOp op) {
   return mlir::success();
 }
 
+//===----------------------------------------------------------------------===//
+// BroadcastToOp
+//===----------------------------------------------------------------------===//
+
+// Canonicalizes BroadcastToOp to ReshapeOp if the input and output has the same
+// number of elements.
+struct ConvertBroadcastToReshape : public OpRewritePattern<BroadcastToOp> {
+  using OpRewritePattern<BroadcastToOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(BroadcastToOp op,
+                                PatternRewriter &rewriter) const override {
+    auto input_type = op.input().getType().cast<ShapedType>();
+    auto output_type = op.getType().cast<ShapedType>();
+    if (!input_type.hasStaticShape() || !output_type.hasStaticShape() ||
+        input_type.getNumElements() != output_type.getNumElements()) {
+      return failure();
+    }
+
+    rewriter.replaceOpWithNewOp<ReshapeOp>(op, op.getType(), op.input(),
+                                           op.shape());
+    return success();
+  }
+};
+
+void BroadcastToOp::getCanonicalizationPatterns(
+    OwningRewritePatternList &results, MLIRContext *context) {
+  results.insert<ConvertBroadcastToReshape>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // FullyConnectedOp
 //===----------------------------------------------------------------------===//
diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
index ace8ae0ffedb3b..e9ce7a11c6062f 100644
--- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
+++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td
@@ -140,7 +140,10 @@ class TFL_AnyTypeOf<list<Type> allowedRuntimeTypes, string description = "",
 
 class TFL_TensorOf<list<Type> allowedRuntimeTypes,
                    list<Type> allowedOpTypes = [AnyType]> :
-  TensorOf<allowedOpTypes>, TFL_RuntimeType<TensorOf<allowedRuntimeTypes>>;
+  TensorOf<allowedOpTypes>, TFL_RuntimeType<TensorOf<allowedRuntimeTypes>> {
+  // Set the summary equal to that representing the runtime types.
+  let summary = TensorOf<allowedRuntimeTypes>.summary;
+}
 
 class TFL_TensorOfOrNone<list<Type> allowedRuntimeTypes, string description = "",
                          list<Type> allowedOpTypes = [AnyType]> :
@@ -929,7 +932,7 @@ def TFL_ExternalConstOp : Op<TFL_Dialect, "external_const",
 def TFL_Conv2DOp : TFL_ConvOp<"conv_2d", "Convolution", 0,
       [DeclareOpInterfaceMethods<InferTypeOpInterface>,
        DeclareOpInterfaceMethods<TFL_ArithmeticCount>,
-       DynamicRangeQuantizableOp]> {
+       DynamicRangeQuantizedOpInterface]> {
   let hasCanonicalizer = 1;
 
   let extraClassDeclaration = [{
@@ -943,6 +946,10 @@ def TFL_Conv2DOp : TFL_ConvOp<"conv_2d", "Convolution", 0,
 
     // Returns whether the return types are compatible.
     static bool isCompatibleReturnTypes(TypeRange l, TypeRange r);
+
+    // DynamicRangeQuantizedOpInterface:
+    bool GetDynamicRangeQuantKernelSupport() { return true; }
+    std::vector<int> GetQuantizableOperandIndices() { return {1}; }
   }];
 }
 
@@ -991,7 +998,7 @@ def TFL_CumsumOp: TFL_Op<"cumsum", [
 def TFL_DepthwiseConv2DOp :
     TFL_ConvOp<"depthwise_conv_2d", "Depthwise-separable convolution", 3,
                 [DeclareOpInterfaceMethods<TFL_ArithmeticCount>,
-                DynamicRangeQuantizableOp]> {
+                DynamicRangeQuantizedOpInterface]> {
   let arguments = (
     ins TFL_TensorOf<[F32, QI8, QUI8, QI16]>:$input,
     TFL_TensorOf<[F32, QI8, QUI8]>:$filter,
@@ -1015,6 +1022,9 @@ def TFL_DepthwiseConv2DOp :
     std::vector<int> GetSparseOperands() { return {1}; }
     std::vector<std::vector<int>> GetFloatBlockSize() { return {}; }
     std::vector<std::vector<int>> GetQuantizedBlockSize() { return {}; }
+    // DynamicRangeQuantizedOpInterface:
+    bool GetDynamicRangeQuantKernelSupport() { return true; }
+    std::vector<int> GetQuantizableOperandIndices() { return {1}; }
   }];
 }
 
@@ -1034,7 +1044,7 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [
     AffineOpCoefficient<-1, 1>,
     TFL_SparseOp,
     DeclareOpInterfaceMethods<TFL_ArithmeticCount>,
-    DynamicRangeQuantizableOp]> {
+    DynamicRangeQuantizedOpInterface]> {
   let summary = "Fully connected op";
 
   let arguments = (ins
@@ -1071,6 +1081,10 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [
     std::vector<int> GetSparseOperands() { return {1}; }
     std::vector<std::vector<int>> GetFloatBlockSize() { return {{1, 4}}; }
     std::vector<std::vector<int>> GetQuantizedBlockSize() { return {{1, 16}}; }
+    // DynamicRangeQuantizedOpInterface:
+    bool RequireAsymmetricQuantizeInputsAttr() { return true; }
+    bool GetDynamicRangeQuantKernelSupport() { return true; }
+    std::vector<int> GetQuantizableOperandIndices() { return {1}; }
   }];
 }
 
@@ -1083,7 +1097,7 @@ def TFL_BatchMatMulOp : TFL_Op<"batch_matmul", [
            And<[CPred<"getElementTypeOrSelf($_op.getOperand(0)).isInteger(8)">,
                 CPred<"getElementTypeOrSelf($_op.getOperand(1)).isInteger(8)">,
                 CPred<"getElementTypeOrSelf($_op.getResult(0)).isInteger(32)">]>]>>,
-   DynamicRangeQuantizableOp]> {
+   DynamicRangeQuantizedOpInterface]> {
 
   let summary = "Batch Matrix Multiply Operator";
 
@@ -1114,6 +1128,13 @@ in the batch dimensions and broadcasting.
   );
 
   let hasOptions = 1;
+
+  let extraClassDeclaration = [{
+    // DynamicRangeQuantizedOpInterface:
+    bool RequireAsymmetricQuantizeInputsAttr() { return true; }
+    bool GetDynamicRangeQuantKernelSupport() { return true; }
+    std::vector<int> GetQuantizableOperandIndices() { return {1}; }
+  }];
 }
 
 def TFL_GatherOp : TFL_Op<"gather", [
@@ -3626,10 +3647,10 @@ def TFL_CastOp : TFL_Op<"cast", [
   }];
 
   let arguments = (ins
-    TFL_TensorOf<[F32, I1, I16, I32, UI32, I64, TFL_Quint8, UI8, Complex<F<32>>]>:$input
+    TFL_TensorOf<[F32, I1, I16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex<F<32>>]>:$input
   );
 
-  let results = (outs TFL_TensorOf<[F32, I1, I16, I32, UI32, I64, TFL_Quint8, UI8, Complex<F<32>>]>:$output);
+  let results = (outs TFL_TensorOf<[F32, I1, I16, I32, UI32, I64, TFL_Quint8, UI8, I8, Complex<F<32>>]>:$output);
 
   // TFLite's cast op does not utilize CastOptions, instead derives types
   // from the TfLiteTensors.
@@ -3971,7 +3992,8 @@ def TFL_LSTMOp :
            TFL_OperandHasRank<15, 1>,          // output_gate_bias
            TFL_OperandIsNoneOrHasRank<16, 2>,  // projection_weights
            TFL_OperandIsNoneOrHasRank<17, 1>,  // projection_bias
-           TFL_StatefulOp]> {
+           TFL_StatefulOp,
+           DynamicRangeQuantizedOpInterface]> {
   let summary = "The full lstm operator";
 
   let description = [{
@@ -4075,6 +4097,12 @@ Ba et al. 'Layer Normalization'
   let extraClassDeclaration = [{
     // StatefulOpInterface:
     std::vector<int> GetStatefulOperands() { return {18, 19}; }
+    // DynamicRangeQuantizedOpInterface:
+    bool RequireAsymmetricQuantizeInputsAttr() { return true; }
+    bool GetDynamicRangeQuantKernelSupport() { return true; }
+    std::vector<int> GetQuantizableOperandIndices() {
+      return {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16};
+    }
   }];
 }
 
@@ -4107,7 +4135,8 @@ def TFL_UnidirectionalSequenceLSTMOp :
            TFL_OperandIsNoneOrHasRank<16, 2>,  // projection_weights
            TFL_OperandIsNoneOrHasRank<17, 1>,  // projection_bias
            TFL_StatefulOp,
-           DeclareOpInterfaceMethods<InferTypeOpInterface>
+           DeclareOpInterfaceMethods<InferTypeOpInterface>,
+           DynamicRangeQuantizedOpInterface
           ]> {
   let summary = "Unidirectional sequence lstm operator";
 
@@ -4198,6 +4227,13 @@ def TFL_UnidirectionalSequenceLSTMOp :
 
     // Compatiable return types check
     static bool isCompatibleReturnTypes(TypeRange l, TypeRange r);
+
+    // DynamicRangeQuantizedOpInterface:
+    bool RequireAsymmetricQuantizeInputsAttr() { return true; }
+    bool GetDynamicRangeQuantKernelSupport() { return true; }
+    std::vector<int> GetQuantizableOperandIndices() {
+      return {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16};
+    }
   }];
 }
 
@@ -4273,7 +4309,8 @@ def TFL_BidirectionalSequenceLSTMOp :
            TFL_OperandHasRank<32, 1>,  // bw_output_gate_bias
            TFL_OperandHasRank<33, 2>,  // bw_projection_weights
            TFL_OperandHasRank<34, 1>,  // bw_projection_bias
-           TFL_StatefulOp]> {
+           TFL_StatefulOp,
+           DynamicRangeQuantizedOpInterface]> {
   let summary = "Bidirectional sequence lstm operator";
 
   let description = [{
@@ -4387,6 +4424,14 @@ def TFL_BidirectionalSequenceLSTMOp :
   let extraClassDeclaration = [{
     // StatefulOpInterface:
     std::vector<int> GetStatefulOperands() { return {35, 36, 37, 38}; }
+
+    // DynamicRangeQuantizedOpInterface:
+    bool RequireAsymmetricQuantizeInputsAttr() { return true; }
+    bool GetDynamicRangeQuantKernelSupport() { return true; }
+    std::vector<int> GetQuantizableOperandIndices() {
+      return {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 16, 18, 19, 20, 21,
+              22, 23, 24, 25, 26, 27, 28, 33, 40, 41, 42, 43, 44, 45, 46, 47};
+    }
   }];
 }
 
@@ -4397,7 +4442,8 @@ def TFL_UnidirectionalSequenceRNNOp : TFL_Op<"unidirectional_sequence_rnn", [
       TFL_TCresVTEtIsSameAsOp<0, 0>>,
     PredOpTrait<"input and constant value operands must have same element type",
       TFL_TCopVTEtAreSameAt<1, 2>>,
-    TFL_StatefulOp]> {
+    TFL_StatefulOp,
+    DynamicRangeQuantizedOpInterface]> {
   let summary = "Unidirectional sequence rnn operator";
 
   let description = [{
@@ -4447,6 +4493,13 @@ def TFL_UnidirectionalSequenceRNNOp : TFL_Op<"unidirectional_sequence_rnn", [
   let extraClassDeclaration = [{
     // StatefulOpInterface:
     std::vector<int> GetStatefulOperands() { return {4}; }
+
+    // DynamicRangeQuantizedOpInterface:
+    bool RequireAsymmetricQuantizeInputsAttr() { return true; }
+    bool GetDynamicRangeQuantKernelSupport() { return true; }
+    std::vector<int> GetQuantizableOperandIndices() {
+      return {1, 2};
+    }
   }];
 }
 
@@ -4504,7 +4557,8 @@ def TFL_SVDFOp :
     PredOpTrait<"the input and result tensor elemental types must be same",
       TFL_TCresVTEtIsSameAsOp<0, 0>>,
     TFL_StatefulOp,
-    AccumulatorUniformScale<3, 2, 4>]> {
+    AccumulatorUniformScale<3, 2, 4>,
+    DynamicRangeQuantizedOpInterface]> {
 
   let summary = "Single value decomposition filter operator";
 
@@ -4547,6 +4601,13 @@ def TFL_SVDFOp :
   let extraClassDeclaration = [{
     // StatefulOpInterface:
     std::vector<int> GetStatefulOperands() { return {4}; }
+
+    // DynamicRangeQuantizedOpInterface:
+    bool RequireAsymmetricQuantizeInputsAttr() { return true; }
+    bool GetDynamicRangeQuantKernelSupport() { return true; }
+    std::vector<int> GetQuantizableOperandIndices() {
+      return {1, 2};
+    }
   }];
 }
 
@@ -4811,6 +4872,8 @@ subsequent operation and then be optimized away, however.)
   let results = (outs
     TFL_TensorOf<[F32, I32, I1, I8, QI8, UI8, QUI8, I16, QI16, I64, Complex<F<32>>]>:$output
   );
+
+  let hasCanonicalizer = 1;
 }
 
 def TFL_RFFT2dOp : TFL_Op<"rfft2d", [NoSideEffect, NoQuantizableResult]> {
diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization.td b/tensorflow/compiler/mlir/lite/quantization/quantization.td
index 09c2008f8bc455..1cdb3af10373c4 100644
--- a/tensorflow/compiler/mlir/lite/quantization/quantization.td
+++ b/tensorflow/compiler/mlir/lite/quantization/quantization.td
@@ -127,6 +127,63 @@ def SameOperandsAndResultsScale : OpInterface<"SameScalesOpInterface"> {
   }];
 }
 
+def DynamicRangeQuantizedOpInterface : OpInterface<
+  "DynamicRangeQuantizedOpInterface"> {
+  let description = [{
+    Interface for ops dynamic range quantization is supported.
+
+    If the op has the kernel support for dynamic range quantization, Q/DQ op
+    pairs connected to the op are rewritten by its quantized alternatives where
+    a new op uses Q ops for its operands instead of DQ op. Otherwise, it is
+    left as is for weight-only which means the weight is dequantized at runtime.
+
+    For example, if the kernel does not support dynamic range quantization the
+    graph will be converted into the following IR:
+
+    %q_w = "tfl.pseudo_qconst"() {
+         qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>
+    %w = "tfl.dequantize"(%q_w) :
+         (tensor<tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32>) ->
+         tensor<64x3x3x3xf32>
+    %conv = "tfl.conv_2d"(%input_act, %w, %bias)
+
+    but if it is supported it will be rewritten as:
+
+    %q_w = "tfl.pseudo_qconst"() {
+         qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>
+    %conv = "tfl.conv_2d"(%input_act, %q_w, %bias)
+
+    Note that this is part of reaching feature parity with the old quantizer for
+    dynamic range quantization except:
+    -  Only use_updated_hybrid_scheme=True is supported which means the ops with
+    the asymmetrically quantizing input support is enabled to use this feature
+    during MLIR graph rewriting passes while it is configurable in the old
+    quantizer. So when those ops are matched during graph rewriting passes,
+    MLIR quantizer will always ignore the pre-set value of the attribute, if
+    there's any, and set it to True. The reason behind this decision is that
+    generally activations of these ops show better accuracy with asymmetric
+    input quantization so we want to deprecate symmetric activation quantization
+    for those ops eventually.
+  }];
+
+  let methods = [
+    InterfaceMethod<
+      [{Returns the quantizable operand indices of the op.}],
+      "std::vector<int>", "GetQuantizableOperandIndices",
+      (ins), [{}], [{return {};}]>,
+    InterfaceMethod<
+      [{Returns whether the op has the kernel support for dynamic range
+      quantization.}],
+      "bool", "GetDynamicRangeQuantKernelSupport",
+      (ins), [{}], [{return false;}]>,
+    InterfaceMethod<
+      [{Returns whether the op requires asymmetric quantize input attribute
+      setting.}],
+      "bool", "RequireAsymmetricQuantizeInputsAttr",
+      (ins), [{}], [{return false;}]>,
+  ];
+}
+
 // Specify this trait if the op has a fixed output value range.
 class FixedResultScale<QuantizedType qt> : NativeOpTrait<!strconcat(
   "quant::FixedResult", qt.name, "Scale<", qt.asTraitArgsStr, ">::Impl")>;
@@ -149,7 +206,4 @@ class AffineOpCoefficient<int dim, int index> : NativeOpTrait<
 // Specify this trait if the op doesn't have quantizable output. We shouldn't
 // apply quantization on this op.
 def NoQuantizableResult : NativeOpTrait<"quant::NoQuantizableResult">;
-
-// Specify this trait if the op supports dynamic range quantization.
-def DynamicRangeQuantizableOp : NativeOpTrait<"quant::DynamicRangeQuantizableOp">;
 #endif // TF_Quantization
diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc b/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc
index e3d9ace0063b24..9e06a7492b2bbd 100644
--- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc
+++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include <string>
+#include <utility>
 
 #include "flatbuffers/flexbuffers.h"  // from @flatbuffers
 #include "mlir/Pass/Pass.h"  // from @llvm-project
@@ -31,92 +32,97 @@ constexpr char kFlexOpNamePrefix[] = "Flex";
 constexpr char kDefaultMode[] = "DEFAULT";
 constexpr char kLegacyIntegerMode[] = "LEGACY_INTEGER";
 
-// Checks if the operation is allowlisted in the Legacy Integer mode.
-bool IsAllowListedOpInLegacyMode(Operation *op) {
-  if (llvm::isa<
-          // clang-format off
-          // go/keep-sorted start
-          TF::AbsOp,
-          TF::AddOp,
-          TF::AddV2Op,
-          TF::ArgMaxOp,
-          TF::AvgPoolOp,
-          TF::BiasAddOp,
-          TF::BucketizeOp,
-          TF::ConcatV2Op,
-          TF::ConstOp,
-          TF::Conv2DBackpropInputOp,
-          TF::Conv2DOp,
-          TF::DepthwiseConv2dNativeOp,
-          TF::FakeQuantWithMinMaxVarsOp,
-          TF::FakeQuantWithMinMaxVarsPerChannelOp,
-          TF::GatherV2Op,
-          TF::IdentityOp,
-          TF::MatMulOp,
-          TF::MaxPoolOp,
-          TF::MaximumOp,
-          TF::MeanOp,
-          TF::MinimumOp,
-          TF::MulOp,
-          TF::PadOp,
-          TF::PadV2Op,
-          TF::PartitionedCallOp,
-          TF::Relu6Op,
-          TF::ReluOp,
-          TF::ReshapeOp,
-          TF::SoftmaxOp,
-          TF::StatefulPartitionedCallOp,
-          TF::SubOp,
-          TF::TransposeOp
-          // go/keep-sorted end
-          // clang-format on
-          >(op)) {
-    return true;
-  }
-  return false;
+// Checks if the operation is TF FakeQuant ops.
+bool IsTfFakeQuantOp(Operation *op) {
+  return llvm::isa<
+      // clang-format off
+      TF::FakeQuantWithMinMaxArgsOp,
+      TF::FakeQuantWithMinMaxVarsOp,
+      TF::FakeQuantWithMinMaxVarsPerChannelOp
+      // clang-format on
+      >(op);
 }
 
-// Checks if the operation is allowlisted in the Default mode.
-bool IsAllowListedOpInDefaultMode(Operation *op) {
-  if (llvm::isa<
-          // clang-format off
-          // go/keep-sorted start
-          TF::BiasAddOp,
-          TF::ConstOp,
-          TF::Conv2DBackpropInputOp,
-          TF::Conv2DOp,
-          TF::DepthwiseConv2dNativeOp,
-          TF::FakeQuantWithMinMaxVarsOp,
-          TF::FakeQuantWithMinMaxVarsPerChannelOp,
-          TF::IdentityOp,
-          TF::MatMulOp,
-          TF::PartitionedCallOp,
-          TF::Relu6Op,
-          TF::ReluOp,
-          TF::StatefulPartitionedCallOp
-          // go/keep-sorted end
-          // clang-format on
-          >(op)) {
-    return true;
-  }
-  return false;
+// Checks if the operation is allowlisted in both modes. These ops are not
+// quantizable but is necessary to make the conversion successful.
+bool IsAlwaysAllowlistedOp(Operation *op) {
+  return llvm::isa<
+      // clang-format off
+      // go/keep-sorted start
+      TF::ConstOp,
+      TF::IdentityOp,
+      TF::PartitionedCallOp,
+      TF::StatefulPartitionedCallOp
+      // go/keep-sorted end
+      // clang-format on
+      >(op);
+}
+
+// Checks if the operation is quantizable in the Legacy Integer mode.
+bool IsQuantizableOpInLegacyMode(Operation *op) {
+  return llvm::isa<
+      // clang-format off
+      // go/keep-sorted start
+      TF::AbsOp,
+      TF::AddOp,
+      TF::AddV2Op,
+      TF::ArgMaxOp,
+      TF::AvgPoolOp,
+      TF::BiasAddOp,
+      TF::BucketizeOp,
+      TF::ConcatV2Op,
+      TF::Conv2DBackpropInputOp,
+      TF::Conv2DOp,
+      TF::DepthwiseConv2dNativeOp,
+      TF::GatherV2Op,
+      TF::MatMulOp,
+      TF::MaxPoolOp,
+      TF::MaximumOp,
+      TF::MeanOp,
+      TF::MinimumOp,
+      TF::MulOp,
+      TF::PadOp,
+      TF::PadV2Op,
+      TF::Relu6Op,
+      TF::ReluOp,
+      TF::ReshapeOp,
+      TF::SoftmaxOp,
+      TF::SubOp,
+      TF::TransposeOp
+      // go/keep-sorted end
+      // clang-format on
+      >(op);
+}
+
+// Checks if the operation is quantizable in the Default mode.
+bool IsQuantizableOpInDefaultMode(Operation *op) {
+  return llvm::isa<
+      // clang-format off
+      // go/keep-sorted start
+      TF::BiasAddOp,
+      TF::Conv2DBackpropInputOp,
+      TF::Conv2DOp,
+      TF::DepthwiseConv2dNativeOp,
+      TF::MatMulOp,
+      TF::Relu6Op,
+      TF::ReluOp
+      // go/keep-sorted end
+      // clang-format on
+      >(op);
 }
 
 // Checks if the operation can be fused with bias.
 inline bool IsFusibleWithBiasOp(Operation *op) {
-  if (llvm::isa<
-          // clang-format off
-          TF::MatMulOp,
-          TF::Conv2DOp,
-          TF::DepthwiseConv2dNativeOp,
-          TF::Conv2DBackpropInputOp,
-          TF::Conv3DOp,
-          TF::Conv3DBackpropInputV2Op
-          // clang-format on
-          >(op)) {
-    return true;
-  }
-  return false;
+  return llvm::isa<
+      // clang-format off
+      TF::MatMulOp,
+      TF::Conv2DOp,
+      TF::DepthwiseConv2dNativeOp,
+      TF::Conv2DBackpropInputOp,
+      TF::Conv3DOp,
+      TF::Conv3DBackpropInputV2Op
+      // clang-format on
+      >(op);
 }
 
 // If the Add op can be fused as bias, converts it to BiasAdd op.
@@ -213,10 +219,12 @@ class FallbackToFlexOps : public PassWrapper<FallbackToFlexOps, FunctionPass> {
 
   // Checks if the operation is allowlisted in the current mode.
   bool IsAllowListedOp(Operation *op) {
-    if (mode_ == kDefaultMode) {
-      return IsAllowListedOpInDefaultMode(op);
+    if (IsAlwaysAllowlistedOp(op) || IsTfFakeQuantOp(op)) {
+      return true;
+    } else if (mode_ == kDefaultMode) {
+      return IsQuantizableOpInDefaultMode(op);
     } else if (mode_ == kLegacyIntegerMode) {
-      return IsAllowListedOpInLegacyMode(op);
+      return IsQuantizableOpInLegacyMode(op);
     } else {
       mlir::emitError(getFunction().getLoc(), "Unregconized mode: " + mode_);
       signalPassFailure();
diff --git a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir
index 854915eec97815..1ea1104187db44 100644
--- a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir
@@ -289,3 +289,15 @@ func @keepCustomFlexOps(%arg0: tensor<1x10xf32>) -> tensor<1x10xf32> {
   // CHECK-NEXT: %3 = "tfl.custom"(%0) {custom_code = "FlexReadVariableOp"
   return %3 : tensor<1x10xf32>
 }
+
+// -----
+
+// Converts tfl.broadcast_to to tfl.reshape if input and output have the same
+// number of elements.
+// CHECK-LABEL: broadcast_to_to_reshape
+func @broadcast_to_to_reshape(%arg0: tensor<4x4x4xf32>, %arg1 : tensor<4xi32>) -> tensor<1x4x4x4xf32> {
+  %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<4x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x4xf32>
+  // CHECK: "tfl.reshape"
+  // CHECK-SAME: (tensor<4x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x4xf32>
+  return %0 : tensor<1x4x4x4xf32>
+}
diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/conv_2d_nchw.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/conv_2d_nchw.pbtxt
index 5f498a404a953a..272ed57ca24e82 100644
--- a/tensorflow/compiler/mlir/lite/tests/end2end/conv_2d_nchw.pbtxt
+++ b/tensorflow/compiler/mlir/lite/tests/end2end/conv_2d_nchw.pbtxt
@@ -1,4 +1,4 @@
-# RUN: tf_tfl_translate -tf-input-arrays=input -tf-input-shapes=1,8,8,2 -tf-input-data-types=DT_FLOAT -tf-output-arrays=output_0 -print-function-result-mapping %s -o - 2>&1 | FileCheck %s
+# RUN: tf_tfl_translate -tf-input-arrays=input -tf-input-shapes=1,2,8,2 -tf-input-data-types=DT_FLOAT -tf-output-arrays=output_0 -print-function-result-mapping %s -o - 2>&1 | FileCheck %s
 
 node {
   name: "input"
@@ -17,7 +17,7 @@ node {
           size: 1
         }
         dim {
-          size: 8
+          size: 2
         }
         dim {
           size: 8
diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir
index 8ba0323066d89a..83ffeafb6f254d 100644
--- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir
@@ -13,14 +13,18 @@ func @QuantizeConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x64xf32>
 // CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,
 // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]])
 // CHECK: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32>
-// CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]])
+// CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) {
+// CHECK-NOT: asymmetric_quantize_inputs = true
+// CHECK-SAME: dilation_h_factor = 1 : i32
 // CHECK: return %[[conv:.*]]
 
 // PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32>
 // PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>}
 // PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32>
 // PerTensor: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32>
-// PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]])
+// PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w]], %[[b]]) {
+// PerTensor-NOT: asymmetric_quantize_inputs = true
+// PerTensor-SAME: dilation_h_factor = 1 : i32
 // PerTensor: return %[[conv:.*]]
 }
 
@@ -36,14 +40,18 @@ func @QuantizeDepthwiseConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x11
 // CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00}
 // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]])
 // CHECK: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<64xf32>
-// CHECK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[b]])
+// CHECK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[b]]) {
+// CHECK-NOT: asymmetric_quantize_inputs = true
+// CHECK-SAME: depth_multiplier = 4 : i32
 // CHECK: return %[[dconv:.*]]
 
 // PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32>
 // PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>}
 // PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>) -> tensor<64x3x3x3xf32>
 // PerTensor: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<64xf32>
-// PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[b]])
+// PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w]], %[[b]]) {
+// PerTensor-NOT: asymmetric_quantize_inputs = true
+// PerTensor-SAME: depth_multiplier = 4 : i32
 // PerTensor: return %[[dconv:.*]]
 }
 
@@ -59,17 +67,72 @@ func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112
 // CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<512x12x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>}
 // CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<512x12x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>) -> tensor<512x12xf32>
 // CHECK: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32>
-// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w]], %[[b]])
+// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w]], %[[b]]) {
+// CHECK-NOT: fused_activation_function = "NONE"
+// CHECK-SAME: asymmetric_quantize_inputs = true
 // CHECK: return %[[fc:.*]]
 
 // PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<512x12xf32>
 // PerTensor: %[[q_w:.*]]= "tfl.quantize"(%[[w:.*]]) {qtype = tensor<512x12x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>}
 // PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w:.*]]) : (tensor<512x12x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>) -> tensor<512x12xf32>
 // PerTensor: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32>
-// PerTensor: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w:.*]], %[[b:.*]])
+// PerTensor: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w:.*]], %[[b:.*]]) {
+// PerTensor-NOT: fused_activation_function = "NONE"
+// PerTensor-SAME: asymmetric_quantize_inputs = true
 // PerTensor: return %[[fc:.*]]
 }
 
+// CHECK-LABEL: QuantizeBatchMatmulWithActConst
+// PerTensor-LABEL: QuantizeBatchMatmulWithActConst
+func @QuantizeBatchMatmulWithActConst(%arg0: tensor<1x3x3x512xf32>) -> tensor<1x3x3x12xf32> {
+  %w = arith.constant dense<127.0> : tensor<512x12xf32>
+  %mm = "tfl.batch_matmul"(%arg0, %w) {adj_x = false, adj_y = false} : (tensor<1x3x3x512xf32>, tensor<512x12xf32>) -> tensor<1x3x3x12xf32>
+  return %mm : tensor<1x3x3x12xf32>
+
+// CHECK: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<512x12xf32>
+// CHECK: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<512x12x!quant.uniform<i8:f32, 0.49803921568627452:-128>>}
+// CHECK: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<512x12x!quant.uniform<i8:f32, 0.49803921568627452:-128>>) -> tensor<512x12xf32>
+// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[dq_w]]) {adj_x = false, adj_y = false
+// CHECK-SAME: , asymmetric_quantize_inputs = true
+// CHECK: return %[[mm:.*]]
+
+// PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<512x12xf32>
+// PerTensor: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<512x12x!quant.uniform<i8:f32, 0.49803921568627452:-128>>}
+// PerTensor: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<512x12x!quant.uniform<i8:f32, 0.49803921568627452:-128>>) -> tensor<512x12xf32>
+// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[dq_w]]) {adj_x = false, adj_y = false
+// PerTensor-SAME: , asymmetric_quantize_inputs = true
+// PerTensor: return %[[mm:.*]]
+}
+
+// CHECK-LABEL: NotQuantizeBatchMatmulWithConstAct
+// PerTensor-LABEL: NotQuantizeBatchMatmulWithConstAct
+func @NotQuantizeBatchMatmulWithConstAct(%arg0: tensor<1x1x3x512xf32>) -> tensor<1x1x12x3xf32> {
+  %w = arith.constant dense<127.0> : tensor<1x1x12x512xf32>
+  %mm = "tfl.batch_matmul"(%w, %arg0) {adj_x = false, adj_y = true} : (tensor<1x1x12x512xf32>, tensor<1x1x3x512xf32>) -> tensor<1x1x12x3xf32>
+  return %mm : tensor<1x1x12x3xf32>
+
+// CHECK: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x1x12x512xf32>
+// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%[[w]], %arg0) {adj_x = false, adj_y = true}
+// CHECK: return %[[mm:.*]]
+
+// PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x1x12x512xf32>
+// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%[[w]], %arg0) {adj_x = false, adj_y = true}
+// PerTensor: return %[[mm:.*]]
+}
+
+// CHECK-LABEL: NotQuantizeBatchMatmulWithActAct
+// PerTensor-LABEL: NotQuantizeBatchMatmulWithActAct
+func @NotQuantizeBatchMatmulWithActAct(%arg0: tensor<1x3x3x512xf32>) -> tensor<1x3x3x3xf32> {
+  %mm = "tfl.batch_matmul"(%arg0, %arg0) {adj_x = false, adj_y = true} : (tensor<1x3x3x512xf32>, tensor<1x3x3x512xf32>) -> tensor<1x3x3x3xf32>
+  return %mm : tensor<1x3x3x3xf32>
+
+// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %arg0) {adj_x = false, adj_y = true}
+// CHECK: return %[[mm:.*]]
+
+// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %arg0) {adj_x = false, adj_y = true}
+// PerTensor: return %[[mm:.*]]
+}
+
 // CHECK-LABEL: NotQuantizeTransposeConv
 // PerTensor-LABEL: NotQuantizeTransposeConv
 func @NotQuantizeTransposeConv(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<4xi32>) -> tensor<1x32x42x128xf32> {
@@ -80,12 +143,16 @@ func @NotQuantizeTransposeConv(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<4xi3
 
 // CHECK: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x32x42x128xf32>
 // CHECK: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<1x32x42x128xf32>
-// CHECK: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[w]], %arg0, %[[b]])
+// CHECK: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[w]], %arg0, %[[b]]) {
+// CHECK-NOT: asymmetric_quantize_inputs = true
+// CHECK-SAME: padding = "SAME"
 // CHECK: return %[[tconv:.*]]
 
 // PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x32x42x128xf32>
 // PerTensor: %[[b:.*]]= arith.constant dense<0.000000e+00> : tensor<1x32x42x128xf32>
-// PerTensor: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[w:.*]], %arg0, %[[b:.*]])
+// PerTensor: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[w:.*]], %arg0, %[[b:.*]]) {
+// PerTensor-NOT: asymmetric_quantize_inputs = true
+// PerTensor-SAME: padding = "SAME"
 // PerTensor: return %[[tconv:.*]]
 }
 
@@ -107,7 +174,9 @@ func @QuantizeMultiUses(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x122x
 // CHECK: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32>
 // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w2]], %[[b]])
 // CHECK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w1]], %[[b]])
-// CHECK: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]])
+// CHECK: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) {adj_x = false, adj_y = true
+// CHECK-NOT: , asymmetric_quantize_inputs = true
+// CHECK-SAME: }
 // CHECK: return %[[bmm:.*]]
 
 // PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<64x3x3x3xf32>
@@ -118,6 +187,8 @@ func @QuantizeMultiUses(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x122x
 // PerTensor: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32>
 // PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[dq_w2]], %[[b]])
 // PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[dq_w1]], %[[b]])
-// PerTensor: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]])
+// PerTensor: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) {adj_x = false, adj_y = true
+// PerTensor-NOT: , asymmetric_quantize_inputs = true
+// PerTensor-SAME: }
 // PerTensor: return %[[bmm:.*]]
 }
diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir
index 0c2d00b3f3d6ac..47edcea00b0d2b 100644
--- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir
@@ -686,7 +686,7 @@ func @QuantDequantTranspose(%arg0: tensor<2x3xf32>) -> (tensor<2x4xf32>) {
 
   // CHECK-LABEL: QuantDequantTranspose
   // CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<?xi32>
-  // CHECK: %[[CST_0:.*]] = arith.constant dense<1.00392151> : tensor<3x4xf32>
+  // CHECK: %[[CST_0:.*]] = arith.constant dense<1.00392163> : tensor<3x4xf32>
   // CHECK: %[[QUANT:.*]] = "tfl.quantize"(%[[CST_0]]) {qtype = tensor<3x4x!quant.uniform<u8:f32:1, {0.0078431372549019607:128,0.0078431372549019607:128,0.0078431372549019607:128,0.0078431372549019607:128}>>} : (tensor<3x4xf32>) -> tensor<3x4x!quant.uniform<u8:f32:1, {0.0078431372549019607:128,0.0078431372549019607:128,0.0078431372549019607:128,0.0078431372549019607:128}>>
   // CHECK: %[[DEQUANT:.*]] = "tfl.dequantize"(%[[QUANT]]) : (tensor<3x4x!quant.uniform<u8:f32:1, {0.0078431372549019607:128,0.0078431372549019607:128,0.0078431372549019607:128,0.0078431372549019607:128}>>) -> tensor<3x4xf32>
   // CHECK: %[[TRANSPOSE:.*]] = "tf.Transpose"(%[[DEQUANT]], %[[CST]]) : (tensor<3x4xf32>, tensor<?xi32>) -> tensor<*xf32>
@@ -694,4 +694,58 @@ func @QuantDequantTranspose(%arg0: tensor<2x3xf32>) -> (tensor<2x4xf32>) {
   // CHECK: return %[[MATMUL]] : tensor<2x4xf32>
 }
 
+func @UnsupportedGroupConv(%arg0: tensor<?x128x24xf32>) -> (tensor<?x6x14xf32>) {
+  %cst = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x2x14xf32>} : () -> tensor<3x2x14xf32>
+  %cst_0 = "tf.Const"() {value = dense<0.000000e+00> : tensor<14xf32>} : () -> tensor<14xf32>
+  %cst_1 = "tf.Const"() {value = dense<0.000000e+00> : tensor<3x24x14xf32>} : () -> tensor<3x24x14xf32>
+  %cst_2 = "tf.Const"() {value = dense<0.000000e+00> : tensor<14xf32>} : () -> tensor<14xf32>
+  %cst_3 = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
+  %cst_4 = "tf.Const"() {value = dense<-3> : tensor<i32>} : () -> tensor<i32>
+  %0 = "tf.ExpandDims"(%arg0, %cst_4) {device = ""} : (tensor<?x128x24xf32>, tensor<i32>) -> tensor<?x1x128x24xf32>
+  %1 = "tf.ExpandDims"(%cst, %cst_3) {device = ""} : (tensor<3x2x14xf32>, tensor<i32>) -> tensor<1x3x2x14xf32>
+  %2 = "tf.ExpandDims"(%cst_1, %cst_3) {device = ""} : (tensor<3x24x14xf32>, tensor<i32>) -> tensor<1x3x24x14xf32>
+  %3 = "tf.Conv2D"(%0, %2) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 5, 1], use_cudnn_on_gpu = true} : (tensor<?x1x128x24xf32>, tensor<1x3x24x14xf32>) -> tensor<?x1x26x14xf32>
+  %4 = "tf.Squeeze"(%3) {device = "", squeeze_dims = [-3]} : (tensor<?x1x26x14xf32>) -> tensor<?x26x14xf32>
+  %5 = "tf.BiasAdd"(%4, %cst_2) {data_format = "NHWC", device = ""} : (tensor<?x26x14xf32>, tensor<14xf32>) -> tensor<?x26x14xf32>
+  %6 = "tf.ExpandDims"(%5, %cst_4) {device = ""} : (tensor<?x26x14xf32>, tensor<i32>) -> tensor<?x1x26x14xf32>
+  %7 = "tf.Conv2D"(%6, %1) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 5, 1], use_cudnn_on_gpu = true} : (tensor<?x1x26x14xf32>, tensor<1x3x2x14xf32>) -> tensor<?x1x6x14xf32>
+  %8 = "tf.Squeeze"(%7) {device = "", squeeze_dims = [-3]} : (tensor<?x1x6x14xf32>) -> tensor<?x6x14xf32>
+  %9 = "tf.BiasAdd"(%8, %cst_0) {data_format = "NHWC", device = ""} : (tensor<?x6x14xf32>, tensor<14xf32>) -> tensor<?x6x14xf32>
+  %10 = "tf.Identity"(%9) {device = ""} : (tensor<?x6x14xf32>) -> tensor<?x6x14xf32>
+  %11 = "tf.Identity"(%10) {device = ""} : (tensor<?x6x14xf32>) -> tensor<?x6x14xf32>
+  return %11 : tensor<?x6x14xf32>
+
+  // CHECK-LABEL: UnsupportedGroupConv
+  // CHECK: "tfl.conv_2d"
+  // CHECK-NOT: "tfl.conv_2d"
+  // CHECK: "tf.Conv2D"
+}
+
+func @UnsupportedGroupConv_UnrankedTensorType(%arg0: tensor<*xf32>, %arg1: tensor<1x3x2x14xf32>) -> (tensor<?x1x6x14xf32>) {
+  %0 = "tf.Conv2D"(%arg0, %arg1) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 5, 1], use_cudnn_on_gpu = true} : (tensor<*xf32>, tensor<1x3x2x14xf32>) -> tensor<?x1x6x14xf32>
+  return %0 : tensor<?x1x6x14xf32>
+
+  // CHECK-LABEL: UnsupportedGroupConv_UnrankedTensorType
+  // CHECK-NOT: "tfl.conv_2d"
+  // CHECK: "tf.Conv2D"
+}
+
+func @UnsupportedGroupConv_DynamicDimAtInputDimThree(%arg0: tensor<?x1x26x?xf32>, %arg1: tensor<1x3x2x14xf32>) -> (tensor<?x1x6x14xf32>) {
+  %0 = "tf.Conv2D"(%arg0, %arg1) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 5, 1], use_cudnn_on_gpu = true} : (tensor<?x1x26x?xf32>, tensor<1x3x2x14xf32>) -> tensor<?x1x6x14xf32>
+  return %0 : tensor<?x1x6x14xf32>
+
+  // CHECK-LABEL: UnsupportedGroupConv_DynamicDimAtInputDimThree
+  // CHECK-NOT: "tfl.conv_2d"
+  // CHECK: "tf.Conv2D"
+}
+
+func @UnsupportedGroupConv_MultipleGroup(%arg0: tensor<?x1x26x14xf32>, %arg1: tensor<1x3x2x14xf32>) -> (tensor<?x1x6x14xf32>) {
+  %0 = "tf.Conv2D"(%arg0, %arg1) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 5, 1], use_cudnn_on_gpu = true} : (tensor<?x1x26x14xf32>, tensor<1x3x2x14xf32>) -> tensor<?x1x6x14xf32>
+  return %0 : tensor<?x1x6x14xf32>
+
+  // CHECK-LABEL: UnsupportedGroupConv_MultipleGroup
+  // CHECK-NOT: "tfl.conv_2d"
+  // CHECK: "tf.Conv2D"
+}
+
 }
diff --git a/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir b/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir
index b22689b1c34eda..d90ef3a614faff 100644
--- a/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir
+++ b/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir
@@ -11,12 +11,16 @@ func @QuantizeConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x64xf32>
 
 // CHECK: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32>
 // CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32:0, {
-// CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]])
+// CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) {
+// CHECK-NOT: asymmetric_quantize_inputs = true
+// CHECK-SAME: dilation_h_factor = 1 : i32
 // CHECK: return %[[conv:.*]]
 
 // PerTensor: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32>
 // PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>
-// PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]])
+// PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w]], %[[b]]) {
+// PerTensor-NOT: asymmetric_quantize_inputs = true
+// PerTensor-SAME: dilation_h_factor = 1 : i32
 // PerTensor: return %[[conv:.*]]
 }
 
@@ -30,12 +34,16 @@ func @QuantizeDepthwiseConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x11
 
 // CHECK: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<64xf32>
 // CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32:3, {1.000000e+00,1.000000e+00,1.000000e+00}
-// CHECK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]])
+// CHECK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]]) {
+// CHECK-NOT: asymmetric_quantize_inputs = true
+// CHECK-SAME: depth_multiplier = 4 : i32
 // CHECK: return %[[dconv:.*]]
 
 // PerTensor: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<64xf32>
 // PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>
-// PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]])
+// PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w]], %[[b]]) {
+// PerTensor-NOT: asymmetric_quantize_inputs = true
+// PerTensor-SAME: depth_multiplier = 4 : i32
 // PerTensor: return %[[dconv:.*]]
 }
 
@@ -49,15 +57,37 @@ func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112
 
 // CHECK: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32>
 // CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>
-// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[w]], %[[b]])
+// CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[w]], %[[b]]) {
+// CHECK-NOT: fused_activation_function = "NONE",
+// CHECK-SAME: asymmetric_quantize_inputs = true,
 // CHECK: return %[[fc:.*]]
 
 // PerTensor: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32>
 // PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>
-// PerTensor: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[w]], %[[b]])
+// PerTensor: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[w]], %[[b]]) {
+// PerTensor-NOT: fused_activation_function = "NONE",
+// PerTensor-SAME: asymmetric_quantize_inputs = true,
 // PerTensor: return %[[fc:.*]]
 }
 
+// CHECK-LABEL: QuantizeMatmulWithActConst
+// PerTensor-LABEL: QuantizeMatmulWithActConst
+func @QuantizeMatmulWithActConst(%arg0: tensor<1x3x3x512xf32>) -> tensor<1x3x3x12xf32> {
+  %w = arith.constant dense<127.0> : tensor<512x12xf32>
+  %mm = "tfl.batch_matmul"(%arg0, %w) {adj_x = false, adj_y = false} : (tensor<1x3x3x512xf32>, tensor<512x12xf32>) -> tensor<1x3x3x12xf32>
+  return %mm : tensor<1x3x3x12xf32>
+
+// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform<i8:f32, 0.49803921568627452:-128>>,
+// CHECK: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[w]]) {adj_x = false, adj_y = false
+// CHECK-SAME: , asymmetric_quantize_inputs = true
+// CHECK: return %[[mm:.*]]
+
+// PerTensor: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform<i8:f32, 0.49803921568627452:-128>>,
+// PerTensor: %[[mm:.*]] = "tfl.batch_matmul"(%arg0, %[[w]]) {adj_x = false, adj_y = false
+// PerTensor-SAME: , asymmetric_quantize_inputs = true
+// PerTensor: return %[[mm:.*]]
+}
+
 // CHECK-LABEL: NotQuantizeTransposeConv
 // PerTensor-LABEL: NotQuantizeTransposeConv
 func @NotQuantizeTransposeConv(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<4xi32>) -> tensor<1x32x42x128xf32> {
@@ -68,12 +98,16 @@ func @NotQuantizeTransposeConv(%arg0: tensor<32x4x4x128xf32>, %arg1: tensor<4xi3
 
 // CHECK: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x32x42x128xf32>
 // CHECK: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<1x32x42x128xf32>
-// CHECK: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[w]], %arg0, %[[b]])
+// CHECK: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[w]], %arg0, %[[b]]) {
+// CHECK-NOT: asymmetric_quantize_inputs = true
+// CHECK-SAME: padding = "SAME"
 // CHECK: return %[[tconv:.*]]
 
 // PerTensor: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<1x32x42x128xf32>
 // PerTensor: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<1x32x42x128xf32>
-// PerTensor: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[w]], %arg0, %[[b]])
+// PerTensor: %[[tconv:.*]] = "tfl.transpose_conv"(%arg1, %[[w]], %arg0, %[[b]]) {
+// PerTensor-NOT: asymmetric_quantize_inputs = true
+// PerTensor-SAME: padding = "SAME"
 // PerTensor: return %[[tconv:.*]]
 }
 
@@ -92,7 +126,9 @@ func @QuantizeMultiUses(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x122x
 // CHECK: %[[w2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00
 // CHECK: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w2]], %[[b]])
 // CHECK: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w1]], %[[b]])
-// CHECK: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]])
+// CHECK: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) {adj_x = false, adj_y = true
+// CHECK-NOT: , asymmetric_quantize_inputs = true
+// CHECK-SAME: }
 // CHECK: return %[[bmm:.*]]
 
 // PerTensor: %[[b:.*]] = arith.constant dense<-1.23697901> : tensor<64xf32>
@@ -100,6 +136,8 @@ func @QuantizeMultiUses(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x122x
 // PerTensor: %[[w2:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<64x3x3x3x!quant.uniform<i8<-127:127>:f32, 1.000000e+00>>
 // PerTensor: %[[conv:.*]] = "tfl.conv_2d"(%arg0, %[[w2]], %[[b]])
 // PerTensor: %[[dconv:.*]] = "tfl.depthwise_conv_2d"(%arg0, %[[w1]], %[[b]])
-// PerTensor: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]])
+// PerTensor: %[[bmm:.*]] = "tfl.batch_matmul"(%[[conv]], %[[dconv]]) {adj_x = false, adj_y = true
+// PerTensor-NOT: , asymmetric_quantize_inputs = true
+// PerTensor-SAME: }
 // PerTensor: return %[[bmm:.*]]
 }
diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc
index dae6c10d9457d4..47118b9185c7f4 100644
--- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <algorithm>
+
 #include "llvm/Support/CommandLine.h"
 #include "mlir/IR/Dialect.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
@@ -37,6 +39,10 @@ namespace TFL {
 
 namespace {
 
+// A boolean attribute used to describe whether input activations need to be
+// asymmetrically quantized.
+constexpr char kAsymmetricQuantizeInputsAttr[] = "asymmetric_quantize_inputs";
+
 using QuantizationUnits = llvm::SetVector<std::pair<Operation*, int>>;
 
 // Applies prepare dynamic range quantization on the model in TFL dialect.
@@ -81,10 +87,10 @@ class PrepareDynamicRangeQuantizePass
 
 // If the weight is applicable to dynamic range quantization, insert Quantize
 // and Dequantize ops with either per-axis or per-tensor scale.
-class PreprocessDynamicRangeQuantizableOp
+class PrepareDynamicRangeQuantizableOp
     : public OpRewritePattern<arith::ConstantOp> {
  public:
-  explicit PreprocessDynamicRangeQuantizableOp(
+  explicit PrepareDynamicRangeQuantizableOp(
       MLIRContext* context, const QuantizationSpecs& quant_specs)
       : OpRewritePattern<arith::ConstantOp>(context),
         quant_specs_(quant_specs) {}
@@ -100,10 +106,28 @@ class PreprocessDynamicRangeQuantizableOp
     if (!(quantizeOps(op, quantizable_ops, rewriter))) {
       return failure();
     }
+
+    if (!(setAsymmetricQuantizeInputAttr(quantizable_ops, rewriter))) {
+      return failure();
+    }
     return success();
   }
 
  private:
+  // Check if any specific operand is supported for int8 quantization.
+  bool hasInt8QuantizableOperandAt(Operation* op, int operand_index) const {
+    // TODO(b/201599094): check whether weight size < 1024 condition is needed
+    // here
+    if (auto quantizable_op = dyn_cast<DynamicRangeQuantizedOpInterface>(op)) {
+      const auto& quantizable_indices =
+          quantizable_op.GetQuantizableOperandIndices();
+      return std::find(std::begin(quantizable_indices),
+                       std::end(quantizable_indices),
+                       operand_index) != std::end(quantizable_indices);
+    }
+    return false;
+  }
+
   // Mark users that are applicable for dynamic range quantization if it
   // uses float tensors which are not biases and is a DynamicRangeQuantizableOp.
   bool getQuantizableOps(arith::ConstantOp op,
@@ -115,17 +139,11 @@ class PreprocessDynamicRangeQuantizableOp
     Value value = op.getResult();
 
     // Check whether dynamic-quantization can be applied.
-    // TODO(b/201599094): check whether weight size < 1024 condition is needed
-    // here
     for (auto& use : value.getUses()) {
       Operation* user = use.getOwner();
       int operand_num = use.getOperandNumber();
 
-      auto spec = GetOpQuantSpec(user);
-      auto biases = spec->biases_params;
-
-      if (biases.find(operand_num) == biases.end() &&
-          user->hasTrait<OpTrait::quant::DynamicRangeQuantizableOp>()) {
+      if (hasInt8QuantizableOperandAt(user, operand_num)) {
         quantizable_ops.insert({user, operand_num});
       }
     }
@@ -152,8 +170,7 @@ class PreprocessDynamicRangeQuantizableOp
     Operation* quantize_op = quant_op.first;
     int quantize_operand_num = quant_op.second;
 
-    auto affine_user =
-        llvm::dyn_cast<mlir::AffineQuantizedOpInterface>(quantize_op);
+    auto affine_user = dyn_cast<AffineQuantizedOpInterface>(quantize_op);
 
     bool op_with_narrow_range =
         affine_user &&
@@ -200,6 +217,31 @@ class PreprocessDynamicRangeQuantizableOp
     return true;
   }
 
+  // Add asymmetric input quantization attribute. MLIR dynamic quantization
+  // supports only the case that the value of the attribute equals to true. For
+  // details, see tensorflow/compiler/mlir/lite/quantization/quantization.td
+  bool setAsymmetricQuantizeInputAttr(QuantizationUnits& quantizable_ops,
+                                      PatternRewriter& rewriter) const {
+    bool changed = false;
+    for (auto& quant_op : quantizable_ops) {
+      auto dynamic_range_quantized_user =
+          dyn_cast<DynamicRangeQuantizedOpInterface>(quant_op.first);
+      if (dynamic_range_quantized_user &&
+          dynamic_range_quantized_user.RequireAsymmetricQuantizeInputsAttr()) {
+        // At runtime, this flag will be used in the kernels to decide whether
+        // input activations need to be asymmetrically quantized. Refer to the
+        // implementation for fully-connected as an example in
+        // tensorflow/lite/kernels/fully_connected.cc. The kernels will handle
+        // the asymmetric_quantize_inputs attribute in the builtin option.
+        dynamic_range_quantized_user->setAttr(
+            kAsymmetricQuantizeInputsAttr,
+            BoolAttr::get(rewriter.getContext(), true));
+        changed = true;
+      }
+    }
+    return changed;
+  }
+
  protected:
   QuantizationSpecs quant_specs_;
 };
@@ -211,7 +253,7 @@ void PrepareDynamicRangeQuantizePass::runOnFunction() {
   ConvertTFLQuantOpsToMlirQuantOps(func);
 
   OwningRewritePatternList patterns(&getContext());
-  patterns.insert<PreprocessDynamicRangeQuantizableOp>(ctx, quant_specs_);
+  patterns.insert<PrepareDynamicRangeQuantizableOp>(ctx, quant_specs_);
   (void)applyPatternsAndFoldGreedily(func, std::move(patterns));
 
   ConvertMlirQuantOpsToTFLQuantOps(func);
diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc
index 48ff0389a56969..f322d27ddd660e 100644
--- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc
@@ -225,6 +225,21 @@ class ConvertTFConvOp : public RewritePattern {
         !filter_type.hasStaticShape())
       return failure();
 
+    Value input = tf_op.input();
+    RankedTensorType input_type =
+        input.getType().template dyn_cast<RankedTensorType>();
+    // Safe guard for skipping grouped convolution legalization.
+    // Only rank size four input will be only available by the tf.Conv2D
+    // operator verification.
+    if (!input_type || input_type.isDynamicDim(3)) {
+      return failure();
+    }
+    // Check if the given op is based on unsupported grouped convolution.
+    // Dim size zero will be verified by the tf.Conv2D operator verification.
+    if (input_type.getDimSize(3) / filter_type.getDimSize(2) != 1) {
+      return failure();
+    }
+
     // TensorFlow convolution op only has two inputs, while the TFLite one has
     // three, with the bias vector marked as optional. However, TOCO has a
     // dedicated pass, EnsureBiasVectors, to create default bias vectors for all
@@ -243,7 +258,6 @@ class ConvertTFConvOp : public RewritePattern {
     auto bias =
         rewriter.create<TF::ConstOp>(op->getLoc(), bias_type, bias_attr);
 
-    auto input = tf_op.input();
     if (op->getAttrOfType<StringAttr>("padding").getValue() == "EXPLICIT") {
       // Add Const op for padding value.
       ArrayRef<Attribute> padding_attr_array =
diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize.cc b/tensorflow/compiler/mlir/lite/transforms/quantize.cc
index 0e74881f925a56..1903024c887083 100644
--- a/tensorflow/compiler/mlir/lite/transforms/quantize.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/quantize.cc
@@ -122,7 +122,7 @@ struct TFLQuantizationBase
     // it.
 
     return quantization_trait == kDynamicRangeQuantization &&
-           quantized_op->hasTrait<OpTrait::quant::DynamicRangeQuantizableOp>();
+           dyn_cast<DynamicRangeQuantizedOpInterface>(quantized_op);
   }
 
   static bool AllowHybridResult(Operation* quantized_op) {
@@ -130,7 +130,7 @@ struct TFLQuantizationBase
     // supports it.
 
     return quantization_trait == kDynamicRangeQuantization &&
-           quantized_op->hasTrait<OpTrait::quant::DynamicRangeQuantizableOp>();
+           dyn_cast<DynamicRangeQuantizedOpInterface>(quantized_op);
   }
 };
 
diff --git a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc
index 76381974162450..b3be668395c64d 100644
--- a/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc
+++ b/tensorflow/compiler/mlir/lite/transforms/while_loop_outline.cc
@@ -93,9 +93,10 @@ bool IsCompatibleTypeWithTFLCastOp(Type type) {
   // F32 and BF16 types are allowed.
   if (elemType.isBF16() || elemType.isF32()) return true;
 
-  // I1, I16, I32, I64 types are allowed.
-  if (elemType.isInteger(1) || elemType.isInteger(16) ||
-      elemType.isInteger(32) || elemType.isInteger(64))
+  // I1, I8 I16, I32, I64 types are allowed.
+  if (elemType.isInteger(1) || elemType.isInteger(8) ||
+      elemType.isInteger(16) || elemType.isInteger(32) ||
+      elemType.isInteger(64))
     return true;
 
   // Complex<F<32>> is allowed.
diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD
index 23ce5f8df88da0..cf72f05edf4bfb 100644
--- a/tensorflow/compiler/mlir/tensorflow/BUILD
+++ b/tensorflow/compiler/mlir/tensorflow/BUILD
@@ -387,6 +387,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core/ir:Dialect",
+        "//tensorflow/core/transforms:TopoSortPass",
         "@com_google_absl//absl/strings",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:IR",
@@ -1248,6 +1249,7 @@ cc_library(
         ":tensorflow_analysis",
         ":tensorflow_ops",
         ":tensorflow_optimize_inc_gen",
+        ":tensorflow_side_effects",
         ":tensorflow_types",
         ":tf_data_optimization",
         ":tf_legalize_hlo",
@@ -2264,6 +2266,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
         "//tensorflow/core:lib",
+        "//tensorflow/core/ir/importexport:import",
         "//tensorflow/core/platform:logging",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:AllPassesAndDialects",
@@ -2277,6 +2280,9 @@ tf_cc_test(
     name = "dump_graph_test",
     size = "small",
     srcs = ["utils/dump_graph_test.cc"],
+    tags = [
+        "no_windows",  # b/208469759
+    ],
     deps = [
         ":dump_graph",
         "//tensorflow/core:framework",
diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc
index d1806a59499038..9ef779e0aab620 100644
--- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc
+++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc
@@ -35,6 +35,7 @@ limitations under the License.
 #include "mlir/Support/LLVM.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h"
@@ -155,11 +156,7 @@ SideEffects GetSideEffectsFromEffectInstance(
     const MemoryEffects::EffectInstance& effect_instance, Operation* op) {
   mlir::SideEffects::Effect* effect = effect_instance.getEffect();
   SideEffects side_effects;
-  if (llvm::isa<ResourceEffects::TPUEmbedding>(effect_instance.getResource())) {
-    // TODO(mgester) This hack can be removed once b/196857154 is fixed.
-    // See definition of `TF_TPUEmbeddingSideEffect` for more details.
-    side_effects.SetRead();
-  } else if (isa<MemoryEffects::Allocate>(effect)) {
+  if (isa<MemoryEffects::Allocate>(effect)) {
     side_effects.SetAlloc();
   } else if (isa<MemoryEffects::Free>(effect)) {
     side_effects.SetFree();
@@ -356,11 +353,20 @@ class OpSideEffectCollector {
         // We handle value-based side effects for which we can use resource
         // alias analysis at a different place, skip here.
         if (ShouldUseResourceAliasAnalysis(effect)) continue;
+        if (llvm::isa<ResourceEffects::MustExecute>(effect.getResource()))
+          // We have this fake resource to avoid that certain ops are considered
+          // dead or get pruned, ignore it for side effect analysis.
+          continue;
 
         // Add side effects for op resource ID.
+        int64_t instance_id = -1;
         SideEffects side_effects(GetSideEffectsFromEffectInstance(effect, op));
+        if (auto resource_instance_op =
+            dyn_cast<GetResourceInstanceInterface>(op)) {
+          instance_id = resource_instance_op.GetResourceInstanceId();
+        }
         ResourceId resource_id =
-            GetOpResourceId(effect.getResource()->getResourceID());
+            GetOpResourceId(effect.getResource()->getResourceID(), instance_id);
         side_effects.SetResourceId(resource_id);
         UpdateSideEffectsByResourceId(side_effects,
                                       side_effects_by_resource_id);
@@ -368,10 +374,11 @@ class OpSideEffectCollector {
     }
   }
 
-  // Get internal op resource ID from MLIR type ID.
-  ResourceId GetOpResourceId(TypeID type_id) {
+  // Get internal op resource ID from MLIR type ID and instance ID.
+  ResourceId GetOpResourceId(TypeID type_id, int64_t instance_id) {
     auto emplace_result =
-        type_id_to_op_resource_id_.try_emplace(type_id, next_op_resource_id_);
+        type_instance_ids_to_op_resource_id_.try_emplace(
+            std::make_pair(type_id, instance_id), next_op_resource_id_);
     // Increment type ID if we have encountered a new resource type.
     if (emplace_result.second) ++next_op_resource_id_;
     return emplace_result.first->getSecond();
@@ -385,9 +392,10 @@ class OpSideEffectCollector {
   // Next available ID for op-based resources (resources not handled by resource
   // alias analysis).
   ResourceId next_op_resource_id_ = kMaxResourceId + 1;
-  // Maps MLIR type IDs for resource types to internal IDs for op-based
-  // resources. Also see comment above.
-  llvm::SmallDenseMap<TypeID, ResourceId> type_id_to_op_resource_id_;
+  // Maps (type ID, instance ID) pairs to internal IDs for op-based resources.
+  // Also see comment above.
+  llvm::SmallDenseMap<std::pair<TypeID, int64_t>, ResourceId>
+    type_instance_ids_to_op_resource_id_;
   // Used for faster callable resolution.
   SymbolTableCollection symbol_table_collection_;
   // Collect all op-based side effects here.
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
index 384597d3c71238..6fd9e285d5345b 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td
@@ -4065,7 +4065,7 @@ This operation creates a tensor of `shape` and `dtype`.
   let hasFolder = 1;
 }
 
-def TF_EnqueueTPUEmbeddingArbitraryTensorBatchOp : TF_Op<"EnqueueTPUEmbeddingArbitraryTensorBatch", [SameVariadicOperandSize, TF_TPUEmbeddingSideEffect]> {
+def TF_EnqueueTPUEmbeddingArbitraryTensorBatchOp : TF_Op<"EnqueueTPUEmbeddingArbitraryTensorBatch", [DeclareOpInterfaceMethods<TF_GetResourceInstanceInterface>, SameVariadicOperandSize, TF_TPUEmbeddingWriteEffect]> {
   let summary = [{
 Eases the porting of code that uses tf.nn.embedding_lookup_sparse().
   }];
@@ -4117,7 +4117,7 @@ in TPUEmbeddingConfiguration is used, otherwise mode_override is used.}]>:$mode_
   TF_DerivedOperandTypeAttr T3 = TF_DerivedOperandTypeAttr<2>;
 }
 
-def TF_EnqueueTPUEmbeddingBatchOp : TF_Op<"EnqueueTPUEmbeddingBatch", [TF_TPUEmbeddingSideEffect]> {
+def TF_EnqueueTPUEmbeddingBatchOp : TF_Op<"EnqueueTPUEmbeddingBatch", [DeclareOpInterfaceMethods<TF_GetResourceInstanceInterface>, TF_TPUEmbeddingWriteEffect]> {
   let summary = [{
 An op that enqueues a list of input batch tensors to TPUEmbedding.
   }];
@@ -4141,7 +4141,7 @@ in TPUEmbeddingConfiguration is used, otherwise mode_override is used.}]>:$mode_
   TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>;
 }
 
-def TF_EnqueueTPUEmbeddingIntegerBatchOp : TF_Op<"EnqueueTPUEmbeddingIntegerBatch", [TF_TPUEmbeddingSideEffect]> {
+def TF_EnqueueTPUEmbeddingIntegerBatchOp : TF_Op<"EnqueueTPUEmbeddingIntegerBatch", [DeclareOpInterfaceMethods<TF_GetResourceInstanceInterface>, TF_TPUEmbeddingWriteEffect]> {
   let summary = [{
 An op that enqueues a list of input batch tensors to TPUEmbedding.
   }];
@@ -4162,7 +4162,7 @@ in TPUEmbeddingConfiguration is used, otherwise mode_override is used.}]>:$mode_
   TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>;
 }
 
-def TF_EnqueueTPUEmbeddingRaggedTensorBatchOp : TF_Op<"EnqueueTPUEmbeddingRaggedTensorBatch", [SameVariadicOperandSize, TF_TPUEmbeddingSideEffect]> {
+def TF_EnqueueTPUEmbeddingRaggedTensorBatchOp : TF_Op<"EnqueueTPUEmbeddingRaggedTensorBatch", [DeclareOpInterfaceMethods<TF_GetResourceInstanceInterface>, SameVariadicOperandSize, TF_TPUEmbeddingWriteEffect]> {
   let summary = "Eases the porting of code that uses tf.nn.embedding_lookup().";
 
   let description = [{
@@ -4207,7 +4207,7 @@ in TPUEmbeddingConfiguration is used, otherwise mode_override is used.}]>:$mode_
   TF_DerivedOperandTypeAttr T3 = TF_DerivedOperandTypeAttr<2>;
 }
 
-def TF_EnqueueTPUEmbeddingSparseBatchOp : TF_Op<"EnqueueTPUEmbeddingSparseBatch", [SameVariadicOperandSize, TF_TPUEmbeddingSideEffect]> {
+def TF_EnqueueTPUEmbeddingSparseBatchOp : TF_Op<"EnqueueTPUEmbeddingSparseBatch", [DeclareOpInterfaceMethods<TF_GetResourceInstanceInterface>, SameVariadicOperandSize, TF_TPUEmbeddingWriteEffect]> {
   let summary = [{
 An op that enqueues TPUEmbedding input indices from a SparseTensor.
   }];
@@ -4250,7 +4250,7 @@ in TPUEmbeddingConfiguration is used, otherwise mode_override is used.}]>:$mode_
   TF_DerivedOperandTypeAttr T3 = TF_DerivedOperandTypeAttr<2>;
 }
 
-def TF_EnqueueTPUEmbeddingSparseTensorBatchOp : TF_Op<"EnqueueTPUEmbeddingSparseTensorBatch", [SameVariadicOperandSize, TF_TPUEmbeddingSideEffect]> {
+def TF_EnqueueTPUEmbeddingSparseTensorBatchOp : TF_Op<"EnqueueTPUEmbeddingSparseTensorBatch", [DeclareOpInterfaceMethods<TF_GetResourceInstanceInterface>, SameVariadicOperandSize, TF_TPUEmbeddingWriteEffect]> {
   let summary = [{
 Eases the porting of code that uses tf.nn.embedding_lookup_sparse().
   }];
@@ -6948,7 +6948,7 @@ idx ==> [1, 3, 5]
   TF_DerivedResultTypeAttr out_idx = TF_DerivedResultTypeAttr<1>;
 }
 
-def TF_LoadTPUEmbeddingADAMParametersOp : TF_Op<"LoadTPUEmbeddingADAMParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingADAMParametersOp : TF_Op<"LoadTPUEmbeddingADAMParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Load ADAM embedding parameters.";
 
   let description = [{
@@ -6974,7 +6974,7 @@ executed.
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingADAMParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingADAMParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingADAMParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingADAMParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -6993,7 +6993,7 @@ def TF_LoadTPUEmbeddingADAMParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingA
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingAdadeltaParametersOp : TF_Op<"LoadTPUEmbeddingAdadeltaParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingAdadeltaParametersOp : TF_Op<"LoadTPUEmbeddingAdadeltaParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Load Adadelta embedding parameters.";
 
   let description = [{
@@ -7019,7 +7019,7 @@ executed.
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingAdadeltaParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingAdadeltaParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingAdadeltaParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingAdadeltaParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -7038,7 +7038,7 @@ def TF_LoadTPUEmbeddingAdadeltaParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbedd
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingAdagradParametersOp : TF_Op<"LoadTPUEmbeddingAdagradParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingAdagradParametersOp : TF_Op<"LoadTPUEmbeddingAdagradParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Load Adagrad embedding parameters.";
 
   let description = [{
@@ -7063,7 +7063,7 @@ executed.
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingAdagradParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingAdagradParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingAdagradParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingAdagradParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -7081,7 +7081,7 @@ def TF_LoadTPUEmbeddingAdagradParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddi
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingCenteredRMSPropParametersOp : TF_Op<"LoadTPUEmbeddingCenteredRMSPropParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingCenteredRMSPropParametersOp : TF_Op<"LoadTPUEmbeddingCenteredRMSPropParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Load centered RMSProp embedding parameters.";
 
   let description = [{
@@ -7108,7 +7108,7 @@ executed.
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingFTRLParametersOp : TF_Op<"LoadTPUEmbeddingFTRLParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingFTRLParametersOp : TF_Op<"LoadTPUEmbeddingFTRLParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Load FTRL embedding parameters.";
 
   let description = [{
@@ -7134,7 +7134,7 @@ executed.
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingFTRLParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingFTRLParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingFTRLParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingFTRLParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -7153,7 +7153,7 @@ def TF_LoadTPUEmbeddingFTRLParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingF
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingMDLAdagradLightParametersOp : TF_Op<"LoadTPUEmbeddingMDLAdagradLightParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingMDLAdagradLightParametersOp : TF_Op<"LoadTPUEmbeddingMDLAdagradLightParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Load MDL Adagrad Light embedding parameters.";
 
   let description = [{
@@ -7180,7 +7180,7 @@ executed.
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingMomentumParametersOp : TF_Op<"LoadTPUEmbeddingMomentumParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingMomentumParametersOp : TF_Op<"LoadTPUEmbeddingMomentumParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Load Momentum embedding parameters.";
 
   let description = [{
@@ -7205,7 +7205,7 @@ executed.
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingMomentumParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingMomentumParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingMomentumParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingMomentumParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -7223,7 +7223,7 @@ def TF_LoadTPUEmbeddingMomentumParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbedd
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingProximalAdagradParametersOp : TF_Op<"LoadTPUEmbeddingProximalAdagradParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingProximalAdagradParametersOp : TF_Op<"LoadTPUEmbeddingProximalAdagradParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Load proximal Adagrad embedding parameters.";
 
   let description = [{
@@ -7248,7 +7248,7 @@ executed.
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingProximalAdagradParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -7266,7 +7266,7 @@ def TF_LoadTPUEmbeddingProximalAdagradParametersGradAccumDebugOp : TF_Op<"LoadTP
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingProximalYogiParametersOp : TF_Op<"LoadTPUEmbeddingProximalYogiParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingProximalYogiParametersOp : TF_Op<"LoadTPUEmbeddingProximalYogiParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -7284,7 +7284,7 @@ def TF_LoadTPUEmbeddingProximalYogiParametersOp : TF_Op<"LoadTPUEmbeddingProxima
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingProximalYogiParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingProximalYogiParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingProximalYogiParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingProximalYogiParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -7303,7 +7303,7 @@ def TF_LoadTPUEmbeddingProximalYogiParametersGradAccumDebugOp : TF_Op<"LoadTPUEm
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingRMSPropParametersOp : TF_Op<"LoadTPUEmbeddingRMSPropParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingRMSPropParametersOp : TF_Op<"LoadTPUEmbeddingRMSPropParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Load RMSProp embedding parameters.";
 
   let description = [{
@@ -7329,7 +7329,7 @@ executed.
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingRMSPropParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingRMSPropParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingRMSPropParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingRMSPropParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -7348,7 +7348,7 @@ def TF_LoadTPUEmbeddingRMSPropParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddi
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingStochasticGradientDescentParametersOp : TF_Op<"LoadTPUEmbeddingStochasticGradientDescentParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingStochasticGradientDescentParametersOp : TF_Op<"LoadTPUEmbeddingStochasticGradientDescentParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Load SGD embedding parameters.";
 
   let description = [{
@@ -7372,7 +7372,7 @@ executed.
   let results = (outs);
 }
 
-def TF_LoadTPUEmbeddingStochasticGradientDescentParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingStochasticGradientDescentParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_LoadTPUEmbeddingStochasticGradientDescentParametersGradAccumDebugOp : TF_Op<"LoadTPUEmbeddingStochasticGradientDescentParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -8717,15 +8717,15 @@ tf.matmul(a, x)
   }];
 
   let arguments = (ins
-    Arg<TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>, [{Shape is `[..., M, M]`.}]>:$matrix,
-    Arg<TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>, [{Shape is `[..., M, K]`.}]>:$rhs,
+    Arg<TF_FpOrComplexTensor, [{Shape is `[..., M, M]`.}]>:$matrix,
+    Arg<TF_FpOrComplexTensor, [{Shape is `[..., M, K]`.}]>:$rhs,
 
     DefaultValuedAttr<BoolAttr, "true">:$lower,
     DefaultValuedAttr<BoolAttr, "false">:$adjoint
   );
 
   let results = (outs
-    Res<TensorOf<[TF_Complex128, TF_Complex64, TF_Float16, TF_Float32, TF_Float64]>, [{Shape is `[..., M, K]`.}]>:$output
+    Res<TF_FpOrComplexTensor, [{Shape is `[..., M, K]`.}]>:$output
   );
 
   TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
@@ -11548,7 +11548,7 @@ def TF_RecvOp : TF_Op<"Recv", []> {
   TF_DerivedResultTypeAttr tensor_type = TF_DerivedResultTypeAttr<0>;
 }
 
-def TF_RecvTPUEmbeddingActivationsOp : TF_Op<"RecvTPUEmbeddingActivations", [TF_TPUEmbeddingSideEffect]> {
+def TF_RecvTPUEmbeddingActivationsOp : TF_Op<"RecvTPUEmbeddingActivations", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "An op that receives embedding activations on the TPU.";
 
   let description = [{
@@ -12939,7 +12939,7 @@ checkpoint directly.}]>:$tensors
   TF_DerivedResultTypeListAttr dtypes = TF_DerivedResultTypeListAttr<0>;
 }
 
-def TF_RetrieveTPUEmbeddingADAMParametersOp : TF_Op<"RetrieveTPUEmbeddingADAMParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingADAMParametersOp : TF_Op<"RetrieveTPUEmbeddingADAMParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Retrieve ADAM embedding parameters.";
 
   let description = [{
@@ -12964,7 +12964,7 @@ used to retrieve updated parameters before saving a checkpoint.
   );
 }
 
-def TF_RetrieveTPUEmbeddingADAMParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingADAMParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingADAMParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingADAMParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -12983,7 +12983,7 @@ def TF_RetrieveTPUEmbeddingADAMParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEm
   );
 }
 
-def TF_RetrieveTPUEmbeddingAdadeltaParametersOp : TF_Op<"RetrieveTPUEmbeddingAdadeltaParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingAdadeltaParametersOp : TF_Op<"RetrieveTPUEmbeddingAdadeltaParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Retrieve Adadelta embedding parameters.";
 
   let description = [{
@@ -13008,7 +13008,7 @@ used to retrieve updated parameters before saving a checkpoint.
   );
 }
 
-def TF_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -13027,7 +13027,7 @@ def TF_RetrieveTPUEmbeddingAdadeltaParametersGradAccumDebugOp : TF_Op<"RetrieveT
   );
 }
 
-def TF_RetrieveTPUEmbeddingAdagradParametersOp : TF_Op<"RetrieveTPUEmbeddingAdagradParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingAdagradParametersOp : TF_Op<"RetrieveTPUEmbeddingAdagradParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Retrieve Adagrad embedding parameters.";
 
   let description = [{
@@ -13051,7 +13051,7 @@ used to retrieve updated parameters before saving a checkpoint.
   );
 }
 
-def TF_RetrieveTPUEmbeddingAdagradParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingAdagradParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingAdagradParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingAdagradParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -13069,7 +13069,7 @@ def TF_RetrieveTPUEmbeddingAdagradParametersGradAccumDebugOp : TF_Op<"RetrieveTP
   );
 }
 
-def TF_RetrieveTPUEmbeddingCenteredRMSPropParametersOp : TF_Op<"RetrieveTPUEmbeddingCenteredRMSPropParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingCenteredRMSPropParametersOp : TF_Op<"RetrieveTPUEmbeddingCenteredRMSPropParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Retrieve centered RMSProp embedding parameters.";
 
   let description = [{
@@ -13095,7 +13095,7 @@ used to retrieve updated parameters before saving a checkpoint.
   );
 }
 
-def TF_RetrieveTPUEmbeddingFTRLParametersOp : TF_Op<"RetrieveTPUEmbeddingFTRLParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingFTRLParametersOp : TF_Op<"RetrieveTPUEmbeddingFTRLParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Retrieve FTRL embedding parameters.";
 
   let description = [{
@@ -13120,7 +13120,7 @@ used to retrieve updated parameters before saving a checkpoint.
   );
 }
 
-def TF_RetrieveTPUEmbeddingFTRLParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingFTRLParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingFTRLParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingFTRLParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -13139,7 +13139,7 @@ def TF_RetrieveTPUEmbeddingFTRLParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEm
   );
 }
 
-def TF_RetrieveTPUEmbeddingMDLAdagradLightParametersOp : TF_Op<"RetrieveTPUEmbeddingMDLAdagradLightParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingMDLAdagradLightParametersOp : TF_Op<"RetrieveTPUEmbeddingMDLAdagradLightParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Retrieve MDL Adagrad Light embedding parameters.";
 
   let description = [{
@@ -13165,7 +13165,7 @@ used to retrieve updated parameters before saving a checkpoint.
   );
 }
 
-def TF_RetrieveTPUEmbeddingMomentumParametersOp : TF_Op<"RetrieveTPUEmbeddingMomentumParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingMomentumParametersOp : TF_Op<"RetrieveTPUEmbeddingMomentumParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Retrieve Momentum embedding parameters.";
 
   let description = [{
@@ -13189,7 +13189,7 @@ used to retrieve updated parameters before saving a checkpoint.
   );
 }
 
-def TF_RetrieveTPUEmbeddingMomentumParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingMomentumParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingMomentumParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingMomentumParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -13207,7 +13207,7 @@ def TF_RetrieveTPUEmbeddingMomentumParametersGradAccumDebugOp : TF_Op<"RetrieveT
   );
 }
 
-def TF_RetrieveTPUEmbeddingProximalAdagradParametersOp : TF_Op<"RetrieveTPUEmbeddingProximalAdagradParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingProximalAdagradParametersOp : TF_Op<"RetrieveTPUEmbeddingProximalAdagradParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Retrieve proximal Adagrad embedding parameters.";
 
   let description = [{
@@ -13231,7 +13231,7 @@ used to retrieve updated parameters before saving a checkpoint.
   );
 }
 
-def TF_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -13249,7 +13249,7 @@ def TF_RetrieveTPUEmbeddingProximalAdagradParametersGradAccumDebugOp : TF_Op<"Re
   );
 }
 
-def TF_RetrieveTPUEmbeddingProximalYogiParametersOp : TF_Op<"RetrieveTPUEmbeddingProximalYogiParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingProximalYogiParametersOp : TF_Op<"RetrieveTPUEmbeddingProximalYogiParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -13267,7 +13267,7 @@ def TF_RetrieveTPUEmbeddingProximalYogiParametersOp : TF_Op<"RetrieveTPUEmbeddin
   );
 }
 
-def TF_RetrieveTPUEmbeddingProximalYogiParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingProximalYogiParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingProximalYogiParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingProximalYogiParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -13286,7 +13286,7 @@ def TF_RetrieveTPUEmbeddingProximalYogiParametersGradAccumDebugOp : TF_Op<"Retri
   );
 }
 
-def TF_RetrieveTPUEmbeddingRMSPropParametersOp : TF_Op<"RetrieveTPUEmbeddingRMSPropParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingRMSPropParametersOp : TF_Op<"RetrieveTPUEmbeddingRMSPropParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Retrieve RMSProp embedding parameters.";
 
   let description = [{
@@ -13311,7 +13311,7 @@ used to retrieve updated parameters before saving a checkpoint.
   );
 }
 
-def TF_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingRMSPropParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -13330,7 +13330,7 @@ def TF_RetrieveTPUEmbeddingRMSPropParametersGradAccumDebugOp : TF_Op<"RetrieveTP
   );
 }
 
-def TF_RetrieveTPUEmbeddingStochasticGradientDescentParametersOp : TF_Op<"RetrieveTPUEmbeddingStochasticGradientDescentParameters", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingStochasticGradientDescentParametersOp : TF_Op<"RetrieveTPUEmbeddingStochasticGradientDescentParameters", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Retrieve SGD embedding parameters.";
 
   let description = [{
@@ -13353,7 +13353,7 @@ used to retrieve updated parameters before saving a checkpoint.
   );
 }
 
-def TF_RetrieveTPUEmbeddingStochasticGradientDescentParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingStochasticGradientDescentParametersGradAccumDebug", [TF_TPUEmbeddingSideEffect]> {
+def TF_RetrieveTPUEmbeddingStochasticGradientDescentParametersGradAccumDebugOp : TF_Op<"RetrieveTPUEmbeddingStochasticGradientDescentParametersGradAccumDebug", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "";
 
   let arguments = (ins
@@ -14407,7 +14407,7 @@ def TF_SendOp : TF_Op<"Send", []> {
   TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
 }
 
-def TF_SendTPUEmbeddingGradientsOp : TF_Op<"SendTPUEmbeddingGradients", [AttrSizedOperandSegments, TF_TPUEmbeddingSideEffect]> {
+def TF_SendTPUEmbeddingGradientsOp : TF_Op<"SendTPUEmbeddingGradients", [AttrSizedOperandSegments, TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "Performs gradient updates of embedding tables.";
 
   let arguments = (ins
@@ -19820,6 +19820,10 @@ XlaVariadicReduceV2 is a version that supports heterogeneous operands.
 
   TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>;
   TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>;
+
+  let verifier = [{ return Verify(*this); }];
+
+  let hasCanonicalizer = 1;
 }
 
 def TF_XlaVariadicReduceV2Op : TF_Op<"XlaVariadicReduceV2", [AttrSizedOperandSegments, NoSideEffect]> {
@@ -20169,7 +20173,7 @@ def TF__ListToArrayOp : TF_Op<"_ListToArray", [NoSideEffect]> {
   TF_DerivedResultTypeAttr T = TF_DerivedResultTypeAttr<0>;
 }
 
-def TF__RecvTPUEmbeddingActivationsOp : TF_Op<"_RecvTPUEmbeddingActivations", [TF_TPUEmbeddingSideEffect]> {
+def TF__RecvTPUEmbeddingActivationsOp : TF_Op<"_RecvTPUEmbeddingActivations", [TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "An op that receives embeddng activations on the TPU.";
 
   let description = [{
@@ -20221,7 +20225,7 @@ of the embedding lookup operation.
   );
 }
 
-def TF__SendTPUEmbeddingGradientsOp : TF_Op<"_SendTPUEmbeddingGradients", [AttrSizedOperandSegments, TF_TPUEmbeddingSideEffect]> {
+def TF__SendTPUEmbeddingGradientsOp : TF_Op<"_SendTPUEmbeddingGradients", [AttrSizedOperandSegments, TF_MustExecute, TF_TPUEmbeddingReadEffect]> {
   let summary = "An op that performs gradient updates of embedding tables.";
 
   let description = [{
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
index 2bd55f200fa88c..020ffdc84be4b4 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td
@@ -165,6 +165,8 @@ def TF_TPUEmbeddingResource : TF_ResourceBase<"TPUEmbedding">;
 def TF_GeneratorOpResource : TF_ResourceBase<"GeneratorOp">;
 def TF_SendRecvResource : TF_ResourceBase<"SendRecv">;
 def TF_TPUCompileExecuteResource : TF_ResourceBase<"TPUCompileExecute">;
+// Fake resource, see `TF_MustExecute` below.
+def TF_MustExecuteResource : TF_ResourceBase<"MustExecute">;
 
 // Value-based side effects
 //
@@ -214,17 +216,21 @@ def TF_DatasetIteratorFree : MemFree<TF_DatasetIteratorResource>;
 // effecting ops. Note that for `read` effects ops might be pruned if nothing
 // depends on them.
 def TF_GeneratorOpSideEffect : MemoryEffects<[MemWrite<TF_GeneratorOpResource>]>;
-// Note: We actually want a `read` effect here but then some ops with this
-// effect are considered dead and are deleted which is not desired (see
-// b/195782952).
-// Therefore, we use a `write` effect + special handling in side effect
-// analysis. Once we have proper dependencies that avoid deletion (see
-// b/196857154), or once MLIR supports a trait to mark an op as not dead, this
-// hack can be removed.
-def TF_TPUEmbeddingSideEffect : MemoryEffects<[MemWrite<TF_TPUEmbeddingResource>]>;
+
+def TF_TPUEmbeddingWriteEffect : MemoryEffects<[MemWrite<TF_TPUEmbeddingResource>]>;
+def TF_TPUEmbeddingReadEffect : MemoryEffects<[MemRead<TF_TPUEmbeddingResource>]>;
+
 def TF_SendRecvSideEffect : MemoryEffects<[MemWrite<TF_SendRecvResource>]>;
 def TF_TPUCompileExecuteSideEffect : MemoryEffects<[MemWrite<TF_TPUCompileExecuteResource>]>;
 
+// Trait for enforcing that a side-effecting op is executed, even if it would be
+// considered dead by MLIR (see b/195782952).
+// The trait is implemented as a write effect for a fake resource which is
+// ignored by side effect analysis, so it does not affect execution order
+// constraints and control dependencies at all (for example, multiple ops with
+// this trait do not have to execute in order).
+def TF_MustExecute : MemoryEffects<[MemWrite<TF_MustExecuteResource>]>;
+
 //===----------------------------------------------------------------------===//
 // TensorFlow op definitions
 //===----------------------------------------------------------------------===//
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td
index bbc4638627e842..bf1ed851f027aa 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td
@@ -131,4 +131,23 @@ def TF_ResourceHandleAllocatorInterface : OpInterface<"ResourceHandleAllocatorIn
   ];
 }
 
+def TF_GetResourceInstanceInterface : OpInterface<"GetResourceInstanceInterface"> {
+  let description = [{Returns an integer corresponding to the resource instance
+                      accessed by this op}];
+
+  let methods = [
+    InterfaceMethod<
+      /*desc=*/[{Returns an integer corresponding to the resource instance
+                 accessed by this op. The implementation must guarantee that the
+                 mapping between resource instances and integers is bijective,
+                 i.e., two op instances should return the same integer if and
+                 only if they access the same resource. The interface should
+                 only be used for ops that access exactly one resource.}],
+      /*retTy=*/"int64_t",
+      /*methodName=*/"GetResourceInstanceId",
+      /*args=*/(ins)
+    >,
+  ];
+}
+
 #endif // TF_OP_INTERFACES
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
index 71334414b0b122..8e19bc29c5e98d 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td
@@ -1564,6 +1564,13 @@ def TF__InternalTestNonResourceValueSideEffects_ : TF_Op<"_InternalTestNonResour
   let results = (outs);
 }
 
+def TF__InternalTestMustExecuteTrait_ : TF_Op<"_InternalTestMustExecuteTrait_", [TF_MustExecute]> {
+  let summary = "Internal op for testing only";
+
+  let arguments = (ins);
+  let results = (outs);
+}
+
 def TF_SetStaticDimensionBoundsOp : TF_Op<"SetStaticDimensionBounds", []> {
   let summary = "Op used to indicate to the compiler and runtime the static bounds of a tensor.";
   let description = [{
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
index a2228cbcb6da8d..ff37408cb80893 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc
@@ -2299,6 +2299,37 @@ static LogicalResult Verify(EmptyTensorListOp op) {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// EnqueueTPUEmbedding ops
+//===----------------------------------------------------------------------===//
+
+// For EnqueueTPUEmbedding ops the device ordinal corresponds to the resource
+// instance.
+
+int64_t EnqueueTPUEmbeddingArbitraryTensorBatchOp::GetResourceInstanceId() {
+  return device_ordinal();
+}
+
+int64_t EnqueueTPUEmbeddingBatchOp::GetResourceInstanceId() {
+  return device_ordinal();
+}
+
+int64_t EnqueueTPUEmbeddingIntegerBatchOp::GetResourceInstanceId() {
+  return device_ordinal();
+}
+
+int64_t EnqueueTPUEmbeddingRaggedTensorBatchOp::GetResourceInstanceId() {
+  return device_ordinal();
+}
+
+int64_t EnqueueTPUEmbeddingSparseBatchOp::GetResourceInstanceId() {
+  return device_ordinal();
+}
+
+int64_t EnqueueTPUEmbeddingSparseTensorBatchOp::GetResourceInstanceId() {
+  return device_ordinal();
+}
+
 //===----------------------------------------------------------------------===//
 // EnsureShapeOp
 //===----------------------------------------------------------------------===//
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
index 1363cc5d3fc72e..853cfaa9c7202d 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc
@@ -2171,12 +2171,7 @@ SummaryWriterOp::GetResourceHandleValueAndIdList(
 void TPUExecuteOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
-  effects.reserve(args().size() + 2);
-
-  // There may be some TPU Embedding ops in the computation, so this effect is
-  // added conservatively.
-  effects.emplace_back(MemoryEffects::Write::get(),
-                       ResourceEffects::TPUEmbedding::get());
+  effects.reserve(args().size() + 1);
   effects.emplace_back(MemoryEffects::Write::get(),
                        ResourceEffects::TPUCompileExecute::get());
 
@@ -2239,12 +2234,7 @@ static LogicalResult Verify(TPUExecuteAndUpdateVariablesOp op) {
 void TPUExecuteAndUpdateVariablesOp::getEffects(
     SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
         &effects) {
-  effects.reserve(device_var_reads_indices().size() + 2);
-
-  // There may be some TPU Embedding ops in the computation, so this effect is
-  // added conservatively.
-  effects.emplace_back(MemoryEffects::Write::get(),
-                       ResourceEffects::TPUEmbedding::get());
+  effects.reserve(device_var_reads_indices().size() + 1);
   effects.emplace_back(MemoryEffects::Write::get(),
                        ResourceEffects::TPUCompileExecute::get());
   auto resource_handles = llvm::make_filter_range(args(), [](Value value) {
@@ -3321,6 +3311,45 @@ LogicalResult XlaSetDynamicDimensionSizeOp::inferReturnTypes(
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// XlaVariadicReduceOp
+//===----------------------------------------------------------------------===//
+//
+
+static LogicalResult Verify(XlaVariadicReduceOp op) {
+  // We rely on V2 for the majority of the checks.
+  const auto &input_ty = op.input().getType();
+  if (input_ty.empty()) return op.emitOpError() << "No input";
+  const auto &dtype = input_ty[0].cast<TensorType>().getElementType();
+  for (const auto &ty : input_ty) {
+    if (ty.cast<TensorType>().getElementType() != dtype)
+      return op.emitOpError()
+             << "This version is limited to operands of the same dtype";
+  }
+  return success();
+}
+
+class XlaVariadicReduceToV2 : public OpRewritePattern<TF::XlaVariadicReduceOp> {
+ public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(TF::XlaVariadicReduceOp op,
+                                PatternRewriter &rewriter) const override {
+    mlir::TF::XlaVariadicReduceV2Op xla_variadic_reduce_v2_op =
+        rewriter.create<::mlir::TF::XlaVariadicReduceV2Op>(
+            op.getLoc(), op.getResults().getTypes(), op.input(),
+            op.init_value(), op.dimensions_to_reduce(), op.reducer());
+
+    rewriter.replaceOp(op, xla_variadic_reduce_v2_op.getResults());
+    return ::mlir::success();
+  };
+};
+
+void XlaVariadicReduceOp::getCanonicalizationPatterns(
+    OwningRewritePatternList &results, MLIRContext *context) {
+  results.insert<XlaVariadicReduceToV2>(context);
+}
+
 //===----------------------------------------------------------------------===//
 // XlaVariadicReduceV2Op
 //===----------------------------------------------------------------------===//
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h
index 267978d6d024bc..ecc53b6efb9bc4 100644
--- a/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h
+++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h
@@ -77,6 +77,10 @@ struct TPUCompileExecute
   StringRef getName() final { return "<TPUCompileExecute>"; }
 };
 
+struct MustExecute : public ::mlir::SideEffects::Resource::Base<MustExecute> {
+  StringRef getName() final { return "<MustExecute>"; }
+};
+
 }  // namespace ResourceEffects
 }  // namespace TF
 }  // namespace mlir
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir
index 23c523bae9690a..e0b70e7b3d8759 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir
@@ -1978,3 +1978,14 @@ func @testComplexDivNoNanOpWithNonConstantY(%arg0: tensor<2xcomplex<f32>>, %arg1
   %res3 = "tf.MulNoNan"(%arg0, %noncon3) : (tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>) -> tensor<2xcomplex<f32>>
   return %res1, %res2, %res3 : tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>, tensor<2xcomplex<f32>>
 }
+
+// CHECK-LABEL: testXlaVariadicReduceToV2
+func @testXlaVariadicReduceToV2(%arg0: tensor<3x4xf32>, %arg1: tensor<f32>) -> tensor<?x?xf32> attributes {tf.entry_function = {control_outputs = "", inputs = "_arg0,_arg1", outputs = "_retval0"}} {
+  // CHECK:  "tf.XlaVariadicReduceV2"(%arg0, %arg1) {dimensions_to_reduce = [], operand_segment_sizes = dense<1> : vector<2xi32>, reducer = @sum_reducer} : (tensor<3x4xf32>, tensor<f32>) -> tensor<?x?xf32>
+  %0 = "tf.XlaVariadicReduce"(%arg0, %arg1) {_XlaHasReferenceVars = false, device = "/job:localhost/replica:0/task:0/device:XLA_CPU:0", dimensions_to_reduce = [], reducer = @sum_reducer} : (tensor<3x4xf32>, tensor<f32>) -> tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+func private @sum_reducer(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> attributes {tf._disable_call_shape_inference = true} {
+  %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32>
+  return %0 : tensor<*xf32>
+}
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graph_pruning.mlir b/tensorflow/compiler/mlir/tensorflow/tests/graph_pruning.mlir
index 1f0a183c19e339..35feeeb378fa88 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/graph_pruning.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/graph_pruning.mlir
@@ -185,3 +185,22 @@ func @main() attributes {tf.entry_function = {control_outputs = "", inputs = "",
   }
   return
 }
+
+// -----
+
+// Check that an op with must-execute effect is not pruned, even if it is
+// unreachable.
+func @must_execute_op() -> () {
+// CHECK: tf_executor.graph
+// CHECK: tf_executor.island
+// CHECK: tf._InternalTestMustExecuteTrait_
+  tf_executor.graph {
+    %1 = tf_executor.island {
+      "tf._InternalTestMustExecuteTrait_"() : () -> ()
+      tf_executor.yield
+    }
+    tf_executor.fetch
+  }
+  return
+}
+
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir
index 45f5e773c60a1b..25f295bb2fb5c9 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir
@@ -1717,6 +1717,36 @@ func @convert_conv2d_explicit_padding(%arg0: tensor<64x8x8x8xf32>, %arg1: tensor
   return %0 : tensor<64x3x3x64xf32>
 }
 
+// CHECK-LABEL:   func @convert_conv2d_negative_explicit_padding(
+// CHECK-SAME:                         %[[ARG0:.*]]: tensor<128x7x9x64xf32>,
+// CHECK-SAME:                         %[[ARG1:.*]]: tensor<3x2x64x4xf32>) -> tensor<128x4x3x4xf32> {
+// CHECK:           %[[START:.*]] = "tf.Const"() {value = dense<[0, 0, 5, 0]> : tensor<4xi64>} : () -> tensor<4xi64>
+// CHECK:           %[[SIZE:.*]] = "tf.Const"() {value = dense<[128, 5, 4, 64]> : tensor<4xi64>} : () -> tensor<4xi64>
+// CHECK:           %[[SLICED_ARG0:.*]] = "tf.Slice"(%[[ARG0]], %[[START]], %[[SIZE]])
+// CHECK-SAME:      (tensor<128x7x9x64xf32>, tensor<4xi64>, tensor<4xi64>) -> tensor<128x5x4x64xf32>
+// CHECK:           %[[CONV:.*]] = "tf.Conv2D"(%[[SLICED_ARG0]], %[[ARG1]])
+// CHECK-SAME:      explicit_paddings = [0, 0, 4, 0, 0, 2, 0, 0]
+// CHECK-SAME:      (tensor<128x5x4x64xf32>, tensor<3x2x64x4xf32>) -> tensor<128x4x3x4xf32>
+// CHECK:           return %[[CONV]] : tensor<128x4x3x4xf32>
+// CHECK:         }
+func @convert_conv2d_negative_explicit_padding(%arg0: tensor<128x7x9x64xf32>, %arg1: tensor<3x2x64x4xf32>) -> tensor<128x4x3x4xf32> {
+  %0 = "mhlo.convolution"(%arg0, %arg1) {batch_group_count = 1 : i64,
+    dimension_numbers = #mhlo.conv<raw
+      input_batch_dimension = 0,
+      input_feature_dimension = 3,
+      input_spatial_dimensions = [1, 2],
+      kernel_input_feature_dimension = 2,
+      kernel_output_feature_dimension = 3,
+      kernel_spatial_dimensions = [0, 1],
+      output_batch_dimension = 0,
+      output_feature_dimension = 3,
+      output_spatial_dimensions = [1, 2]
+    >, feature_group_count = 1 : i64, lhs_dilation = dense<1> : tensor<2xi64>, padding = dense<[[4, -2], [-5, 2]]> : tensor<2x2xi64>,
+    precision_config = ["DEFAULT", "DEFAULT"], rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64>
+  } : (tensor<128x7x9x64xf32>, tensor<3x2x64x4xf32>) -> tensor<128x4x3x4xf32>
+  return %0 : tensor<128x4x3x4xf32>
+}
+
 // CHECK-LABEL:   func @convert_depthwise_conv2d(
 // CHECK-SAME:                                   %[[VAL_0:.*]]: tensor<1x8x8x207xf32>,
 // CHECK-SAME:                                   %[[VAL_1:.*]]: tensor<3x3x1x3312xf32>) -> tensor<1x8x8x16xf32> {
@@ -1809,6 +1839,23 @@ func @convert_reduce_to_sum(%arg0: tensor<1x256xf32>) -> tensor<1xf32> {
   return %1 : tensor<1xf32>
 }
 
+// CHECK-LABEL:   func @convert_reduce_to_sum_non_constant_init(
+// CHECK-SAME:                                %[[ARG_0:.*]]: tensor<1x256xf32>,
+// CHECK-SAME:                                %[[ARG_1:.*]]: tensor<f32>) -> tensor<1xf32> {
+// CHECK-DAG:       %[[VAL_0:.*]] = "tf.Const"() {value = dense<1> : tensor<1xi64>} : () -> tensor<1xi64>
+// CHECK:           %[[VAL_1:.*]] = "tf.Sum"(%[[ARG_0]], %[[VAL_0]]) {keep_dims = false} : (tensor<1x256xf32>, tensor<1xi64>) -> tensor<1xf32>
+// CHECK:           %[[VAL_2:.*]] = "tf.Add"(%[[VAL_1]], %[[ARG_1]]) : (tensor<1xf32>, tensor<f32>) -> tensor<1xf32>
+// CHECK:           return %[[VAL_2]] : tensor<1xf32>
+// CHECK:         }
+func @convert_reduce_to_sum_non_constant_init(%arg0: tensor<1x256xf32>, %arg1: tensor<f32>) -> tensor<1xf32> {
+  %1 = "mhlo.reduce"(%arg0, %arg1) ( {
+  ^bb0(%arg2: tensor<f32>, %arg3: tensor<f32>):
+    %2 = mhlo.add %arg2, %arg3 : tensor<f32>
+    "mhlo.return"(%2) : (tensor<f32>) -> ()
+  }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<1x256xf32>, tensor<f32>) -> tensor<1xf32>
+  return %1 : tensor<1xf32>
+}
+
 // CHECK-LABEL:   func @convert_int_reduce_to_sum(
 // CHECK-SAME:                                %[[VAL_0:.*]]: tensor<1x256xi32>) -> tensor<1xi32> {
 // CHECK-DAG:       %[[VAL_1:.*]] = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
@@ -2827,6 +2874,24 @@ func @convert_reduce_to_all(%arg0: tensor<1x2x3x4x5xi1>, %arg1: tensor<2xi64>) -
   return %1: tensor<2x4x5xi1>
 }
 
+// CHECK-LABEL:   func @convert_reduce_to_all_non_constant_init(
+// CHECK-SAME:                                %[[ARG_0:.*]]: tensor<i1>,
+// CHECK-SAME:                                %[[ARG_1:.*]]: tensor<1x2x3x4x5xi1>,
+// CHECK-SAME:                                %[[ARG_2:.*]]: tensor<2xi64>) -> tensor<2x4x5xi1> {
+// CHECK-DAG:       %[[DIMENSIONS:.*]] = "tf.Const"() {value = dense<[0, 2]> : tensor<2xi64>} : () -> tensor<2xi64>
+// CHECK:           %[[VAL_0:.*]] = "tf.All"(%[[ARG_1]], %[[DIMENSIONS]]) {keep_dims = false} : (tensor<1x2x3x4x5xi1>, tensor<2xi64>) -> tensor<2x4x5xi1>
+// CHECK:           %[[VAL_1:.*]] = "tf.LogicalAnd"(%[[VAL_0]], %[[ARG_0]]) : (tensor<2x4x5xi1>, tensor<i1>) -> tensor<2x4x5xi1>
+// CHECK:           return %[[VAL_1:.*]] : tensor<2x4x5xi1>
+// CHECK:         }
+func @convert_reduce_to_all_non_constant_init(%arg0: tensor<i1>, %arg1: tensor<1x2x3x4x5xi1>, %arg2: tensor<2xi64>) -> tensor<2x4x5xi1> {
+  %0 = "mhlo.reduce"(%arg1, %arg0) ( {
+    ^bb0(%arg3: tensor<i1>, %arg4: tensor<i1>):
+        %1 = mhlo.and %arg3, %arg4 : tensor<i1>
+        "mhlo.return"(%1) : (tensor<i1>) -> ()
+    }) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x2x3x4x5xi1>, tensor<i1>) -> tensor<2x4x5xi1>
+  return %0: tensor<2x4x5xi1>
+}
+
 // CHECK-LABEL:   func @convert_reduce_to_any(
 // CHECK-SAME:                                %[[ARG_0:.*]]: tensor<1x2x3x4x5xi1>,
 // CHECK-SAME:                                %[[ARG_1:.*]]: tensor<2xi64>) -> tensor<2x4x5xi1> {
@@ -2845,6 +2910,24 @@ func @convert_reduce_to_any(%arg0: tensor<1x2x3x4x5xi1>, %arg1: tensor<2xi64>) -
   return %1: tensor<2x4x5xi1>
 }
 
+// CHECK-LABEL:   func @convert_reduce_to_any_non_constant_init(
+// CHECK-SAME:                                %[[ARG_0:.*]]: tensor<i1>,
+// CHECK-SAME:                                %[[ARG_1:.*]]: tensor<1x2x3x4x5xi1>,
+// CHECK-SAME:                                %[[ARG_2:.*]]: tensor<2xi64>) -> tensor<2x4x5xi1> {
+// CHECK-DAG:       %[[DIMENSIONS:.*]] = "tf.Const"() {value = dense<[0, 2]> : tensor<2xi64>} : () -> tensor<2xi64>
+// CHECK:           %[[VAL_0:.*]] = "tf.Any"(%[[ARG_1]], %[[DIMENSIONS]]) {keep_dims = false} : (tensor<1x2x3x4x5xi1>, tensor<2xi64>) -> tensor<2x4x5xi1>
+// CHECK:           %[[VAL_1:.*]] = "tf.LogicalOr"(%[[VAL_0]], %[[ARG_0]]) : (tensor<2x4x5xi1>, tensor<i1>) -> tensor<2x4x5xi1>
+// CHECK:           return %[[VAL_1:.*]] : tensor<2x4x5xi1>
+// CHECK:         }
+func @convert_reduce_to_any_non_constant_init(%arg0: tensor<i1>, %arg1: tensor<1x2x3x4x5xi1>, %arg2: tensor<2xi64>) -> tensor<2x4x5xi1> {
+  %0 = "mhlo.reduce"(%arg1, %arg0) ( {
+    ^bb0(%arg3: tensor<i1>, %arg4: tensor<i1>):
+        %1 = mhlo.or %arg3, %arg4 : tensor<i1>
+        "mhlo.return"(%1) : (tensor<i1>) -> ()
+    }) {dimensions = dense<[0, 2]> : tensor<2xi64>} : (tensor<1x2x3x4x5xi1>, tensor<i1>) -> tensor<2x4x5xi1>
+  return %0: tensor<2x4x5xi1>
+}
+
 // CHECK-LABEL:   func @convert_sort_to_topk_iota_broadcast(
 // CHECK-SAME:                                              %[[ARG_0:.*]]: tensor<3x6xf32>) -> (tensor<3x6xf32>, tensor<3x6xi32>) {
 // CHECK-DAG:       %[[VAL_0:.*]] = "tf.Const"() {value = dense<0> : tensor<i32>} : () -> tensor<i32>
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_tfg_with_control_flow.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_tfg_with_control_flow.mlir
new file mode 100644
index 00000000000000..59a32a7afffca2
--- /dev/null
+++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_tfg_with_control_flow.mlir
@@ -0,0 +1,26 @@
+// RUN: tf-opt -tfe-legalize-tfg %s | FileCheck %s
+
+module  {
+  tfg.graph #tf_type.version<producer = 27, min_consumer = 0> {
+    // CHECK: tf_executor.Enter
+    // CHECK: {{%.*}}, %[[TOKEN:.*]], {{%.*}} = tf_executor.NextIteration.Source
+    // CHECK: {{%.*}}, {{%.*}}, %[[CONTROL:.*]] = tf_executor.Merge
+    // CHECK: tf_executor.island(%[[CONTROL]]) wraps "tf.Const"()
+    // CHECK: tf_executor.LoopCond
+    // CHECK: tf_executor.Switch
+    // CHECK: tf_executor.Exit
+    // CHECK: tf_executor.NextIteration.Sink[%[[TOKEN]]]
+    %Const, %ctl = Const name("Const") {dtype = i32, value = dense<0> : tensor<i32>} : () -> (tensor<i32>)
+    %Enter, %ctl_0 = Enter(%Const) name("while/Enter") {T = i32, frame_name = "while/while_context", is_constant = false, parallel_iterations = 10 : i64} : (tensor<i32>) -> (tensor<*xi32>)
+    %NextIteration, %ctl_1 = NextIteration(%Add) name("while/NextIteration") {T = i32} : (tensor<*xi32>) -> (tensor<*xi32>)
+    %Merge:2, %ctl_2 = Merge(%Enter, %NextIteration) name("while/Merge") {N = 2 : i64, T = i32} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>)
+    %Const_3, %ctl_4 = Const [%ctl_2] name("while/Less/y") {dtype = i32, value = dense<10> : tensor<i32>} : () -> (tensor<i32>)
+    %Less, %ctl_5 = Less(%Merge#0, %Const_3) name("while/Less") {T = i32} : (tensor<*xi32>, tensor<i32>) -> (tensor<*xi1>)
+    %LoopCond, %ctl_6 = LoopCond(%Less) name("while/LoopCond") : (tensor<*xi1>) -> (tensor<*xi1>)
+    %Switch:2, %ctl_7 = Switch(%Merge#0, %LoopCond) name("while/Switch") {T = i32, _class = ["loc:@while/Merge"]} : (tensor<*xi32>, tensor<*xi1>) -> (tensor<*xi32>, tensor<*xi32>)
+    %Identity, %ctl_8 = Identity(%Switch#1) name("while/Identity") {T = i32} : (tensor<*xi32>) -> (tensor<*xi32>)
+    %Const_9, %ctl_10 = Const [%ctl_8] name("while/Add/y") {dtype = i32, value = dense<1> : tensor<i32>} : () -> (tensor<i32>)
+    %Add, %ctl_11 = Add(%Identity, %Const_9) name("while/Add") {T = i32} : (tensor<*xi32>, tensor<i32>) -> (tensor<*xi32>)
+    %Exit, %ctl_12 = Exit(%Switch#0) name("while/Exit") {T = i32} : (tensor<*xi32>) -> (tensor<*xi32>)
+  }
+}
diff --git a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir
index 85c99a411879a1..ad6e2f4385277d 100644
--- a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir
+++ b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir
@@ -1494,9 +1494,41 @@ func @side_effecting_ops_with_different_resources_and_allocations(
 
 // -----
 
+// Tests that we create a dependency for op instances with
+// `TPUEmbeddingSideEffect` with same device ordinal.
+func @embedding_effect_same_device(
+  // expected-remark@above {{ID: 7}}
+  %arg0: tensor<!tf_type.string>) {
+  tf_executor.graph {
+    // expected-remark@above {{ID: 5}}
+    %island = tf_executor.island {
+        // expected-remark@above {{ID: 3}}
+        // expected-remark@above {{Successors: {4}}}
+        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0) {table_ids = [1, 2], device_ordinal = 1} : (tensor<!tf_type.string>) -> ()
+        // expected-remark@above {{ID: 0}}
+        // expected-remark@above {{Successors: {1}}}
+        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0) {table_ids = [1, 2], device_ordinal = 1} : (tensor<!tf_type.string>) -> ()
+        // expected-remark@above {{ID: 1}}
+        // expected-remark@above {{Predecessors: {0}}}
+        // expected-remark@above {{Successors: {2}}}
+        tf_executor.yield
+        // expected-remark@above {{ID: 2}}
+        // expected-remark@above {{Predecessors: {1}}}
+    }
+    tf_executor.fetch %island : !tf_executor.control
+    // expected-remark@above {{ID: 4}}
+    // expected-remark@above {{Predecessors: {3}}}
+  }
+  return
+  // expected-remark@above {{ID: 6}}
+  // expected-remark@above {{Sinks: {5}}}
+}
+
+// -----
+
 // Tests that we treat different op instances with `TPUEmbeddingSideEffect` as
-// independent.
-func @embedding_effect_ops(
+// independent if they have different device ordinals.
+func @embedding_effect_different_devices(
   // expected-remark@above {{ID: 7}}
   %arg0: tensor<!tf_type.string>) {
   tf_executor.graph {
@@ -1504,10 +1536,10 @@ func @embedding_effect_ops(
     %island = tf_executor.island {
         // expected-remark@above {{ID: 3}}
         // expected-remark@above {{Successors: {4}}}
-        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0){table_ids = [1, 2]} : (tensor<!tf_type.string>) -> ()
+        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0) {table_ids = [1, 2], device_ordinal = 1} : (tensor<!tf_type.string>) -> ()
         // expected-remark@above {{ID: 0}}
         // expected-remark@above {{Successors: {2}}}
-        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0){table_ids = [1, 2]} : (tensor<!tf_type.string>) -> ()
+        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0) {table_ids = [1, 2], device_ordinal = 2} : (tensor<!tf_type.string>) -> ()
         // expected-remark@above {{ID: 1}}
         // expected-remark@above {{Successors: {2}}}
         tf_executor.yield
@@ -1561,6 +1593,42 @@ func @mixed_embedding_and_unknown_effects(
 
 // -----
 
+// Tests that we don't create dependencies between ops `EnqueueTPUEmbedding`
+ // ops and other embedding ops that don't have a device ordinal.
+func @mixed_embedding_and_unknown_effects(
+  // expected-remark@above {{ID: 8}}
+  %arg0: tensor<!tf_type.string>,
+  %arg1: tensor<8xf32>,
+  %arg2: tensor<8xf32>) {
+  tf_executor.graph {
+    // expected-remark@above {{ID: 6}}
+    %island = tf_executor.island {
+        // expected-remark@above {{ID: 4}}
+        // expected-remark@above {{Successors: {5}}}
+        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0){table_ids = [1, 2], device_ordinal = 1} : (tensor<!tf_type.string>) -> ()
+        // expected-remark@above {{ID: 0}}
+        // expected-remark@above {{Successors: {3}}}
+        "tf.LoadTPUEmbeddingAdagradParameters"(%arg1, %arg2) {config = "", num_shards = 1 : i64, shard_id = 0 : i64, table_id = -1 : i64, table_name = "table1"} : (tensor<8xf32>, tensor<8xf32>) -> ()
+        // expected-remark@above {{ID: 1}}
+        // expected-remark@above {{Successors: {3}}}
+        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0){table_ids = [1, 2], device_ordinal = 2} : (tensor<!tf_type.string>) -> ()
+        // expected-remark@above {{ID: 2}}
+        // expected-remark@above {{Successors: {3}}}
+        tf_executor.yield
+        // expected-remark@above {{ID: 3}}
+        // expected-remark@above {{Predecessors: {0,1,2}}}
+    }
+    tf_executor.fetch %island : !tf_executor.control
+    // expected-remark@above {{ID: 5}}
+    // expected-remark@above {{Predecessors: {4}}}
+  }
+  return
+  // expected-remark@above {{ID: 7}}
+  // expected-remark@above {{Sinks: {6}}}
+}
+
+// -----
+
 // Tests that we create a dependency between two ops with the same op-based
 // write effect.
 func @same_op_based_write_effect(
@@ -1602,13 +1670,13 @@ func @different_op_based_side_effects(
     %island = tf_executor.island {
         // expected-remark@above {{ID: 4}}
         // expected-remark@above {{Successors: {5}}}
-        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0){table_ids = [1, 2]} : (tensor<!tf_type.string>) -> ()
+        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0){table_ids = [1, 2], device_ordinal = 1} : (tensor<!tf_type.string>) -> ()
         // expected-remark@above {{ID: 0}}
         // expected-remark@above {{Successors: {3}}}
         %0 = "tf.GeneratorDataset"(%arg0, %arg0, %arg0) {device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0", finalize_func = @__func_a, init_func = @__func_b, next_func = @__func_c, next_func.experimental_ints_on_device = true, operand_segment_sizes = dense<[1, 1, 1]> : vector<3xi32>, output_shapes = [#tf_type.shape<>], output_types = [!tf_type.string], metadata = ""} : (tensor<!tf_type.string>, tensor<!tf_type.string>, tensor<!tf_type.string>) -> tensor<!tf_type.variant>
         // expected-remark@above {{ID: 1}}
         // expected-remark@above {{Successors: {3}}}
-        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0){table_ids = [1, 2]} : (tensor<!tf_type.string>) -> ()
+        "tf.EnqueueTPUEmbeddingRaggedTensorBatch"(%arg0){table_ids = [1, 2], device_ordinal = 5} : (tensor<!tf_type.string>) -> ()
         // expected-remark@above {{ID: 2}}
         // expected-remark@above {{Successors: {3}}}
         tf_executor.yield
@@ -1700,10 +1768,7 @@ func @send_recv_effect(
 // -----
 
 // Tests that we create a dependency between ops with
-// `TF_TPUCompileExecuteSideEffect`. Note that this test also shows a case where
-// we could improve pruning of control dependencies (see b/201013649): The
-// dependency between the first `tf.TPUExecute` and the `tf_executor.yield` is
-// redundant.
+// `TF_TPUCompileExecuteSideEffect`.
 func @tpu_compile_execute_effect(
   // expected-remark@above {{ID: 7}}
   %arg0: tensor<!tf_type.string>,
@@ -1715,14 +1780,14 @@ func @tpu_compile_execute_effect(
         // expected-remark@above {{Successors: {4}}}
         "tf.TPUExecute"(%arg0, %arg0) : (tensor<!tf_type.string>, tensor<!tf_type.string>) -> ()
         // expected-remark@above {{ID: 0}}
-        // expected-remark@above {{Successors: {1,2}}}
+        // expected-remark@above {{Successors: {1}}}
         "tf.TPUExecute"(%arg1, %arg1) : (tensor<!tf_type.string>, tensor<!tf_type.string>) -> ()
         // expected-remark@above {{ID: 1}}
         // expected-remark@above {{Predecessors: {0}}}
         // expected-remark@above {{Successors: {2}}}
         tf_executor.yield
         // expected-remark@above {{ID: 2}}
-        // expected-remark@above {{Predecessors: {0,1}}}
+        // expected-remark@above {{Predecessors: {1}}}
     }
     tf_executor.fetch %island : !tf_executor.control
     // expected-remark@above {{ID: 4}}
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc b/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc
index eebbf6e2a18d75..e94e593ee8deb9 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/graph_pruning.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "mlir/Pass/PassRegistry.h"  // from @llvm-project
 #include "mlir/Transforms/RegionUtils.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes_detail.h"
 
@@ -118,10 +119,21 @@ void GraphPruningPass::runOnFunction() {
   getFunction().walk([this](tf_executor::GraphOp graph) { PruneGraph(graph); });
 }
 
-// An op should be preserved if its identifier is contained in
-// `ops_to_preserve_ids_`.
+// An op should be preserved if either its identifier is contained in
+// `ops_to_preserve_ids_` or if it has a `MustExecute` effect.
 bool GraphPruningPass::ShouldPreserveOp(Operation* op) {
-  return ops_to_preserve_ids_.contains(op->getName().getIdentifier());
+  if (ops_to_preserve_ids_.contains(op->getName().getIdentifier())) return true;
+
+  llvm::SmallVector<MemoryEffects::EffectInstance, 4> effects;
+  auto interface = dyn_cast<MemoryEffectOpInterface>(op);
+  if (interface) interface.getEffects(effects);
+
+  for (const auto& effect : effects) {
+    if (llvm::isa<TF::ResourceEffects::MustExecute>(effect.getResource())) {
+      return true;
+    }
+  }
+  return false;
 }
 
 // An island should be preserved if any of its inner ops should be preserved.
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc
index 817093cd5b2ba2..2b8f7b36520e25 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc
@@ -236,6 +236,49 @@ class ConvertConvOp : public OpConversionPattern<mhlo::ConvOp> {
                                               permutation);
   }
 
+  // Slices the input `value` if there are negative padding values in
+  // `explicit_padding`.
+  Value SliceNegativePadding(Value value, ArrayRef<int64_t> explicit_padding,
+                             ConversionPatternRewriter &rewriter) const {
+    // If no padding is negative return the input as is.
+    if (llvm::all_of(explicit_padding, [](int64_t pad) { return pad >= 0; })) {
+      return value;
+    }
+
+    auto input_type = value.getType().cast<RankedTensorType>();
+    auto input_shape = input_type.getShape();
+
+    llvm::SmallVector<int64_t, 4> start;
+    llvm::SmallVector<int64_t, 4> size;
+    start.reserve(explicit_padding.size() / 2);
+    size.reserve(explicit_padding.size() / 2);
+    for (int i = 0, e = explicit_padding.size() / 2; i < e; ++i) {
+      int64_t pre_padding = explicit_padding[2 * i];
+      int64_t post_padding = explicit_padding[2 * i + 1];
+      int64_t pre_slice = pre_padding < 0 ? -pre_padding : 0;
+      int64_t post_slice = post_padding < 0 ? -post_padding : 0;
+      start.push_back(pre_slice);
+      size.push_back(input_shape[i] - pre_slice - post_slice);
+    }
+
+    auto start_attr = rewriter.create<ConstOp>(
+        value.getLoc(),
+        DenseIntElementsAttr::get(
+            RankedTensorType::get({static_cast<int64_t>(start.size())},
+                                  rewriter.getI64Type()),
+            start));
+    auto size_attr = rewriter.create<ConstOp>(
+        value.getLoc(),
+        DenseIntElementsAttr::get(
+            RankedTensorType::get({static_cast<int64_t>(size.size())},
+                                  rewriter.getI64Type()),
+            size));
+    auto output_type = RankedTensorType::get(size, input_type.getElementType());
+
+    return rewriter.create<SliceOp>(value.getLoc(), output_type, value,
+                                    start_attr, size_attr);
+  }
+
   void CreateConvOp(mhlo::ConvOp conv_op, ArrayRef<int64_t> strides,
                     StringRef padding, ArrayRef<int64_t> explicit_padding,
                     ArrayRef<int64_t> dilation, bool is_depthwise_conv,
@@ -256,6 +299,12 @@ class ConvertConvOp : public OpConversionPattern<mhlo::ConvOp> {
         /*default_feature_dim=*/num_spatial_dims + 1,
         /*default_spatial_dim_start=*/0, num_spatial_dims, rewriter);
 
+    // Emulate negative padding with a slice and remove negative values from the
+    // padding vector.
+    Value sliced_lhs = SliceNegativePadding(lhs, explicit_padding, rewriter);
+    auto new_padding = llvm::to_vector<4>(llvm::map_range(
+        explicit_padding, [](int64_t dim) { return dim > 0 ? dim : 0; }));
+
     auto conv_output_type = conv_op.getType().cast<RankedTensorType>();
     DenseIntElementsAttr permutation;
     const bool need_transpose_output = NeedsReformatTypeAndPermutation(
@@ -292,19 +341,19 @@ class ConvertConvOp : public OpConversionPattern<mhlo::ConvOp> {
           rhs);
 
       output = rewriter.create<DepthwiseConv2dNativeOp>(
-          conv_op.getLoc(), conv_output_type, lhs, reshaped_filter,
+          conv_op.getLoc(), conv_output_type, sliced_lhs, reshaped_filter,
           rewriter.getI64ArrayAttr(strides),
           /*padding=*/rewriter.getStringAttr(padding),
-          /*explicit_paddings=*/rewriter.getI64ArrayAttr(explicit_padding),
+          /*explicit_paddings=*/rewriter.getI64ArrayAttr(new_padding),
           /*data_format=*/rewriter.getStringAttr("NHWC"),
           /*dilations=*/rewriter.getI64ArrayAttr(dilation));
     } else {
       output = rewriter.create<Conv2DOp>(
-          conv_op.getLoc(), conv_output_type, lhs, rhs,
+          conv_op.getLoc(), conv_output_type, sliced_lhs, rhs,
           rewriter.getI64ArrayAttr(strides),
           /*use_cudnn_on_gpu=*/rewriter.getBoolAttr(true),
           /*padding=*/rewriter.getStringAttr(padding),
-          /*explicit_paddings=*/rewriter.getI64ArrayAttr(explicit_padding),
+          /*explicit_paddings=*/rewriter.getI64ArrayAttr(new_padding),
           /*data_format=*/rewriter.getStringAttr("NHWC"),
           /*dilations=*/rewriter.getI64ArrayAttr(dilation));
     }
@@ -1234,9 +1283,48 @@ LogicalResult MatchBinaryReduceFunction<void>(mlir::Region &function) {
   return success();
 }
 
-// Converts an mhlo.reduce op with the specified BinaryOp as the reduction
-// operation into the specified TfOp.
-template <typename BinaryOp, typename TfOp>
+// Replace BinaryOp with a combination of TfBinaryOp and TfReduceOp if the
+// init value doesn't match the expection of TfReduceOp.
+template <typename TfReduceOp, typename TfBinOp>
+LogicalResult rewriteNonMatchInitValue(mhlo::ReduceOp reduce_op, Value input,
+                                       ConstOp reduction_indices,
+                                       ConversionPatternRewriter &rewriter) {
+  Value reduce_result = rewriter.create<TfReduceOp>(
+      reduce_op.getLoc(), reduce_op.getType(0), input, reduction_indices,
+      /*keep_dim=*/rewriter.getBoolAttr(false));
+  rewriter.replaceOpWithNewOp<TfBinOp>(reduce_op, reduce_op.getType(0),
+                                       reduce_result,
+                                       reduce_op.init_values()[0]);
+  return success();
+}
+
+// Cannot replace BinaryOp if the init value doesn't match the expection of
+// TfReduceOp and there is no corresponding TfBinaryOp.
+template <>
+LogicalResult rewriteNonMatchInitValue<TF::MaxOp, void>(
+    mhlo::ReduceOp reduce_op, Value input, ConstOp reduction_indices,
+    ConversionPatternRewriter &rewriter) {
+  return failure();
+}
+
+template <>
+LogicalResult rewriteNonMatchInitValue<TF::MinOp, void>(
+    mhlo::ReduceOp reduce_op, Value input, ConstOp reduction_indices,
+    ConversionPatternRewriter &rewriter) {
+  return failure();
+}
+
+// Converts a mhlo.reduce op with a mlho binary operation into a tensorflow
+// reduction operation. If the initial value can be ignored, then convert it
+// into a single TfReduceOp. Otherwise, convert it into a TfReduceOp followed by
+// a TfBinaryOp.
+// For example:
+//   1) A mhlo::ReduceOp on value `x` with a mhlo::AndOp and a constant initial
+// value `true` is converted to a TF::Any on value `x`.
+//   2) A mhlo::ReduceOp on value `x` with a mhlo::AndOp with a non-constant
+// initial value `y` is converted to a TF::Any on value `x`, followed by a
+// TF::And with initial value `y`.
+template <typename BinaryOp, typename TfReduceOp, typename TfBinaryOp = void>
 class ConvertReduceOpToTfOp : public OpConversionPattern<mhlo::ReduceOp> {
  public:
   using OpConversionPattern::OpConversionPattern;
@@ -1249,10 +1337,6 @@ class ConvertReduceOpToTfOp : public OpConversionPattern<mhlo::ReduceOp> {
     if (failed(MatchBinaryReduceFunction<BinaryOp>(reduce_op.body())))
       return failure();
 
-    // In `MatchReduceOpInput` function, we already match that the
-    // "mhlo::ReduceOp" only has one input, one init_value and one result.
-    if (failed(MatchInitValue(reduce_op.init_values()[0]))) return failure();
-
     auto input = reduce_op.inputs()[0];
 
     // Get reduction dimension.
@@ -1266,15 +1350,25 @@ class ConvertReduceOpToTfOp : public OpConversionPattern<mhlo::ReduceOp> {
     auto reduction_indices = rewriter.create<ConstOp>(
         reduce_op.getLoc(), dim_type, rewriter.getI64TensorAttr(reduce_dims));
 
-    rewriter.replaceOpWithNewOp<TfOp>(reduce_op, reduce_op.getType(0), input,
-                                      reduction_indices,
-                                      /*keep_dim=*/rewriter.getBoolAttr(false));
-    return success();
+    // In `MatchReduceOpInput` function, we already match that the
+    // "mhlo::ReduceOp" only has one input, one init_value and one result.
+
+    // If the init value matches with the init value expected for the target
+    // TfReduceOp, then replace the BinaryOp with a TfReduceOp. Otherwise,
+    // replace the BinaryOp with a TfBinaryOp and a TfReduceOp.
+    if (succeeded(MatchInitValue(reduce_op.init_values()[0]))) {
+      rewriter.replaceOpWithNewOp<TfReduceOp>(
+          reduce_op, reduce_op.getType(0), input, reduction_indices,
+          /*keep_dim=*/rewriter.getBoolAttr(false));
+      return success();
+    }
+    return rewriteNonMatchInitValue<TfReduceOp, TfBinaryOp>(
+        reduce_op, input, reduction_indices, rewriter);
   }
 
  private:
   // Checks that the init value matches with the init value expected for the
-  // target TfOp.
+  // target TfReduceOp.
   virtual LogicalResult MatchInitValue(Value init_value) const = 0;
 
   // This function tries to match that the "mhlo::ReduceOp" only has one
@@ -1292,7 +1386,7 @@ class ConvertReduceOpToTfOp : public OpConversionPattern<mhlo::ReduceOp> {
 };
 
 class ConvertReduceOpToTfSum
-    : public ConvertReduceOpToTfOp<mhlo::AddOp, TF::SumOp> {
+    : public ConvertReduceOpToTfOp<mhlo::AddOp, TF::SumOp, TF::AddOp> {
  public:
   using ConvertReduceOpToTfOp::ConvertReduceOpToTfOp;
 
@@ -1350,9 +1444,10 @@ class ConvertReduceOpToTfMin
 };
 
 class ConvertReduceOpToTfAll
-    : public ConvertReduceOpToTfOp<mhlo::AndOp, TF::AllOp> {
+    : public ConvertReduceOpToTfOp<mhlo::AndOp, TF::AllOp, TF::LogicalAndOp> {
  public:
-  using ConvertReduceOpToTfOp<mhlo::AndOp, TF::AllOp>::ConvertReduceOpToTfOp;
+  using ConvertReduceOpToTfOp<mhlo::AndOp, TF::AllOp,
+                              TF::LogicalAndOp>::ConvertReduceOpToTfOp;
 
   LogicalResult MatchInitValue(Value init_value) const override {
     DenseIntElementsAttr init_attr;
@@ -1365,9 +1460,10 @@ class ConvertReduceOpToTfAll
 };
 
 class ConvertReduceOpToTfAny
-    : public ConvertReduceOpToTfOp<mhlo::OrOp, TF::AnyOp> {
+    : public ConvertReduceOpToTfOp<mhlo::OrOp, TF::AnyOp, TF::LogicalOrOp> {
  public:
-  using ConvertReduceOpToTfOp<mhlo::OrOp, TF::AnyOp>::ConvertReduceOpToTfOp;
+  using ConvertReduceOpToTfOp<mhlo::OrOp, TF::AnyOp,
+                              TF::LogicalOrOp>::ConvertReduceOpToTfOp;
 
   LogicalResult MatchInitValue(Value init_value) const override {
     DenseIntElementsAttr init_attr;
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tfg-to-tfe.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tfg-to-tfe.cc
index 62e0782694378e..dc5212f5a3340e 100644
--- a/tensorflow/compiler/mlir/tensorflow/transforms/tfg-to-tfe.cc
+++ b/tensorflow/compiler/mlir/tensorflow/transforms/tfg-to-tfe.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "mlir/IR/Attributes.h"  // from @llvm-project
+#include "mlir/IR/Builders.h"  // from @llvm-project
 #include "mlir/IR/BuiltinAttributes.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/IR/SymbolTable.h"  // from @llvm-project
@@ -27,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes_detail.h"
+#include "tensorflow/core/transforms/toposort/toposort_pass.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace mlir {
@@ -97,6 +99,24 @@ static mlir::LogicalResult ReformatOpAttributes(
   return mlir::success();
 }
 
+// Split the tfg.NextIteration into tf_executor::NextIterationSourceOp and
+// tf_executor::NextIterationSinkOp to break the cycle introduced by itself.
+static void SplitNextIteration(Block &block) {
+  // TODO(b/207144333): Supports callback for unregistered ops
+  block.walk([&](Operation *op) {
+    if (!op->getName().getStringRef().equals("tfg.NextIteration")) return;
+    mlir::OpBuilder builder(op);
+    auto source_op = builder.create<tf_executor::NextIterationSourceOp>(
+        op->getLoc(), op->getOperand(0).getType());
+    builder.create<tf_executor::NextIterationSinkOp>(
+        op->getLoc(), source_op.token(), /*input=*/op->getOperand(0),
+        /*controlInputs=*/op->getOperands().drop_front());
+    op->replaceAllUsesWith(
+        ValueRange({source_op.output(), source_op.control()}));
+    op->erase();
+  });
+}
+
 class ConvertGraphOp : public OpConversionPattern<tfg::GraphOp> {
  public:
   using OpConversionPattern::OpConversionPattern;
@@ -176,13 +196,131 @@ class ConvertReturnOp : public OpConversionPattern<tfg::ReturnOp> {
   LogicalResult matchAndRewrite(
       tfg::ReturnOp ret, OpAdaptor adaptor,
       ConversionPatternRewriter &rewriter) const final {
-    // TODO(chiahungduan): Handle control attribute
     rewriter.replaceOpWithNewOp<ReturnOp>(ret.getOperation(),
                                           adaptor.getOperands());
     return success();
   }
 };
 
+class ConvertControlTriggerOp : public ConversionPattern {
+ public:
+  explicit ConvertControlTriggerOp(MLIRContext *context)
+      : ConversionPattern("tfg.ControlTrigger", PatternBenefit(1), context) {}
+
+  LogicalResult matchAndRewrite(
+      Operation *op, llvm::ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const final {
+    llvm::SmallVector<Type, 2> new_types(op->getResultTypes());
+    new_types.back() = rewriter.getType<tf_executor::ControlType>();
+
+    rewriter.replaceOpWithNewOp<tf_executor::ControlTriggerOp>(
+        op, new_types, operands, op->getAttrs());
+    return success();
+  }
+};
+
+class ConvertEnterOp : public ConversionPattern {
+ public:
+  explicit ConvertEnterOp(MLIRContext *context)
+      : ConversionPattern("tfg.Enter", PatternBenefit(1), context) {}
+
+  LogicalResult matchAndRewrite(
+      Operation *op, llvm::ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const final {
+    llvm::SmallVector<Type, 2> new_types(op->getResultTypes());
+    new_types.back() = rewriter.getType<tf_executor::ControlType>();
+
+    rewriter.replaceOpWithNewOp<tf_executor::EnterOp>(op, new_types, operands,
+                                                      op->getAttrs());
+    return success();
+  }
+};
+
+class ConvertExitOp : public ConversionPattern {
+ public:
+  explicit ConvertExitOp(MLIRContext *context)
+      : ConversionPattern("tfg.Exit", PatternBenefit(1), context) {}
+
+  LogicalResult matchAndRewrite(
+      Operation *op, llvm::ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const final {
+    llvm::SmallVector<Type, 2> new_types(op->getResultTypes());
+    new_types.back() = rewriter.getType<tf_executor::ControlType>();
+
+    rewriter.replaceOpWithNewOp<tf_executor::ExitOp>(op, new_types, operands,
+                                                     op->getAttrs());
+    return success();
+  }
+};
+
+class ConvertLoopCondOp : public ConversionPattern {
+ public:
+  explicit ConvertLoopCondOp(MLIRContext *context)
+      : ConversionPattern("tfg.LoopCond", PatternBenefit(1), context) {}
+
+  LogicalResult matchAndRewrite(
+      Operation *op, llvm::ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const final {
+    llvm::SmallVector<Type, 2> new_types(op->getResultTypes());
+    new_types.back() = rewriter.getType<tf_executor::ControlType>();
+
+    rewriter.replaceOpWithNewOp<tf_executor::LoopCondOp>(
+        op, new_types, operands, op->getAttrs());
+    return success();
+  }
+};
+
+class ConvertMergeOp : public ConversionPattern {
+ public:
+  explicit ConvertMergeOp(MLIRContext *context)
+      : ConversionPattern("tfg.Merge", PatternBenefit(1), context) {}
+
+  LogicalResult matchAndRewrite(
+      Operation *op, llvm::ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const final {
+    llvm::SmallVector<Type, 2> new_types(op->getResultTypes());
+    new_types.back() = rewriter.getType<tf_executor::ControlType>();
+
+    rewriter.replaceOpWithNewOp<tf_executor::MergeOp>(op, new_types, operands,
+                                                      op->getAttrs());
+    return success();
+  }
+};
+
+class ConvertSwitchOp : public ConversionPattern {
+ public:
+  explicit ConvertSwitchOp(MLIRContext *context)
+      : ConversionPattern("tfg.Switch", PatternBenefit(1), context) {}
+
+  LogicalResult matchAndRewrite(
+      Operation *op, llvm::ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const final {
+    llvm::SmallVector<Type, 2> new_types(op->getResultTypes());
+    new_types.back() = rewriter.getType<tf_executor::ControlType>();
+
+    rewriter.replaceOpWithNewOp<tf_executor::SwitchOp>(op, new_types, operands,
+                                                       op->getAttrs());
+    return success();
+  }
+};
+
+class ConvertSwitchNOp : public ConversionPattern {
+ public:
+  explicit ConvertSwitchNOp(MLIRContext *context)
+      : ConversionPattern("tfg.SwitchN", PatternBenefit(1), context) {}
+
+  LogicalResult matchAndRewrite(
+      Operation *op, llvm::ArrayRef<Value> operands,
+      ConversionPatternRewriter &rewriter) const final {
+    llvm::SmallVector<Type, 2> new_types(op->getResultTypes());
+    new_types.back() = rewriter.getType<tf_executor::ControlType>();
+
+    rewriter.replaceOpWithNewOp<tf_executor::SwitchNOp>(op, new_types, operands,
+                                                        op->getAttrs());
+    return success();
+  }
+};
+
 class ConvertGeneralOp : public ConversionPattern {
  public:
   ConvertGeneralOp(MLIRContext *context,
@@ -200,17 +338,23 @@ class ConvertGeneralOp : public ConversionPattern {
     // Update the control type from tf_type.control to tf_executor.control.
     new_types.back() = rewriter.getType<tf_executor::ControlType>();
 
-    llvm::SmallVector<Value> island_control_ops;
-
-    // TODO(chiahungduan): Handle control operands.
-    for (const auto &type : op->getOperandTypes()) {
-      if (type.isa<tfg::ControlType>())
-        return op->emitError(
-            "conversion of control operand is not supported yet");
+    // Control operand is attached on tf_executor::IslandOp.
+    llvm::SmallVector<Value> island_control_operands;
+    llvm::SmallVector<Value> inner_op_operands;
+
+    for (Value value : operands) {
+      // Because of the property of graph region, the control operands may
+      // not have been converted to tf_executor::ControlType.
+      if (value.getType().isa<tfg::ControlType>() ||
+          value.getType().isa<tf_executor::ControlType>()) {
+        island_control_operands.push_back(value);
+      } else {
+        inner_op_operands.push_back(value);
+      }
     }
 
-    auto island = rewriter.create<tf_executor::IslandOp>(loc, new_types,
-                                                         island_control_ops);
+    auto island = rewriter.create<tf_executor::IslandOp>(
+        loc, new_types, island_control_operands);
     island.body().push_back(new mlir::Block);
 
     rewriter.setInsertionPointToEnd(&island.body().front());
@@ -236,7 +380,7 @@ class ConvertGeneralOp : public ConversionPattern {
       std::string tf_op_name = llvm::formatv(
           "{0}.{1}", TF::TensorFlowDialect::getDialectNamespace(), op_name);
       OperationState state =
-          OperationState(loc, tf_op_name, operands, new_types, attrs,
+          OperationState(loc, tf_op_name, inner_op_operands, new_types, attrs,
                          op->getSuccessors(), new_regions);
       inner_op = rewriter.createOperation(state);
     } else {
@@ -246,8 +390,9 @@ class ConvertGeneralOp : public ConversionPattern {
             op->getAttrOfType<BoolAttr>("_disable_call_shape_inference")
                 .getValue();
       }
-      inner_op = rewriter.create<LegacyCallOp>(
-          loc, new_types, operands, op_name, disable_call_shape_inference);
+      inner_op =
+          rewriter.create<LegacyCallOp>(loc, new_types, inner_op_operands,
+                                        op_name, disable_call_shape_inference);
     }
 
     rewriter.create<tf_executor::YieldOp>(loc, inner_op->getResults());
@@ -296,8 +441,28 @@ void LegalizeTFGToTFE::runOnOperation() {
   patterns.insert<ConvertGraphFuncOp>(&context);
   patterns.insert<ConvertReturnOp>(&context);
   patterns.insert<ConvertGeneralOp>(&context, func_symbols);
+  // Control flow V1 operation conversion patterns.
+  patterns.insert<ConvertControlTriggerOp>(&context);
+  patterns.insert<ConvertEnterOp>(&context);
+  patterns.insert<ConvertExitOp>(&context);
+  patterns.insert<ConvertLoopCondOp>(&context);
+  patterns.insert<ConvertMergeOp>(&context);
+  patterns.insert<ConvertSwitchOp>(&context);
+  patterns.insert<ConvertSwitchNOp>(&context);
   FrozenRewritePatternSet finalPatterns(std::move(patterns));
 
+  // Turn the graph region into SSACFG region by applying an order to the
+  // operations.
+  for (auto &op : module.body().getOps()) {
+    for (auto &region : op.getRegions()) {
+      for (auto &block : region) {
+        // Split tfg.NextIteration to break the cycle.
+        SplitNextIteration(block);
+        tfg::SortTopologically(&block);
+      }
+    }
+  }
+
   // Version information is embedded in graph operation in TFG. In TFE, it's
   // embedded in the module operation.
   for (auto &op : module.body().getOps()) {
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc
index c77107c8de7d38..113a9062d65215 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "mlir/IR/Verifier.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h"
+#include "tensorflow/core/ir/importexport/import.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/path.h"
@@ -78,10 +79,22 @@ Status DumpTextualIRToFile(const MlirDumpConfig& config, const Graph& graph,
     GraphImportConfig import_config;
     import_config.graph_as_function = true;
     import_config.prune_unused_nodes = false;
-    TF_ASSIGN_OR_RETURN(
-        module, ConvertGraphToMlir(graph, debug_info,
-                                   flib_def ? *flib_def : graph.flib_def(),
-                                   import_config, &context));
+    switch (config.dialect) {
+      case MlirDumpConfig::Dialect::kTFG: {
+        TF_ASSIGN_OR_RETURN(module,
+                            mlir::tfg::ImportGraphAndFunctionsToMlir(
+                                &context, graph, debug_info,
+                                flib_def ? *flib_def : graph.flib_def()));
+        break;
+      }
+      case MlirDumpConfig::Dialect::kTFExecutor: {
+        TF_ASSIGN_OR_RETURN(
+            module, ConvertGraphToMlir(graph, debug_info,
+                                       flib_def ? *flib_def : graph.flib_def(),
+                                       import_config, &context));
+        break;
+      }
+    }
     if (failed(mlir::verify(*module))) {
       return status_handler.ConsumeStatus();
     }
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.h b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.h
index b5976420231ddf..50d884db00543a 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.h
+++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.h
@@ -35,6 +35,13 @@ Status DumpTextualIRToFile(const MlirDumpConfig& config, const Graph& graph,
 
 // Config of the textual dump.
 struct MlirDumpConfig {
+  enum class Dialect {
+    // Tensorflow Executor Dialect
+    kTFExecutor,
+    // Tensorflow Graph Dialect
+    kTFG,
+  };
+
   // The limit of element size that gets printed.
   MlirDumpConfig& elide_large_attributes(int large_element_limit = 16) {
     this->op_printing_flags.elideLargeElementsAttrs(large_element_limit);
@@ -49,8 +56,16 @@ struct MlirDumpConfig {
     return *this;
   }
 
+  MlirDumpConfig& emit_dialect(Dialect dialect) {
+    this->dialect = dialect;
+    return *this;
+  }
+
   // Op printing flags.
   mlir::OpPrintingFlags op_printing_flags = llvm::None;
+
+  // The target MLIR dialect.
+  Dialect dialect = Dialect::kTFExecutor;
 };
 
 // Change DumpGraphToFile to dump MLIR textual IR instead of protobuf.
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc
index f5784a8e6d184d..aef97c57afaf67 100644
--- a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc
+++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph_test.cc
@@ -31,6 +31,11 @@ void ExpectHasSubstr(const string& s, const string& expected) {
       << "'" << s << "' does not contain '" << expected << "'";
 }
 
+void ExpectHasNoSubstr(const string& s, const string& expected) {
+  EXPECT_FALSE(absl::StrContains(s, expected))
+      << "'" << s << "' should not contain '" << expected << "'";
+}
+
 // WritableFile that simply concats into string.
 class StringWritableFile : public WritableFile {
  public:
@@ -92,5 +97,24 @@ TEST(Dump, TexualIrWithOptions) {
   ExpectHasSubstr(actual, expected_substr);
 }
 
+TEST(Dump, DumpToTFG) {
+  Graph graph(OpRegistry::Global());
+  Node* node;
+  TF_CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node));
+
+  string actual;
+  StringWritableFile file(&actual);
+
+  TF_ASSERT_OK(DumpTextualIRToFile(
+      MlirDumpConfig().emit_dialect(MlirDumpConfig::Dialect::kTFG), graph,
+      /*flib_def=*/nullptr, &file));
+
+  string expected_substr("tfg.graph");
+  ExpectHasSubstr(actual, expected_substr);
+
+  string not_expected_substr("tf_executor.island");
+  ExpectHasNoSubstr(actual, not_expected_substr);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc b/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc
index 7d2ab7e1b1ddf3..a2fdc4705d7fc5 100644
--- a/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc
+++ b/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc
@@ -43,8 +43,6 @@ class RewriteQuantizedIOPass
 };
 
 void RewriteQuantizedIOPass::runOnOperation() {
-  void runOnOperation();
-
   ModuleOp module = getOperation();
   OpBuilder builder(module);
   module.walk([&](FuncOp func) {
diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD
index 1825f8f43a8888..ce260ee4b40f77 100644
--- a/tensorflow/compiler/mlir/tfrt/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/BUILD
@@ -108,6 +108,7 @@ cc_library(
         "//tensorflow/compiler/mlir/xla:xla_legalize_tf",
         "@llvm-project//llvm:Support",
         "@llvm-project//mlir:Async",
+        "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LinalgTransforms",
         "@llvm-project//mlir:MemRefDialect",
@@ -150,11 +151,13 @@ tfrt_cc_library(
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_cpurt_passes",
         "//tensorflow/core:framework",
+        "//tensorflow/core:platform_base",
         "//tensorflow/core/platform:dynamic_annotations",
         "//tensorflow/core/profiler/lib:traceme",
         "//tensorflow/core/runtime_fallback/kernel:kernel_fallback_compat_request_state",
         "//tensorflow/core/tfrt/utils:fallback_tensor",
         "@llvm-project//mlir:Async",
+        "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:ExecutionEngine",
         "@llvm-project//mlir:ExecutionEngineUtils",
         "@llvm-project//mlir:Transforms",
@@ -607,6 +610,7 @@ tf_cc_binary(
         "//tensorflow/compiler/mlir/tensorflow",
         "//tensorflow/compiler/mlir/tensorflow:bridge_pass_test_pipeline_registration",
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes",
+        "//tensorflow/compiler/mlir/tfrt:tf_cpurt_opdefs",
         "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_cpurt_passes",
         "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_cpurt_test_passes",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/BUILD b/tensorflow/compiler/mlir/tfrt/benchmarks/BUILD
index f18b463b9cc8b4..48abce8260c3e0 100644
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/BUILD
@@ -22,9 +22,9 @@ cc_library(
         "//tensorflow/core/platform:logging",
         "//third_party/eigen3",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:Support",
-        "@llvm-project//mlir:Transforms",
         "@llvm-project//mlir:mlir_c_runner_utils",
         "@tf_runtime//:dtype",
         "@tf_runtime//:hostcontext",
@@ -168,7 +168,10 @@ tf_cc_binary(
     name = "transpose_op_benchmark",
     testonly = 1,
     srcs = ["transpose_op_benchmark.cc"],
-    deps = [":benchmark_mlir_function"],
+    deps = [
+        ":benchmark_mlir_function",
+        "@llvm-project//llvm:Support",
+    ],
 )
 
 cc_library(
@@ -180,12 +183,9 @@ cc_library(
 )
 
 tf_cc_binary(
-    name = "softmax_benchmark",
+    name = "softmax_op_benchmark",
     testonly = 1,
-    srcs = [
-        "softmax_benchmark.cc",
-        "softmax_benchmark.h",
-    ],
+    srcs = ["softmax_op_benchmark.cc"],
     # Args() not supported. Enable when we got rid of tf benchmark and use the
     # standard gunit benchmark.
     tags = if_oss([
@@ -194,59 +194,75 @@ tf_cc_binary(
     ]),
     deps = [
         ":benchmark",
-        "//tensorflow/core:framework_lite",
-        "//third_party/eigen3",
+        ":benchmark_mlir_function",
+        "@llvm-project//llvm:Support",
     ],
 )
 
 tf_cc_binary(
-    name = "reduction_1D_all_benchmark",
+    name = "sum_1d_op_benchmark",
     testonly = 1,
-    srcs = ["reduction_1D_all_benchmark.cc"],
+    srcs = ["sum_1d_op_benchmark.cc"],
     # Args() not supported. Enable when we got rid of tf benchmark and use the
     # standard gunit benchmark.
     tags = if_oss([
         "no_oss",
         "manual",
     ]),
-    deps = [":reduction_benchmark"],
+    deps = [
+        ":benchmark",
+        ":benchmark_mlir_function",
+        ":reduction_benchmark",
+    ],
 )
 
 tf_cc_binary(
-    name = "reduction_2D_all_benchmark",
+    name = "sum_2d_op_benchmark",
     testonly = 1,
-    srcs = ["reduction_2D_all_benchmark.cc"],
+    srcs = ["sum_2d_op_benchmark.cc"],
     # Args() not supported. Enable when we got rid of tf benchmark and use the
     # standard gunit benchmark.
     tags = if_oss([
         "no_oss",
         "manual",
     ]),
-    deps = [":reduction_benchmark"],
+    deps = [
+        ":benchmark",
+        ":benchmark_mlir_function",
+        ":reduction_benchmark",
+    ],
 )
 
 tf_cc_binary(
-    name = "reduction_2D_row_benchmark",
+    name = "sum_col_op_benchmark",
     testonly = 1,
-    srcs = ["reduction_2D_row_benchmark.cc"],
+    srcs = ["sum_col_op_benchmark.cc"],
     # Args() not supported. Enable when we got rid of tf benchmark and use the
     # standard gunit benchmark.
     tags = if_oss([
         "no_oss",
         "manual",
     ]),
-    deps = [":reduction_benchmark"],
+    deps = [
+        ":benchmark",
+        ":benchmark_mlir_function",
+        ":reduction_benchmark",
+    ],
 )
 
 tf_cc_binary(
-    name = "reduction_2D_column_benchmark",
+    name = "sum_row_op_benchmark",
     testonly = 1,
-    srcs = ["reduction_2D_column_benchmark.cc"],
+    srcs = ["sum_row_op_benchmark.cc"],
     # Args() not supported. Enable when we got rid of tf benchmark and use the
     # standard gunit benchmark.
     tags = if_oss([
         "no_oss",
         "manual",
     ]),
-    deps = [":reduction_benchmark"],
+    deps = [
+        ":benchmark",
+        ":benchmark_mlir_function",
+        ":reduction_benchmark",
+    ],
 )
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.cc
index a190b12f30bd5d..5fedf9895f4cc1 100644
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.cc
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h"
 
+#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
 #include "mlir/ExecutionEngine/CRunnerUtils.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/Support/FileUtilities.h"
-#include "mlir/Transforms/Bufferize.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "mlir/IR/BuiltinTypes.h"  // from @llvm-project
 #include "tensorflow/core/platform/logging.h"
@@ -77,7 +77,7 @@ JitExecutable& CreateJitExecutable(
       tensorflow::CreateTfCpuRtPipeline(pm, tf_cpurt_opts);
     };
   }
-  opts.type_converter = mlir::BufferizeTypeConverter();
+  opts.type_converter = mlir::bufferization::BufferizeTypeConverter();
 
   // Cache all jit executables, otherwise different benchmark runs will produce
   // different .so files and the same compiled function will have different
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.cc
index 14642666a395d2..d9bbb969db9a2e 100644
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.cc
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.cc
@@ -15,7 +15,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h"
 
+#include <algorithm>
+#include <functional>
 #include <memory>
+#include <utility>
 
 #include "llvm/Support/SourceMgr.h"
 #include "mlir/Parser.h"  // from @llvm-project
@@ -148,6 +151,7 @@ void RunCpurtBenchmark(::testing::benchmark::State& state,
     LOG(FATAL) << "Failed to initialize call frame";
 
   for (auto _ : state) {
+    call_frame.args[0] = nullptr;  // reset kernel context argument
     (*executable)->Execute(call_frame, exec_ctx);
     if (auto err =
             (*executable)->ReturnResults(converter, exec_ctx, &call_frame))
@@ -243,7 +247,8 @@ void RunTfrtBenchmark(::testing::benchmark::State& state,
 
   // Create a HostContext for running TFRT functions. Concurrent work queue acts
   // similar to the Tensorflow `inter-op` thread pool, so we'll match the size.
-  auto host = CreateMultiThreadedHostContext(num_threads);
+  auto host = num_threads ? CreateMultiThreadedHostContext(num_threads)
+                          : CreateSingleThreadedHostContext();
   tfrt::RegisterStaticKernels(host->GetMutableRegistry());
 
   // Convert module to BEF.
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h
index 13cdfe5616b711..35442ad7fc0636 100644
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_BENCHMARKS_BENCHMARK_MLIR_FUNCTION_H_
 #define TENSORFLOW_COMPILER_MLIR_TFRT_BENCHMARKS_BENCHMARK_MLIR_FUNCTION_H_
 
+#include <functional>
+
 #include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h"
 
 namespace tensorflow {
@@ -59,7 +61,7 @@ void RunEigenBenchmark(
   }                                                                 \
   BENCHMARK(BM_cpurt_##NAME)->MeasureProcessCPUTime()
 
-#define BM_CpurtVectorized(NAME, MLIR_INPUT, FN, INPUT_SPEC)         \
+#define BM_CpurtV(NAME, MLIR_INPUT, FN, INPUT_SPEC)                  \
   static void BM_cpurtv_##NAME(::testing::benchmark::State& state) { \
     RunCpurtBenchmark(state, MLIR_INPUT, FN, INPUT_SPEC, true);      \
   }                                                                  \
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/compute_function_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/compute_function_benchmark.cc
index 7e59516f0036ba..00bce0beedd0a8 100644
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/compute_function_benchmark.cc
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/compute_function_benchmark.cc
@@ -155,7 +155,7 @@ static llvm::SmallVector<InputTensorSpec> InputsFresh1() {
 }
 
 BM(Cpurt(Fresh1, mlir_fresh1, "compute", InputsFresh1()));
-BM(CpurtVectorized(Fresh1, mlir_fresh1, "compute", InputsFresh1()));
+BM(CpurtV(Fresh1, mlir_fresh1, "compute", InputsFresh1()));
 BM(Tfrt(Fresh1, mlir_fresh1, "compute", InputsFresh1()));
 
 static const char* const mlir_fresh2 = R"(
@@ -201,8 +201,113 @@ static llvm::SmallVector<InputTensorSpec> InputsFresh2() {
 }
 
 BM(Cpurt(Fresh2, mlir_fresh2, "compute", InputsFresh2()));
-BM(CpurtVectorized(Fresh2, mlir_fresh2, "compute", InputsFresh2()));
+BM(CpurtV(Fresh2, mlir_fresh2, "compute", InputsFresh2()));
 BM(Tfrt(Fresh2, mlir_fresh2, "compute", InputsFresh2()));
 
+static const char* const mlir_factorized0 = R"(
+  func @compute(%arg0: tensor<?x10xf32>) -> tensor<?x10xf32> {
+    %cst = "tf.Const"()
+          {value = dense<1.000000e+00> : tensor<f32>,
+           device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : () -> tensor<f32>
+    %cst_0 = "tf.Const"()
+          {value = dense<0.00508870464> : tensor<f32>,
+           device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : () -> tensor<f32>
+    %cst_1 = "tf.Const"()
+          {value = dense<8.52547168> : tensor<f32>,
+          device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : () -> tensor<f32>
+    %0 = "tf.Sub"(%arg0, %cst_1)
+          {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : (tensor<?x10xf32>, tensor<f32>) -> tensor<?x10xf32>
+    %1 = "tf.Mul"(%0, %cst_0)
+          {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : (tensor<?x10xf32>, tensor<f32>) -> tensor<?x10xf32>
+    %2 = "tf.Abs"(%1)
+          {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : (tensor<?x10xf32>) -> tensor<?x10xf32>
+    %3 = "tf.Minimum"(%2, %cst)
+          {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : (tensor<?x10xf32>, tensor<f32>) -> tensor<?x10xf32>
+    %4 = "tf.Maximum"(%cst, %2)
+          {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : (tensor<f32>, tensor<?x10xf32>) -> tensor<?x10xf32>
+    %5 = "tf.Log"(%4)
+          {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : (tensor<?x10xf32>) -> tensor<?x10xf32>
+    %6 = "tf.AddV2"(%3, %5)
+          {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<?x10xf32>
+    %7 = "tf.Sign"(%1)
+          {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : (tensor<?x10xf32>) -> tensor<?x10xf32>
+    %8 = "tf.Mul"(%7, %6)
+          {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+          : (tensor<?x10xf32>, tensor<?x10xf32>) -> tensor<?x10xf32>
+    return %8 : tensor<?x10xf32>
+  }
+)";
+
+static llvm::SmallVector<InputTensorSpec> InputsFactorized0() {
+  return {
+      InputTensorSpec(DT_FLOAT, {1000, 10})  // %arg0
+  };
+}
+
+BM(Cpurt(Factorized0, mlir_factorized0, "compute", InputsFactorized0()));
+BM(CpurtV(Factorized0, mlir_factorized0, "compute", InputsFactorized0()));
+BM(Tfrt(Factorized0, mlir_factorized0, "compute", InputsFactorized0()));
+
+static const char* const mlir_factorized1 = R"(
+  func @compute(%arg0: tensor<?x50xf32>,
+                %arg1: tensor<50xf32>) -> tensor<?x50xf32> {
+    %0 = "tf.BiasAdd"(%arg0, %arg1)
+         {data_format = "NHWC",
+          device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+         : (tensor<?x50xf32>, tensor<50xf32>) -> tensor<?x50xf32>
+    %1 = "tf.Relu"(%0)
+         {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+         : (tensor<?x50xf32>) -> tensor<?x50xf32>
+    return %1 : tensor<?x50xf32>
+  }
+)";
+
+static llvm::SmallVector<InputTensorSpec> InputsFactorized1() {
+  return {
+      InputTensorSpec(DT_FLOAT, {1000, 50}),  // %arg0
+      InputTensorSpec(DT_FLOAT, {50}),        // %arg1
+  };
+}
+
+BM(Cpurt(Factorized1, mlir_factorized1, "compute", InputsFactorized1()));
+BM(CpurtV(Factorized1, mlir_factorized1, "compute", InputsFactorized1()));
+BM(Tfrt(Factorized1, mlir_factorized1, "compute", InputsFactorized1()));
+
+static const char* const mlir_factorized2 = R"(
+  func @compute(%arg0: tensor<?x50x1x5xf32>,
+                %arg1: tensor<5xf32>) -> tensor<?x50x1x5xf32> {
+    %0 = "tf.BiasAdd"(%arg0, %arg1)
+         {data_format = "NHWC",
+          device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+         : (tensor<?x50x1x5xf32>, tensor<5xf32>) -> tensor<?x50x1x5xf32>
+    %1 = "tf.Relu"(%0)
+         {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+         : (tensor<?x50x1x5xf32>) -> tensor<?x50x1x5xf32>
+    return %1 : tensor<?x50x1x5xf32>
+  }
+)";
+
+static llvm::SmallVector<InputTensorSpec> InputsFactorized2() {
+  return {
+      InputTensorSpec(DT_FLOAT, {1000, 50, 1, 5}),  // %arg0
+      InputTensorSpec(DT_FLOAT, {5}),               // %arg1
+  };
+}
+
+BM(Cpurt(Factorized2, mlir_factorized2, "compute", InputsFactorized2()));
+BM(CpurtV(Factorized2, mlir_factorized2, "compute", InputsFactorized2()));
+BM(Tfrt(Factorized2, mlir_factorized2, "compute", InputsFactorized2()));
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_1D_all_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_1D_all_benchmark.cc
deleted file mode 100644
index dc368d0e134ce3..00000000000000
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_1D_all_benchmark.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h"
-
-namespace tensorflow {
-namespace {
-
-BM_TFMlir1(AllReduceDynamic, f32, /* num_threads */ 0,
-           MlirSpec("tf.Sum", "f32", {kDynamicDim}, /*dims_to_reduce=*/{0}));
-BM_TFMlir1(AllReduceStatic, f32, /* num_threads */ 0,
-           MlirSpec("tf.Sum", "f32", {kStaticDim}, /*dims_to_reduce=*/{0}));
-BM_Eigen1(AllReduce, f32, /* num_threads */ 0);
-
-// BM_TFMlir1(AllReduce, f32, /* num_threads */ 8,
-//           MlirSpec("tf.Sum", "f32", {kDynamicDim}, /*dims_to_reduce=*/{0}));
-// BM_Eigen1(AllReduce, f32, /* num_threads */ 8);
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_2D_all_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_2D_all_benchmark.cc
deleted file mode 100644
index bf67a0b88750f9..00000000000000
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_2D_all_benchmark.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h"
-
-namespace tensorflow {
-namespace {
-
-// All reduce
-BM_TFMlir2(AllReduceDynamicAll, f32, /* num_threads */ 0,
-           MlirSpec("tf.Sum", "f32", {kDynamicDim, kDynamicDim},
-                    /*dims_to_reduce=*/{0, 1}));
-BM_TFMlir2(AllReduceDynamicAll, f32, /* num_threads */ 8,
-           MlirSpec("tf.Sum", "f32", {kDynamicDim, kDynamicDim},
-                    /*dims_to_reduce=*/{0, 1}));
-// TODO(b/200348349): Re-enable after the lowering is fixed.
-// BM_TFMlir2(AllReduceStaticRow, f32, /* num_threads */ 0,
-//           MlirSpec("tf.Sum", "f32", {kStaticDim, kDynamicDim},
-//                    /*dims_to_reduce=*/{0, 1}));
-// BM_TFMlir2(AllReduceStaticCol, f32, /* num_threads */ 0,
-//           MlirSpec("tf.Sum", "f32", {kDynamicDim, kStaticDim},
-//                    /*dims_to_reduce=*/{0, 1}));
-// BM_TFMlir2(AllReduceStaticAll, f32, /* num_threads */ 0,
-//           MlirSpec("tf.Sum", "f32", {kStaticDim, kStaticDim},
-//                    /*dims_to_reduce=*/{0, 1}));
-BM_Eigen2(AllReduce, f32, /* num_threads */ 0,
-          /*output rank=*/0, EigenSpec({0, 1}));
-BM_Eigen2(AllReduce, f32, /* num_threads */ 8,
-          /*output rank=*/0, EigenSpec({0, 1}));
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_2D_column_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_2D_column_benchmark.cc
deleted file mode 100644
index cc29193403585f..00000000000000
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_2D_column_benchmark.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h"
-
-namespace tensorflow {
-namespace {
-
-// Column reduction
-BM_TFMlir2(ColReduceDynamicAll, f32, /* num_threads */ 0,
-           MlirSpec("tf.Sum", "f32", {kDynamicDim, kDynamicDim},
-                    /*dims_to_reduce=*/{0}));
-BM_TFMlir2(ColReduceStaticRow, f32, /* num_threads */ 0,
-           MlirSpec("tf.Sum", "f32", {kStaticDim, kDynamicDim},
-                    /*dims_to_reduce=*/{0}));
-BM_TFMlir2(ColReduceStaticCol, f32, /* num_threads */ 0,
-           MlirSpec("tf.Sum", "f32", {kDynamicDim, kStaticDim},
-                    /*dims_to_reduce=*/{0}));
-BM_TFMlir2(ColReduceStaticAll, f32, /* num_threads */ 0,
-           MlirSpec("tf.Sum", "f32", {kStaticDim, kStaticDim},
-                    /*dims_to_reduce=*/{0}));
-BM_Eigen2(ColReduce, f32, /* num_threads */ 0, /*output rank=*/1,
-          EigenSpec({0}));
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_2D_row_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_2D_row_benchmark.cc
deleted file mode 100644
index 33b0166e8b1c58..00000000000000
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_2D_row_benchmark.cc
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h"
-
-namespace tensorflow {
-namespace {
-
-// Row reduction
-BM_TFMlir2(RowReduceDynamicAll, f32, /* num_threads */ 0,
-           MlirSpec("tf.Sum", "f32", {kDynamicDim, kDynamicDim},
-                    /*dims_to_reduce=*/{1}));
-BM_TFMlir2(RowReduceStaticRow, f32, /* num_threads */ 0,
-           MlirSpec("tf.Sum", "f32", {kStaticDim, kDynamicDim},
-                    /*dims_to_reduce=*/{1}));
-BM_TFMlir2(RowReduceStaticCol, f32, /* num_threads */ 0,
-           MlirSpec("tf.Sum", "f32", {kDynamicDim, kStaticDim},
-                    /*dims_to_reduce=*/{1}));
-BM_TFMlir2(RowReduceStaticAll, f32, /* num_threads */ 0,
-           MlirSpec("tf.Sum", "f32", {kStaticDim, kStaticDim},
-                    /*dims_to_reduce=*/{1}));
-BM_Eigen2(RowReduce, f32, /* num_threads */ 0, /*output rank=*/1,
-          EigenSpec({1}));
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.cc
index 6c44f8a4b0c253..0a1809593b6a65 100644
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.cc
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.cc
@@ -18,27 +18,54 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h"
 
 namespace tensorflow {
+namespace {
+
+using ::llvm::ArrayRef;
+using ::llvm::SmallVector;
+using ::llvm::StringRef;
 
 static const char* kReductionIR = R"(
   func @main(%input: {1}) -> {2} {
-    %dim_to_reduce = "tf.Const"() {{value = {3} : {4}} : () -> {4}
-    %result = "{0}"(%input, %dim_to_reduce) {{keep_dims = false}
-      : ({1}, {4}) -> {2}
+    %dim_to_reduce = "tf.Const"() {{
+      value = {3} : {4},
+      device = "/job:localhost/replica:0/task:0/device:CPU:0"
+    } : () -> {4}
+    %result = "{0}"(%input, %dim_to_reduce) {{
+      keep_dims = false,
+      device = "/job:localhost/replica:0/task:0/device:CPU:0"
+    } : ({1}, {4}) -> {2}
     return %result : {2}
   }
 )";
 
-std::string GetIR(StringRef op_name, ArrayRef<int64_t> input_shape,
-                  ArrayRef<int64_t> output_shape,
-                  ArrayRef<int32_t> dims_to_reduce, StringRef element_type) {
+std::string GetReductionIR(StringRef op_name,
+                           ArrayRef<int64_t> mlir_input_shape,
+                           ArrayRef<int64_t> mlir_output_shape,
+                           ArrayRef<int32_t> dims_to_reduce,
+                           StringRef element_type) {
   return llvm::formatv(
-      kReductionIR, op_name,                        // TF op to use {0},
-      PrintTensorType(input_shape, element_type),   // Input type {1}
-      PrintTensorType(output_shape, element_type),  // Output type {2}
-      PrintDenseArray(dims_to_reduce),              // Dims to reduce attr {3}
+      kReductionIR, op_name,                             // TF op to use {0},
+      PrintTensorType(mlir_input_shape, element_type),   // Input type {1}
+      PrintTensorType(mlir_output_shape, element_type),  // Output type {2}
+      PrintDenseArray(dims_to_reduce),  // Dims to reduce attr {3}
       PrintTensorType(static_cast<int64_t>(dims_to_reduce.size()),
                       "i32")  // Dims to reduce type {4}
   );
 }
 
+}  // namespace
+
+std::string GetTFSumIR(ArrayRef<int32_t> input_shape,
+                       ArrayRef<bool> dynamic_dims,
+                       ArrayRef<int32_t> dims_to_reduce) {
+  SmallVector<int64_t, 2> mlir_input_shape, mlir_output_shape;
+  for (int i = 0; i < input_shape.size(); ++i) {
+    mlir_input_shape.push_back(dynamic_dims[i] ? kDynSize : input_shape[i]);
+    if (llvm::find(dims_to_reduce, i) == dims_to_reduce.end())
+      mlir_output_shape.push_back(mlir_input_shape[i]);
+  }
+  return GetReductionIR("tf.Sum", mlir_input_shape, mlir_output_shape,
+                        dims_to_reduce, "f32");
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h
index 07a2bfb734dd0a..c6a0da2c9288c1 100644
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h
@@ -20,239 +20,9 @@ limitations under the License.
 
 namespace tensorflow {
 
-// Use type aliases compatible with MLIR type names.
-using f32 = float;
-
-// This header is a part of the library with private visibility and will be
-// used only to build benchmarks for different functions in this folder, so
-// it is ok to put convenience using-declarations here.
-
-using ::llvm::ArrayRef;
-using ::llvm::SmallVector;
-using ::llvm::StringRef;
-using ::tfrt::AsyncValue;
-using ::tfrt::AsyncValuePtr;
-using ::tfrt::HostContext;
-using ::tfrt::RCReference;
-using ::tfrt::RemainingResults;
-using ::tfrt::RequestContext;
-using ::tfrt::RequestContextBuilder;
-using ::tfrt::cpu::jit::Executable;
-using ::tfrt::cpu::jit::JitExecutable;
-using ::tfrt::cpu::jit::MemrefDesc;
-using ::tfrt::cpu::jit::ReturnValueConverter;
-
-// -------------------------------------------------------------------------- //
-// Run benchmark by compiling MLIR function using TFRT CPURT API.
-// -------------------------------------------------------------------------- //
-
-struct MlirSpec {
-  MlirSpec(StringRef op_name, StringRef element_type,
-           SmallVector<bool, 2> input_dynamic,
-           SmallVector<int32_t, 2> dims_to_reduce)
-      : op_name(op_name),
-        element_type(element_type),
-        input_dynamic(std::move(input_dynamic)),
-        dims_to_reduce(std::move(dims_to_reduce)) {}
-  StringRef op_name;
-  StringRef element_type;
-  SmallVector<bool, 2> input_dynamic;
-  SmallVector<int32, 2> dims_to_reduce;
-};
-
-std::string GetIR(StringRef op_name, ArrayRef<int64_t> input_shape,
-                  ArrayRef<int64_t> output_shape,
-                  ArrayRef<int32_t> dims_to_reduce, StringRef element_type);
-
-template <typename T, int INPUT_RANK>
-void RunReductionMlirBenchmark(::testing::benchmark::State& state,
-                               size_t num_threads, const MlirSpec& spec) {
-  // Input and output shapes to generate IR.
-  SmallVector<int64_t, 2> mlir_input_shape, mlir_output_shape;
-
-  // Compute input/output shapes and the number of elements.
-  std::array<ssize_t, INPUT_RANK> input_shape;
-  int64_t num_elements = 1;
-  for (int i = 0; i < INPUT_RANK; ++i) {
-    input_shape[i] = state.range(i);
-    num_elements *= state.range(i);
-    mlir_input_shape.push_back(spec.input_dynamic[i] ? kDynSize
-                                                     : state.range(i));
-    if (llvm::find(spec.dims_to_reduce, i) == spec.dims_to_reduce.end())
-      mlir_output_shape.push_back(mlir_input_shape[i]);
-  }
-
-  std::unique_ptr<HostContext> host =
-      num_threads > 0 ? CreateMultiThreadedHostContext(num_threads)
-                      : CreateSingleThreadedHostContext();
-
-  // Compile JIT executable.
-  auto mlir_input = GetIR(spec.op_name, mlir_input_shape, mlir_output_shape,
-                          spec.dims_to_reduce, spec.element_type);
-  TfCpuRtPipelineOptions tf_cpurt_opts;
-  tf_cpurt_opts.vectorize = true;
-  JitExecutable& jit_executable =
-      CreateJitExecutable(*host, mlir_input, "main",
-                          /*lower_from_tensorflow=*/true, tf_cpurt_opts);
-
-  // Build an ExecutionContext from the HostContext.
-  llvm::Expected<RCReference<RequestContext>> req_ctx =
-      RequestContextBuilder(host.get(), /*resource_context=*/nullptr).build();
-  tfrt::ExecutionContext exec_ctx(std::move(*req_ctx));
-
-  // Generate random input data.
-  Eigen::Tensor<T, INPUT_RANK, Eigen::RowMajor> input =
-      GenRandomTensor<T, INPUT_RANK>(input_shape);
-
-  std::array<MemrefDesc, 1> operands = {TensorToMemrefDesc(input)};
-
-  auto result_values = std::array<RCReference<AsyncValue>, 2>{{}};
-  RemainingResults results(result_values);
-
-  // Free memory owned by the returned memrefs.
-  ReturnValueConverter<ResultConversionCtx> converter(results);
-  converter.AddConversion(FreeReturnedMemref);
-
-  // Get an executable that might be specialized to the operands.
-  llvm::Expected<AsyncValuePtr<Executable>> executable =
-      jit_executable.GetExecutable(operands, exec_ctx);
-  if (auto err = executable.takeError())
-    LOG(FATAL) << "Failed to specialize executable";
-
-  // Wait for the compilation completion.
-  host->Await({executable->CopyRef()});
-
-  CHECK(!executable->IsError())
-      << "Failed to get executable: " << StrCat(executable->GetError());
-  CHECK(!(*executable)->IsAsync()) << "async results are not supported";
-
-  // Initialize call frame with MemrefDesc operands.
-  Executable::CallFrame call_frame;
-  if (auto err = (*executable)->InitializeCallFrame(operands, &call_frame))
-    LOG(FATAL) << "Failed to initialize call frame";
-
-  for (auto s : state) {
-    (*executable)->Execute(call_frame, exec_ctx);
-    if (auto err =
-            (*executable)->ReturnResults(converter, exec_ctx, &call_frame))
-      LOG(FATAL) << "Failed to return compiled kernel results";
-  }
-
-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
-                          num_elements);
-}
-
-// -------------------------------------------------------------------------- //
-// Run benchmark using Eigen expression evaluation.
-// -------------------------------------------------------------------------- //
-
-struct EigenSpec {
-  explicit EigenSpec(SmallVector<int32_t, 2> dims_to_reduce)
-      : dims_to_reduce(std::move(dims_to_reduce)) {}
-  SmallVector<int32_t, 2> dims_to_reduce;
-  size_t num_threads;
-};
-
-template <typename T, int INPUT_RANK, int OUTPUT_RANK>
-void RunReductionEigenBenchmark(::testing::benchmark::State& state,
-                                size_t num_threads, const EigenSpec& spec) {
-  std::array<ssize_t, INPUT_RANK - OUTPUT_RANK> dims_to_reduce;
-  for (int i = 0; i < dims_to_reduce.size(); ++i) {
-    dims_to_reduce[i] = spec.dims_to_reduce[i];
-  }
-
-  // Compute input/output shapes and the number of elements.
-  std::array<ssize_t, INPUT_RANK> input_shape;
-  std::array<ssize_t, OUTPUT_RANK> output_shape;
-  int64_t num_elements = 1;
-  for (int i = 0, j = 0; i < INPUT_RANK; ++i) {
-    input_shape[i] = state.range(i);
-    num_elements *= state.range(i);
-    if (llvm::find(spec.dims_to_reduce, i) == spec.dims_to_reduce.end())
-      output_shape[j++] = input_shape[i];
-  }
-
-  Eigen::Tensor<T, INPUT_RANK, Eigen::RowMajor> lhs =
-      GenRandomTensor<T, INPUT_RANK>(input_shape);
-
-  Eigen::DefaultDevice single_threaded_device;
-  Eigen::ThreadPool thread_pool(num_threads);
-  llvm::Optional<Eigen::ThreadPoolDevice> multi_threaded_device;
-  if (num_threads > 0) multi_threaded_device.emplace(&thread_pool, num_threads);
-
-  auto dst = InitEigenTensor<T, OUTPUT_RANK>::Get(output_shape);
-  dst.setZero();
-
-  for (auto s : state) {
-    auto expr = lhs.sum(dims_to_reduce);
-
-    using Dst = decltype(dst);
-    using Expr = decltype(expr);
-    if (multi_threaded_device.hasValue()) {
-      ExecuteAssignOp</*vectorize=*/true, Eigen::ThreadPoolDevice, Dst,
-                      Expr>::run(*multi_threaded_device, dst, expr);
-    } else {
-      ExecuteAssignOp</*vectorize=*/true, Eigen::DefaultDevice, Dst, Expr>::run(
-          single_threaded_device, dst, expr);
-    }
-  }
-
-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) *
-                          num_elements);
-}
-
-// -------------------------------------------------------------------------- //
-// Macros to dispatch to different shapes.
-// -------------------------------------------------------------------------- //
-
-// MLIR benchmarks
-#define BM_TFMlir(NAME, TYPE, NUM_THREADS, INPUT_RANK, SPEC)               \
-  static void BM_mlir__##INPUT_RANK##D_##NAME##_##TYPE##_##NUM_THREADS(    \
-      ::testing::benchmark::State& state) {                                \
-    RunReductionMlirBenchmark<TYPE, INPUT_RANK>(state, NUM_THREADS, SPEC); \
-  }                                                                        \
-  BENCHMARK(BM_mlir__##INPUT_RANK##D_##NAME##_##TYPE##_##NUM_THREADS)      \
-      ->MeasureProcessCPUTime()
-
-#define ARGS_1D         \
-  Args({3})             \
-      ->Args({8})       \
-      ->Args({80})      \
-      ->Args({800})     \
-      ->Args({8000})    \
-      ->Args({8131})    \
-      ->Args({1000000}) \
-      ->Args({1010131})
-
-#define BM_TFMlir1(NAME, TYPE, NUM_THREADS, SPEC) \
-  BM_TFMlir(NAME, TYPE, NUM_THREADS, 1, SPEC)->ARGS_1D
-
-#define ARGS_2D          \
-  Args({2, 80})          \
-      ->Args({8, 6})     \
-      ->Args({80, 1})    \
-      ->Args({80, 60})   \
-      ->Args({81, 61})   \
-      ->Args({800, 600}) \
-      ->Args({802, 602})
-#define BM_TFMlir2(NAME, TYPE, NUM_THREADS, SPEC) \
-  BM_TFMlir(NAME, TYPE, NUM_THREADS, 2, SPEC)->ARGS_2D
-
-// Eigen benchmarks
-#define BM_Eigen(NAME, TYPE, NUM_THREADS, INPUT_RANK, OUTPUT_RANK, SPEC) \
-  static void BM_eigen_##INPUT_RANK##D_##NAME##_##TYPE##_##NUM_THREADS(  \
-      ::testing::benchmark::State& state) {                              \
-    RunReductionEigenBenchmark<TYPE, INPUT_RANK, OUTPUT_RANK>(           \
-        state, NUM_THREADS, SPEC);                                       \
-  }                                                                      \
-  BENCHMARK(BM_eigen_##INPUT_RANK##D_##NAME##_##TYPE##_##NUM_THREADS)    \
-      ->MeasureProcessCPUTime()
-
-#define BM_Eigen1(NAME, TYPE, NUM_THREADS) \
-  BM_Eigen(NAME, TYPE, NUM_THREADS, 1, 0, EigenSpec({0}))->ARGS_1D
-
-#define BM_Eigen2(NAME, TYPE, NUM_THREADS, OUTPUT_RANK, SPEC) \
-  BM_Eigen(NAME, TYPE, NUM_THREADS, 2, OUTPUT_RANK, SPEC)->ARGS_2D
+std::string GetTFSumIR(llvm::ArrayRef<int32_t> input_shape,
+                       llvm::ArrayRef<bool> dynamic_dims,
+                       llvm::ArrayRef<int32_t> dims_to_reduce);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/softmax_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/softmax_benchmark.cc
deleted file mode 100644
index 56d6415cb77a1b..00000000000000
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/softmax_benchmark.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/mlir/tfrt/benchmarks/softmax_benchmark.h"
-
-#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h"
-
-namespace tensorflow {
-
-static const char* kReductionIR = R"(
-  func @main(%input: {0}) -> {0} {
-    %result = "tf.Softmax"(%input) : ({0}) -> {0}
-    return %result : {0}
-  }
-)";
-
-std::string GetSoftMaxIR(ArrayRef<int64_t> shape, StringRef element_type) {
-  return llvm::formatv(kReductionIR, PrintTensorType(shape, element_type));
-}
-
-namespace {
-
-#define ARGS_2D          \
-  Args({2, 80})          \
-      ->Args({8, 6})     \
-      ->Args({80, 1})    \
-      ->Args({80, 60})   \
-      ->Args({81, 61})   \
-      ->Args({800, 600}) \
-      ->Args({802, 602})
-
-BM_TFMlir2_SingleThread(Softmax2DDynamicAll, f32,
-                        MlirSpec("f32", {kDynamicDim, kDynamicDim}))
-    ->ARGS_2D;
-BM_TFMlir2_SingleThread(Softmax2DRowStatic, f32,
-                        MlirSpec("f32", {kStaticDim, kDynamicDim}))
-    ->ARGS_2D;
-BM_TFMlir2_SingleThread(Softmax2DColStatic, f32,
-                        MlirSpec("f32", {kDynamicDim, kStaticDim}))
-    ->ARGS_2D;
-BM_TFMlir2_SingleThread(Softmax2DStaticAll, f32,
-                        MlirSpec("f32", {kStaticDim, kStaticDim}))
-    ->ARGS_2D;
-BM_Eigen2_SingleThread(Softmax2D, f32)->ARGS_2D;
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/softmax_benchmark.h b/tensorflow/compiler/mlir/tfrt/benchmarks/softmax_benchmark.h
deleted file mode 100644
index 96c3c310cefef1..00000000000000
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/softmax_benchmark.h
+++ /dev/null
@@ -1,258 +0,0 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_BENCHMARKS_SOFTMAX_BENCHMARK_H_
-#define TENSORFLOW_COMPILER_MLIR_TFRT_BENCHMARKS_SOFTMAX_BENCHMARK_H_
-
-#include <string>
-#include <utility>
-
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h"
-#include "tensorflow/core/framework/tensor_types.h"
-
-namespace tensorflow {
-
-// Use type aliases compatible with MLIR type names.
-using f32 = float;
-
-// This header is a part of the library with private visibility and will be
-// used only to build benchmarks for different functions in this folder, so
-// it is ok to put convenience using-declarations here.
-//
-using ::llvm::ArrayRef;
-using ::llvm::SmallVector;
-using ::llvm::StringRef;
-using ::tfrt::AsyncValue;
-using ::tfrt::AsyncValuePtr;
-using ::tfrt::HostContext;
-using ::tfrt::RCReference;
-using ::tfrt::RemainingResults;
-using ::tfrt::RequestContext;
-using ::tfrt::RequestContextBuilder;
-using ::tfrt::cpu::jit::Executable;
-using ::tfrt::cpu::jit::JitExecutable;
-using ::tfrt::cpu::jit::MemrefDesc;
-using ::tfrt::cpu::jit::ReturnValueConverter;
-
-// -------------------------------------------------------------------------- //
-// Run benchmark by compiling MLIR function using TFRT CPURT API.
-// -------------------------------------------------------------------------- //
-
-struct MlirSpec {
-  MlirSpec(StringRef element_type, SmallVector<bool, 2> input_dynamic)
-      : element_type(element_type), input_dynamic(std::move(input_dynamic)) {}
-  StringRef element_type;
-  SmallVector<bool, 2> input_dynamic;
-};
-
-std::string GetSoftMaxIR(ArrayRef<int64_t> shape, StringRef element_type);
-
-template <typename T, int INPUT_RANK>
-void RunSoftmaxMlirBenchmark(::testing::benchmark::State& state,
-                             size_t num_threads, const MlirSpec& spec) {
-  // Input and output shapes to generate IR.
-  SmallVector<int64_t, 2> mlir_input_shape, mlir_output_shape;
-
-  // Compute input/output shapes and the number of elements.
-  std::array<ssize_t, INPUT_RANK> input_shape;
-  int64_t num_elements = 1;
-  for (int i = 0; i < INPUT_RANK; ++i) {
-    input_shape[i] = state.range(i);
-    num_elements *= state.range(i);
-    mlir_input_shape.push_back(spec.input_dynamic[i] ? kDynSize
-                                                     : state.range(i));
-  }
-
-  std::unique_ptr<HostContext> host =
-      num_threads > 0 ? CreateMultiThreadedHostContext(num_threads)
-                      : CreateSingleThreadedHostContext();
-
-  // Compile JIT executable.
-  auto mlir_input = GetSoftMaxIR(input_shape, spec.element_type);
-  TfCpuRtPipelineOptions tf_cpurt_opts;
-  tf_cpurt_opts.vectorize = true;
-  JitExecutable& jit_executable =
-      CreateJitExecutable(*host, mlir_input, "main",
-                          /*lower_from_tensorflow=*/true, tf_cpurt_opts);
-
-  // Build an ExecutionContext from the HostContext.
-  llvm::Expected<RCReference<RequestContext>> req_ctx =
-      RequestContextBuilder(host.get(), /*resource_context=*/nullptr).build();
-  tfrt::ExecutionContext exec_ctx(std::move(*req_ctx));
-
-  // Generate random input data.
-  Eigen::Tensor<T, INPUT_RANK, Eigen::RowMajor> input =
-      GenRandomTensor<T, INPUT_RANK>(input_shape);
-
-  std::array<MemrefDesc, 1> operands = {TensorToMemrefDesc(input)};
-
-  auto result_values = std::array<RCReference<AsyncValue>, 2>{{}};
-  RemainingResults results(result_values);
-
-  // Free memory owned by the returned memrefs.
-  ReturnValueConverter<ResultConversionCtx> converter(results);
-  converter.AddConversion(FreeReturnedMemref);
-
-  // Get an executable that might be specialized to the operands.
-  llvm::Expected<AsyncValuePtr<Executable>> executable =
-      jit_executable.GetExecutable(operands, exec_ctx);
-  if (auto err = executable.takeError())
-    LOG(FATAL) << "Failed to specialize executable";
-
-  // Wait for the compilation completion.
-  host->Await({executable->CopyRef()});
-
-  CHECK(!executable->IsError())
-      << "Failed to get executable: " << StrCat(executable->GetError());
-  CHECK(!(*executable)->IsAsync()) << "async results are not supported";
-
-  // Initialize call frame with MemrefDesc operands.
-  Executable::CallFrame call_frame;
-  if (auto err = (*executable)->InitializeCallFrame(operands, &call_frame))
-    LOG(FATAL) << "Failed to initialize call frame";
-
-  for (auto s : state) {
-    (*executable)->Execute(call_frame, exec_ctx);
-    if (auto err =
-            (*executable)->ReturnResults(converter, exec_ctx, &call_frame))
-      LOG(FATAL) << "Failed to return compiled kernel results";
-  }
-
-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * 6 *
-                          num_elements);
-}
-
-// -------------------------------------------------------------------------- //
-// Run benchmark using Eigen expression evaluation.
-// -------------------------------------------------------------------------- //
-
-// Eigen code implementing SoftmaxFunctor::operator() carefully taken from
-// tensorflow/core/kernels/softmax_op_functor.h
-template <typename Device, typename T>
-struct SoftmaxEigenImpl {
-  static void Compute(const Device& d, T logits, T softmax) {
-    const int kBatchDim = 0;
-    const int kClassDim = 1;
-
-    const int batch_size = logits.dimension(kBatchDim);
-    const int num_classes = logits.dimension(kClassDim);
-
-// These arrays are used to reduce along the class dimension, and broadcast
-// the resulting value to all classes.
-#if !defined(EIGEN_HAS_INDEX_LIST)
-    Eigen::DSizes<int, 1> along_class(kClassDim);
-    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
-#else
-    Eigen::IndexList<Eigen::type2index<kClassDim>> along_class;
-    Eigen::IndexList<int, Eigen::type2index<1>> batch_by_one;
-    batch_by_one.set(0, batch_size);
-    Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
-    one_by_class.set(1, num_classes);
-#endif
-    // shifted_logits = logits - max(logits along classes);
-    auto shifted_logits = (logits - logits.maximum(along_class)
-                                        .eval()
-                                        .reshape(batch_by_one)
-                                        .broadcast(one_by_class));
-    softmax.device(d) = shifted_logits.exp();
-    softmax.device(d) = (softmax * softmax.sum(along_class)
-                                       .inverse()
-                                       .eval()
-                                       .reshape(batch_by_one)
-                                       .broadcast(one_by_class));
-  }
-};
-
-// Functor used by SoftmaxOp to do the computations.
-template <typename Device, typename T>
-struct SoftmaxFunctor {
-  // Computes Softmax or LogSoftmax activation.
-  //
-  // logits: dim: batch_size, num_classes.
-  // softmax: dims: batch_size, num_classes.
-  // log: boolean
-  void operator()(const Device& d, T logits, T softmax) {
-    SoftmaxEigenImpl<Device, T>::Compute(d, logits, softmax);
-  }
-};
-template <typename T, int RANK>
-void RunSoftmaxEigenBenchmark(::testing::benchmark::State& state,
-                              size_t num_threads) {
-  // Compute input/output shapes and the number of elements.
-  std::array<ssize_t, RANK> input_shape;
-  int64_t num_elements = 1;
-  for (int i = 0; i < RANK; ++i) {
-    input_shape[i] = state.range(i);
-    num_elements *= state.range(i);
-  }
-
-  Eigen::Tensor<T, RANK, Eigen::RowMajor> input =
-      GenRandomTensor<T, RANK>(input_shape);
-
-  Eigen::DefaultDevice single_threaded_device;
-  Eigen::ThreadPool thread_pool(num_threads);
-  llvm::Optional<Eigen::ThreadPoolDevice> multi_threaded_device;
-  if (num_threads > 0) multi_threaded_device.emplace(&thread_pool, num_threads);
-
-  auto dst = InitEigenTensor<T, RANK>::Get(input_shape);
-  dst.setZero();
-
-  for (auto s : state) {
-    using Dst = decltype(dst);
-
-    SoftmaxFunctor<Eigen::DefaultDevice, Dst> functor;
-    functor(single_threaded_device, input, dst);
-  }
-
-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()) * 6 *
-                          num_elements);
-}
-
-// -------------------------------------------------------------------------- //
-// Macros to dispatch to different shapes.
-// -------------------------------------------------------------------------- //
-
-// MLIR benchmarks
-#define BM_TFMlir(NAME, TYPE, NUM_THREADS, INPUT_RANK, SPEC)             \
-  static void BM_mlir__##INPUT_RANK##D_##NAME##_##TYPE##_##NUM_THREADS(  \
-      ::testing::benchmark::State& state) {                              \
-    RunSoftmaxMlirBenchmark<TYPE, INPUT_RANK>(state, NUM_THREADS, SPEC); \
-  }                                                                      \
-  BENCHMARK(BM_mlir__##INPUT_RANK##D_##NAME##_##TYPE##_##NUM_THREADS)    \
-      ->MeasureProcessCPUTime()
-
-#define BM_TFMlir2(NAME, TYPE, NUM_THREADS, SPEC) \
-  BM_TFMlir(NAME, TYPE, NUM_THREADS, 2, SPEC)
-#define BM_TFMlir2_SingleThread(NAME, TYPE, SPEC) \
-  BM_TFMlir(NAME, TYPE, 0, 2, SPEC)
-
-// Eigen benchmarks
-#define BM_Eigen(NAME, TYPE, NUM_THREADS, RANK)                   \
-  static void BM_eigen_##RANK##D_##NAME##_##TYPE##_##NUM_THREADS( \
-      ::testing::benchmark::State& state) {                       \
-    RunSoftmaxEigenBenchmark<TYPE, RANK>(state, NUM_THREADS);     \
-  }                                                               \
-  BENCHMARK(BM_eigen_##RANK##D_##NAME##_##TYPE##_##NUM_THREADS)   \
-      ->MeasureProcessCPUTime()
-
-#define BM_Eigen2(NAME, TYPE, NUM_THREADS) BM_Eigen(NAME, TYPE, NUM_THREADS, 2)
-
-#define BM_Eigen2_SingleThread(NAME, TYPE) BM_Eigen2(NAME, TYPE, 0)
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_MLIR_TFRT_BENCHMARKS_SOFTMAX_BENCHMARK_H_
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/softmax_op_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/softmax_op_benchmark.cc
new file mode 100644
index 00000000000000..b891e9e9147e03
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/softmax_op_benchmark.cc
@@ -0,0 +1,150 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <string>
+
+#include "llvm/Support/FormatVariadic.h"
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h"
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h"
+
+namespace tensorflow {
+namespace {
+
+const char* kSoftmaxIR = R"(
+  func @main(%input: {0}) -> {0} {
+    %result = "tf.Softmax"(%input)
+      {{device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+      : ({0}) -> {0}
+    return %result : {0}
+  }
+)";
+
+std::string Softmax(llvm::ArrayRef<bool> dynamic_dims,
+                    llvm::ArrayRef<ssize_t> input_shape) {
+  llvm::SmallVector<int64_t, 2> mlir_input_shape;
+  for (int i = 0; i < input_shape.size(); ++i) {
+    mlir_input_shape.push_back(dynamic_dims[i] ? kDynSize : input_shape[i]);
+  }
+  return llvm::formatv(kSoftmaxIR, PrintTensorType(mlir_input_shape, "f32"));
+}
+
+// Eigen code implementing SoftmaxFunctor::operator() carefully taken from
+// tensorflow/core/kernels/softmax_op_functor.h
+template <typename InT, typename OutT>
+static void ComputeSoftmax(const Eigen::DefaultDevice& d, InT logits,
+                           OutT softmax) {
+  const int kBatchDim = 0;
+  const int kClassDim = 1;
+
+  const int batch_size = logits.dimension(kBatchDim);
+  const int num_classes = logits.dimension(kClassDim);
+
+// These arrays are used to reduce along the class dimension, and broadcast
+// the resulting value to all classes.
+#if !defined(EIGEN_HAS_INDEX_LIST)
+  Eigen::DSizes<int, 1> along_class(kClassDim);
+  Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+  Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+#else
+  Eigen::IndexList<Eigen::type2index<kClassDim> > along_class;
+  Eigen::IndexList<int, Eigen::type2index<1> > batch_by_one;
+  batch_by_one.set(0, batch_size);
+  Eigen::IndexList<Eigen::type2index<1>, int> one_by_class;
+  one_by_class.set(1, num_classes);
+#endif
+  // shifted_logits = logits - max(logits along classes);
+  auto shifted_logits = (logits - logits.maximum(along_class)
+                                      .eval()
+                                      .reshape(batch_by_one)
+                                      .broadcast(one_by_class));
+  softmax.device(d) = shifted_logits.exp();
+  softmax.device(d) = (softmax * softmax.sum(along_class)
+                                     .inverse()
+                                     .eval()
+                                     .reshape(batch_by_one)
+                                     .broadcast(one_by_class));
+}
+
+auto EigenSoftmax() {
+  return [](llvm::ArrayRef<Tensor> inputs,
+            llvm::Optional<Eigen::ThreadPoolDevice>) {
+    Tensor output(DT_FLOAT, {inputs[0].dim_size(0), inputs[0].dim_size(1)});
+
+    auto in = inputs[0].tensor<float, 2>();
+    auto out = output.tensor<float, 2>();
+    out.setZero();
+
+    Eigen::DefaultDevice default_device;
+    ComputeSoftmax<decltype(in), decltype(out)>(default_device, in, out);
+  };
+}
+
+llvm::SmallVector<InputTensorSpec> Inputs(ssize_t rows, ssize_t cols) {
+  return {InputTensorSpec(DT_FLOAT, {rows, cols})};
+}
+
+#define BM(FN) BM_##FN->Arg(0);
+
+#define BM_SUITE(NAME, DYNAMIC_ROW, DYNAMIC_COL, ROWS, COLS)                 \
+  BM(CpurtV(NAME, Softmax({DYNAMIC_ROW, DYNAMIC_COL}, {ROWS, COLS}), "main", \
+            Inputs(ROWS, COLS)));                                            \
+  BM(Eigen(NAME, EigenSoftmax(), Inputs(ROWS, COLS)));                       \
+  BM(Tfrt(NAME, Softmax({DYNAMIC_ROW, DYNAMIC_COL}, {ROWS, COLS}), "main",   \
+          Inputs(ROWS, COLS)))
+
+#define BM_DYNAMIC_ALL(ROWS, COLS)                                            \
+  BM_SUITE(SoftmaxDynamicAll_##ROWS##_##COLS, kDynamicDim, kDynamicDim, ROWS, \
+           COLS)
+BM_DYNAMIC_ALL(2, 80);
+BM_DYNAMIC_ALL(8, 6);
+BM_DYNAMIC_ALL(80, 1);
+BM_DYNAMIC_ALL(80, 60);
+BM_DYNAMIC_ALL(81, 61);
+BM_DYNAMIC_ALL(800, 600);
+BM_DYNAMIC_ALL(802, 602);
+
+#define BM_STATIC_ROW(ROWS, COLS) \
+  BM_SUITE(SoftmaxStaticRow##ROWS##_##COLS, kStaticDim, kDynamicDim, ROWS, COLS)
+BM_STATIC_ROW(2, 80);
+BM_STATIC_ROW(8, 6);
+BM_STATIC_ROW(80, 1);
+BM_STATIC_ROW(80, 60);
+BM_STATIC_ROW(81, 61);
+BM_STATIC_ROW(800, 600);
+BM_STATIC_ROW(802, 602);
+
+#define BM_STATIC_COL(ROWS, COLS)                                           \
+  BM_SUITE(SoftmaxStaticCol_##ROWS##_##COLS, kDynamicDim, kStaticDim, ROWS, \
+           COLS)
+BM_STATIC_COL(2, 80);
+BM_STATIC_COL(8, 6);
+BM_STATIC_COL(80, 1);
+BM_STATIC_COL(80, 60);
+BM_STATIC_COL(81, 61);
+BM_STATIC_COL(800, 600);
+BM_STATIC_COL(802, 602);
+
+#define BM_STATIC_ALL(ROWS, COLS) \
+  BM_SUITE(SoftmaxStaticAll_##ROWS##_##COLS, kStaticDim, kStaticDim, ROWS, COLS)
+BM_STATIC_ALL(2, 80);
+BM_STATIC_ALL(8, 6);
+BM_STATIC_ALL(80, 1);
+BM_STATIC_ALL(80, 60);
+BM_STATIC_ALL(81, 61);
+BM_STATIC_ALL(800, 600);
+BM_STATIC_ALL(802, 602);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/sum_1d_op_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/sum_1d_op_benchmark.cc
new file mode 100644
index 00000000000000..f781ea6724c0c6
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/sum_1d_op_benchmark.cc
@@ -0,0 +1,77 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h"
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h"
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+std::string Sum1D(bool dynamic, int32_t size) {
+  return GetTFSumIR({size}, {dynamic}, {0});
+}
+
+auto EigenSum1D() {
+  return [](llvm::ArrayRef<Tensor> inputs,
+            llvm::Optional<Eigen::ThreadPoolDevice> device) {
+    std::array<int64_t, 1> dims_to_reduce{0};
+    Tensor output(DT_FLOAT, {});
+
+    auto in = inputs[0].tensor<float, 1>();
+    auto out = output.tensor<float, 0>();
+    out.setZero();
+
+    if (device.hasValue()) {
+      out.device(*device) = in.sum(dims_to_reduce);
+    } else {
+      out = in.sum(dims_to_reduce);
+    }
+  };
+}
+
+llvm::SmallVector<InputTensorSpec> Inputs(ssize_t dim) {
+  return {InputTensorSpec(DT_FLOAT, {dim})};
+}
+
+#define BM(FN) BM_##FN->Arg(0);
+
+#define BM_SUITE(NAME, DYNAMIC, SIZE)                           \
+  BM(CpurtV(NAME, Sum1D(DYNAMIC, SIZE), "main", Inputs(SIZE))); \
+  BM(Eigen(NAME, EigenSum1D(), Inputs(SIZE)));                  \
+  BM(Tfrt(NAME, Sum1D(kDynamicDim, SIZE), "main", Inputs(SIZE)))
+
+#define BM_DYNAMIC(SIZE) BM_SUITE(SumDynamic_##SIZE, kDynamicDim, SIZE)
+BM_DYNAMIC(3);
+BM_DYNAMIC(8);
+BM_DYNAMIC(80);
+BM_DYNAMIC(800);
+BM_DYNAMIC(8000);
+BM_DYNAMIC(8131);
+BM_DYNAMIC(1000000);
+BM_DYNAMIC(1010131);
+
+#define BM_STATIC(SIZE) BM_SUITE(SumStatic_##SIZE, kStaticDim, SIZE)
+BM_STATIC(3);
+BM_STATIC(8);
+BM_STATIC(80);
+BM_STATIC(800);
+BM_STATIC(8000);
+BM_STATIC(8131);
+BM_STATIC(1000000);
+BM_STATIC(1010131);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/sum_2d_op_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/sum_2d_op_benchmark.cc
new file mode 100644
index 00000000000000..f56f0d5b10f679
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/sum_2d_op_benchmark.cc
@@ -0,0 +1,102 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h"
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h"
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+std::string Sum2D(bool dynamic_row, bool dynamic_col, int32_t rows,
+                  int32_t cols) {
+  return GetTFSumIR({rows, cols}, {dynamic_row, dynamic_col}, {0, 1});
+}
+
+auto EigenSum2D() {
+  return [](llvm::ArrayRef<Tensor> inputs,
+            llvm::Optional<Eigen::ThreadPoolDevice> device) {
+    Tensor output(DT_FLOAT, {});
+
+    auto in = inputs[0].tensor<float, 2>();
+    auto out = output.tensor<float, 0>();
+    out.setZero();
+
+    std::array<int64_t, 2> dims_to_reduce{0, 1};
+    if (device.hasValue()) {
+      out.device(*device) = in.sum(dims_to_reduce);
+    } else {
+      out = in.sum(dims_to_reduce);
+    }
+  };
+}
+
+llvm::SmallVector<InputTensorSpec> Inputs(ssize_t rows, ssize_t cols) {
+  return {InputTensorSpec(DT_FLOAT, {rows, cols})};
+}
+
+#define BM(FN) BM_##FN->Arg(0);
+
+#define BM_SUITE(NAME, DYNAMIC_ROW, DYNAMIC_COL, ROWS, COLS)           \
+  BM(CpurtV(NAME, Sum2D(DYNAMIC_ROW, DYNAMIC_COL, ROWS, COLS), "main", \
+            Inputs(ROWS, COLS)));                                      \
+  BM(Eigen(NAME, EigenSum2D(), Inputs(ROWS, COLS)));                   \
+  BM(Tfrt(NAME, Sum2D(DYNAMIC_ROW, DYNAMIC_COL, ROWS, COLS), "main",   \
+          Inputs(ROWS, COLS)))
+
+// TODO(b/207822945): Enable after reduction grouper pass is implemented.
+#define BM_DYNAMIC_ALL(ROWS, COLS)                                          \
+  BM_SUITE(Sum2DDynamicAll_##ROWS##_##COLS, kDynamicDim, kDynamicDim, ROWS, \
+           COLS)
+BM_DYNAMIC_ALL(2, 80);
+// BM_DYNAMIC_ALL(8, 6);
+// BM_DYNAMIC_ALL(80, 1);
+// BM_DYNAMIC_ALL(80, 60);
+// BM_DYNAMIC_ALL(81, 61);
+// BM_DYNAMIC_ALL(800, 600);
+// BM_DYNAMIC_ALL(802, 602);
+
+#define BM_STATIC_ROW(ROWS, COLS) \
+  BM_SUITE(Sum2DStaticRow##ROWS##_##COLS, kStaticDim, kDynamicDim, ROWS, COLS)
+// BM_STATIC_ROW(2, 80);
+// BM_STATIC_ROW(8, 6);
+// BM_STATIC_ROW(80, 1);
+// BM_STATIC_ROW(80, 60);
+// BM_STATIC_ROW(81, 61);
+// BM_STATIC_ROW(800, 600);
+// BM_STATIC_ROW(802, 602);
+
+#define BM_STATIC_COL(ROWS, COLS) \
+  BM_SUITE(Sum2DStaticCol_##ROWS##_##COLS, kDynamicDim, kStaticDim, ROWS, COLS)
+// BM_STATIC_COL(2, 80);
+// BM_STATIC_COL(8, 6);
+// BM_STATIC_COL(80, 1);
+// BM_STATIC_COL(80, 60);
+// BM_STATIC_COL(81, 61);
+// BM_STATIC_COL(800, 600);
+// BM_STATIC_COL(802, 602);
+
+#define BM_STATIC_ALL(ROWS, COLS) \
+  BM_SUITE(Sum2DStaticAll_##ROWS##_##COLS, kStaticDim, kStaticDim, ROWS, COLS)
+// BM_STATIC_ALL(2, 80);
+// BM_STATIC_ALL(8, 6);
+// BM_STATIC_ALL(80, 1);
+// BM_STATIC_ALL(80, 60);
+// BM_STATIC_ALL(81, 61);
+// BM_STATIC_ALL(800, 600);
+// BM_STATIC_ALL(802, 602);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/sum_col_op_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/sum_col_op_benchmark.cc
new file mode 100644
index 00000000000000..412ccfaaf629d5
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/sum_col_op_benchmark.cc
@@ -0,0 +1,101 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h"
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h"
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+std::string SumColumn(bool dynamic_row, bool dynamic_col, int32_t rows,
+                      int32_t cols) {
+  return GetTFSumIR({rows, cols}, {dynamic_row, dynamic_col}, {0});
+}
+
+auto EigenSumColumn() {
+  return [](llvm::ArrayRef<Tensor> inputs,
+            llvm::Optional<Eigen::ThreadPoolDevice> device) {
+    Tensor output(DT_FLOAT, {inputs[0].dim_size(1)});
+
+    auto in = inputs[0].tensor<float, 2>();
+    auto out = output.tensor<float, 1>();
+    out.setZero();
+
+    std::array<int64_t, 1> dims_to_reduce{0};
+    if (device.hasValue()) {
+      out.device(*device) = in.sum(dims_to_reduce);
+    } else {
+      out = in.sum(dims_to_reduce);
+    }
+  };
+}
+
+llvm::SmallVector<InputTensorSpec> Inputs(ssize_t rows, ssize_t cols) {
+  return {InputTensorSpec(DT_FLOAT, {rows, cols})};
+}
+
+#define BM(FN) BM_##FN->Arg(0);
+
+#define BM_SUITE(NAME, DYNAMIC_ROW, DYNAMIC_COL, ROWS, COLS)               \
+  BM(CpurtV(NAME, SumColumn(DYNAMIC_ROW, DYNAMIC_COL, ROWS, COLS), "main", \
+            Inputs(ROWS, COLS)));                                          \
+  BM(Eigen(NAME, EigenSumColumn(), Inputs(ROWS, COLS)));                   \
+  BM(Tfrt(NAME, SumColumn(DYNAMIC_ROW, DYNAMIC_COL, ROWS, COLS), "main",   \
+          Inputs(ROWS, COLS)))
+
+#define BM_DYNAMIC_ALL(ROWS, COLS)                                           \
+  BM_SUITE(SumColDynamicAll_##ROWS##_##COLS, kDynamicDim, kDynamicDim, ROWS, \
+           COLS)
+BM_DYNAMIC_ALL(2, 80);
+BM_DYNAMIC_ALL(8, 6);
+BM_DYNAMIC_ALL(80, 1);
+BM_DYNAMIC_ALL(80, 60);
+BM_DYNAMIC_ALL(81, 61);
+BM_DYNAMIC_ALL(800, 600);
+BM_DYNAMIC_ALL(802, 602);
+
+#define BM_STATIC_ROW(ROWS, COLS) \
+  BM_SUITE(SumColStaticRow##ROWS##_##COLS, kStaticDim, kDynamicDim, ROWS, COLS)
+BM_STATIC_ROW(2, 80);
+BM_STATIC_ROW(8, 6);
+BM_STATIC_ROW(80, 1);
+BM_STATIC_ROW(80, 60);
+BM_STATIC_ROW(81, 61);
+BM_STATIC_ROW(800, 600);
+BM_STATIC_ROW(802, 602);
+
+#define BM_STATIC_COL(ROWS, COLS) \
+  BM_SUITE(SumColStaticCol_##ROWS##_##COLS, kDynamicDim, kStaticDim, ROWS, COLS)
+BM_STATIC_COL(2, 80);
+BM_STATIC_COL(8, 6);
+BM_STATIC_COL(80, 1);
+BM_STATIC_COL(80, 60);
+BM_STATIC_COL(81, 61);
+BM_STATIC_COL(800, 600);
+BM_STATIC_COL(802, 602);
+
+#define BM_STATIC_ALL(ROWS, COLS) \
+  BM_SUITE(SumColStaticAll_##ROWS##_##COLS, kStaticDim, kStaticDim, ROWS, COLS)
+BM_STATIC_ALL(2, 80);
+BM_STATIC_ALL(8, 6);
+BM_STATIC_ALL(80, 1);
+BM_STATIC_ALL(80, 60);
+BM_STATIC_ALL(81, 61);
+BM_STATIC_ALL(800, 600);
+BM_STATIC_ALL(802, 602);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/sum_row_op_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/sum_row_op_benchmark.cc
new file mode 100644
index 00000000000000..6937e89ac8de54
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/sum_row_op_benchmark.cc
@@ -0,0 +1,101 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h"
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h"
+#include "tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h"
+
+namespace tensorflow {
+namespace {
+
+std::string SumRow(bool dynamic_row, bool dynamic_col, int32_t rows,
+                   int32_t cols) {
+  return GetTFSumIR({rows, cols}, {dynamic_row, dynamic_col}, {1});
+}
+
+auto EigenSumRow() {
+  return [](llvm::ArrayRef<Tensor> inputs,
+            llvm::Optional<Eigen::ThreadPoolDevice> device) {
+    auto in = inputs[0].tensor<float, 2>();
+
+    Tensor output(DT_FLOAT, {inputs[0].dim_size(0)});
+    auto out = output.tensor<float, 1>();
+    out.setZero();
+
+    std::array<int64_t, 1> dims_to_reduce{1};
+    if (device.hasValue()) {
+      out.device(*device) = in.sum(dims_to_reduce);
+    } else {
+      out = in.sum(dims_to_reduce);
+    }
+  };
+}
+
+llvm::SmallVector<InputTensorSpec> Inputs(ssize_t rows, ssize_t cols) {
+  return {InputTensorSpec(DT_FLOAT, {rows, cols})};
+}
+
+#define BM(FN) BM_##FN->Arg(0);
+
+#define BM_SUITE(NAME, DYNAMIC_ROW, DYNAMIC_COL, ROWS, COLS)            \
+  BM(CpurtV(NAME, SumRow(DYNAMIC_ROW, DYNAMIC_COL, ROWS, COLS), "main", \
+            Inputs(ROWS, COLS)));                                       \
+  BM(Eigen(NAME, EigenSumRow(), Inputs(ROWS, COLS)));                   \
+  BM(Tfrt(NAME, SumRow(DYNAMIC_ROW, DYNAMIC_COL, ROWS, COLS), "main",   \
+          Inputs(ROWS, COLS)))
+
+#define BM_DYNAMIC_ALL(ROWS, COLS)                                           \
+  BM_SUITE(SumRowDynamicAll_##ROWS##_##COLS, kDynamicDim, kDynamicDim, ROWS, \
+           COLS)
+BM_DYNAMIC_ALL(2, 80);
+BM_DYNAMIC_ALL(8, 6);
+BM_DYNAMIC_ALL(80, 1);
+BM_DYNAMIC_ALL(80, 60);
+BM_DYNAMIC_ALL(81, 61);
+BM_DYNAMIC_ALL(800, 600);
+BM_DYNAMIC_ALL(802, 602);
+
+#define BM_STATIC_ROW(ROWS, COLS) \
+  BM_SUITE(SumRowStaticRow##ROWS##_##COLS, kStaticDim, kDynamicDim, ROWS, COLS)
+BM_STATIC_ROW(2, 80);
+BM_STATIC_ROW(8, 6);
+BM_STATIC_ROW(80, 1);
+BM_STATIC_ROW(80, 60);
+BM_STATIC_ROW(81, 61);
+BM_STATIC_ROW(800, 600);
+BM_STATIC_ROW(802, 602);
+
+#define BM_STATIC_COL(ROWS, COLS) \
+  BM_SUITE(SumRowStaticCol_##ROWS##_##COLS, kDynamicDim, kStaticDim, ROWS, COLS)
+BM_STATIC_COL(2, 80);
+BM_STATIC_COL(8, 6);
+BM_STATIC_COL(80, 1);
+BM_STATIC_COL(80, 60);
+BM_STATIC_COL(81, 61);
+BM_STATIC_COL(800, 600);
+BM_STATIC_COL(802, 602);
+
+#define BM_STATIC_ALL(ROWS, COLS) \
+  BM_SUITE(SumRowStaticAll_##ROWS##_##COLS, kStaticDim, kStaticDim, ROWS, COLS)
+BM_STATIC_ALL(2, 80);
+BM_STATIC_ALL(8, 6);
+BM_STATIC_ALL(80, 1);
+BM_STATIC_ALL(80, 60);
+BM_STATIC_ALL(81, 61);
+BM_STATIC_ALL(800, 600);
+BM_STATIC_ALL(802, 602);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/transpose_op_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/transpose_op_benchmark.cc
index 2007961e18ebb1..f5512109036934 100644
--- a/tensorflow/compiler/mlir/tfrt/benchmarks/transpose_op_benchmark.cc
+++ b/tensorflow/compiler/mlir/tfrt/benchmarks/transpose_op_benchmark.cc
@@ -13,40 +13,48 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <array>
+#include <string>
+
+#include "llvm/Support/FormatVariadic.h"
 #include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h"
 
 namespace tensorflow {
 
 static const char* mlir_input = R"(
-func @compute(%arg0: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {
+func @compute(%arg0: tensor<?x?x?xf32>) -> tensor<?x?x?xf32> {{
     %0 = "tf.Const"()
-         {value = dense<[0, 2, 1]> : tensor<3xi64>,
+         {{value = dense<[{0}, {1}, {2}]> : tensor<3xi64>,
           device = "/job:localhost/replica:0/task:0/device:CPU:0"}
          : () -> tensor<3xi64>
     %1 = "tf.Transpose"(%arg0, %0)
-         {device = "/job:localhost/replica:0/task:0/device:CPU:0"}
+         {{device = "/job:localhost/replica:0/task:0/device:CPU:0"}
          : (tensor<?x?x?xf32>, tensor<3xi64>) -> tensor<?x?x?xf32>
     return %1 : tensor<?x?x?xf32>
   }
 )";
 
-static void Shuffle(llvm::ArrayRef<Tensor> inputs,
-                    llvm::Optional<Eigen::ThreadPoolDevice> device) {
-  std::array<int64_t, 3> perm = {0, 2, 1};
+static std::string Transpose(std::array<int32_t, 3> perm) {
+  return llvm::formatv(mlir_input, perm[0], perm[1], perm[2]);
+}
 
-  std::array<int64_t, 3> shuffled;
-  for (unsigned d = 0; d < 3; d++) shuffled[d] = inputs[0].dim_size(perm[d]);
+static auto Shuffle(std::array<int32_t, 3> perm) {
+  return [perm](llvm::ArrayRef<Tensor> inputs,
+                llvm::Optional<Eigen::ThreadPoolDevice> device) {
+    std::array<int64_t, 3> shuffled;
+    for (unsigned d = 0; d < 3; d++) shuffled[d] = inputs[0].dim_size(perm[d]);
 
-  Tensor output(DT_FLOAT, TensorShape(shuffled));
+    Tensor output(DT_FLOAT, TensorShape(shuffled));
 
-  auto in0 = inputs[0].tensor<float, 3>();
-  auto out0 = output.tensor<float, 3>();
+    auto in0 = inputs[0].tensor<float, 3>();
+    auto out0 = output.tensor<float, 3>();
 
-  if (device.hasValue()) {
-    out0.device(*device) = in0.shuffle(perm);
-  } else {
-    out0 = in0.shuffle(perm);
-  }
+    if (device.hasValue()) {
+      out0.device(*device) = in0.shuffle(perm);
+    } else {
+      out0 = in0.shuffle(perm);
+    }
+  };
 }
 
 static llvm::SmallVector<InputTensorSpec> Inputs(ssize_t dim) {
@@ -55,9 +63,28 @@ static llvm::SmallVector<InputTensorSpec> Inputs(ssize_t dim) {
 
 #define BM(FN) BM_##FN->Arg(0)->Arg(4)->Arg(8);
 
-BM(Cpurt(Transpose, mlir_input, "compute", Inputs(256)));
-BM(CpurtVectorized(Transpose, mlir_input, "compute", Inputs(256)));
-BM(Tfrt(Transpose, mlir_input, "compute", Inputs(256)));
-BM(Eigen(Transpose, Shuffle, Inputs(256)));
+// Transpose: [0, 2, 1]
+BM(Cpurt(Transpose_0x2x1, Transpose({0, 2, 1}), "compute", Inputs(256)));
+BM(CpurtV(Transpose_0x2x1, Transpose({0, 2, 1}), "compute", Inputs(256)));
+BM(Tfrt(Transpose_0x2x1, Transpose({0, 2, 1}), "compute", Inputs(256)));
+BM(Eigen(Transpose_0x2x1, Shuffle({0, 2, 1}), Inputs(256)));
+
+// Transpose: [2, 0, 1]
+BM(Cpurt(Transpose_2x0x1, Transpose({2, 0, 1}), "compute", Inputs(256)));
+BM(CpurtV(Transpose_2x0x1, Transpose({2, 0, 1}), "compute", Inputs(256)));
+BM(Tfrt(Transpose_2x0x1, Transpose({2, 0, 1}), "compute", Inputs(256)));
+BM(Eigen(Transpose_2x0x1, Shuffle({2, 0, 1}), Inputs(256)));
+
+// Transpose: [2, 1, 0]
+BM(Cpurt(Transpose_2x1x0, Transpose({2, 1, 0}), "compute", Inputs(256)));
+BM(CpurtV(Transpose_2x1x0, Transpose({2, 1, 0}), "compute", Inputs(256)));
+BM(Tfrt(Transpose_2x1x0, Transpose({2, 1, 0}), "compute", Inputs(256)));
+BM(Eigen(Transpose_2x1x0, Shuffle({2, 1, 0}), Inputs(256)));
+
+// Transpose: [1, 2, 0]
+BM(Cpurt(Transpose_1x2x0, Transpose({1, 2, 0}), "compute", Inputs(256)));
+BM(CpurtV(Transpose_1x2x0, Transpose({1, 2, 0}), "compute", Inputs(256)));
+BM(Tfrt(Transpose_1x2x0, Transpose({1, 2, 0}), "compute", Inputs(256)));
+BM(Eigen(Transpose_1x2x0, Shuffle({1, 2, 0}), Inputs(256)));
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_cpurt_ops.cc b/tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_cpurt_ops.cc
index c486d40cbaa259..4d3b84e828f427 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_cpurt_ops.cc
+++ b/tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_cpurt_ops.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_cpurt_ops.h"
 
+#include <algorithm>
+
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/OpDefinition.h"
@@ -30,7 +32,7 @@ namespace tf_cpurt {
 // CpuRuntimeDialect Dialect
 //===----------------------------------------------------------------------===//
 
-CpuRuntimeDialect::CpuRuntimeDialect(mlir::MLIRContext *context)
+CpuRuntimeDialect::CpuRuntimeDialect(mlir::MLIRContext* context)
     : Dialect(/*name*/ "tf_cpurt", context,
               mlir::TypeID::get<CpuRuntimeDialect>()) {
   addOperations<
@@ -39,6 +41,46 @@ CpuRuntimeDialect::CpuRuntimeDialect(mlir::MLIRContext *context)
       >();
 }
 
+// Computes the number of elements in the tensor type. Optimistically use `1` as
+// a size of all unknown dimensions. These heuristics match cost estimates of
+// the fallback_async::ExecuteOp operations.
+static int64_t GetRankedTensorSize(TensorType tensor) {
+  assert(tensor.hasRank() && "shape must be ranked");
+  if (!tensor.hasRank()) return 0;
+
+  int64_t size = 1;  // scalars (rank 0) have size 1
+  for (int64_t dim : tensor.getShape()) size *= std::max<int64_t>(1, dim);
+  return size;
+}
+
+int64_t FallbackExecuteOp::cost() {
+  Operation* self = getOperation();
+
+  // Find the referenced kernel function.
+  auto kernel_fn = SymbolTable::lookupNearestSymbolFrom<FuncOp>(self, kernel());
+  if (!kernel_fn) return 1;
+
+  int64_t cost = 0;
+
+  // Get the sum of sizes of all ranked inputs.
+  //
+  // TODO(ezhulenev): Once we have a proper cost model for MLIR operations,
+  // use it to compute a more precise cost estimation.
+  for (Type type : kernel_fn.getArgumentTypes()) {
+    TensorType tensor = type.dyn_cast<TensorType>();
+    if (!tensor || !tensor.hasRank()) continue;
+
+    cost += GetRankedTensorSize(tensor);
+  }
+
+  // Scale the cost by the number of operations in the function body. The choice
+  // of log2 function is arbitrary, seems to work well in benchmarks.
+  double scale = std::log2(kernel_fn.body().front().getOperations().size());
+  cost *= scale;
+
+  return std::max<int64_t>(1, cost);
+}
+
 }  // namespace tf_cpurt
 }  // end namespace mlir
 
diff --git a/tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_cpurt_ops.td b/tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_cpurt_ops.td
index f4e093d1322c76..5311d298fad8d6 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_cpurt_ops.td
+++ b/tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_cpurt_ops.td
@@ -54,14 +54,18 @@ def FallbackCompileOp : TF_CPURT_Op<"fallback.compile",
     [TFRT_CostFunctionInterface, TFRT_FixedCost<1>]> {
   let summary = "compiles kernel at runtime using LLVM JIT compiler";
   let description = [{
-    `tf_cpurt.fallback.compile` compiles a Tensorflow program defined by the
-    kernel function in the nested module to the CPURT JIT executable using
-    LLVM JIT APIs and caches it in the JitExecutableCache owned by the
-    resource context.
+    `tf_cpurt.fallback.compile` schedules the compilation of a Tensorflow
+    program defined by the kernel function in the nested module to the CPURT JIT
+    executable using LLVM JIT APIs and caches it in the JitExecutableCache owned
+    by the resource context.
+
+    Compilation happens asynchronously by launching compilation task into the
+    dedicated thread pool, and the kernel returns an available chain once the
+    task is scheduled.
 
     This kernel can be used in the init function to make sure that when execute
-    kernel (see definition below) called at runtime, the compiled kernel will be
-    already pre-compiled and ready to run.
+    kernel (see definition below) called at runtime, the default compiled kernel
+    will be already pre-compiled and ready to run.
     ```
   }];
 
@@ -70,7 +74,7 @@ def FallbackCompileOp : TF_CPURT_Op<"fallback.compile",
     StrAttr:$device
   );
 
-  let results = (outs TFRT_ChainType:$compiled);
+  let results = (outs TFRT_ChainType:$scheduled);
 
   let assemblyFormat = [{
     $kernel `device` `(` $device `)` attr-dict
@@ -78,7 +82,7 @@ def FallbackCompileOp : TF_CPURT_Op<"fallback.compile",
 }
 
 def FallbackExecuteOp : TF_CPURT_Op<"fallback.execute",
-    [TFRT_CostFunctionInterface, TFRT_FixedCost<1>]> {
+    [DeclareOpInterfaceMethods<TFRT_CostFunctionInterface>]> {
   let summary = "cpurt execute operation with a fallback runtime interop";
   let description = [{
     `tf_cpurt.fallback.execute` compiles a Tensorflow program defined by the
diff --git a/tensorflow/compiler/mlir/tfrt/jit/python_binding/BUILD b/tensorflow/compiler/mlir/tfrt/jit/python_binding/BUILD
index deed42e9053796..cd58526f41a56a 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/python_binding/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/jit/python_binding/BUILD
@@ -43,7 +43,7 @@ pybind_extension(
         "//third_party/eigen3",
         "//third_party/python_runtime:headers",  # build_cleaner: keep
         "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:Transforms",
+        "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:mlir_c_runner_utils",
         "@pybind11",
         "@tf_runtime//:dtype",
diff --git a/tensorflow/compiler/mlir/tfrt/jit/python_binding/tf_cpurt_executor.cc b/tensorflow/compiler/mlir/tfrt/jit/python_binding/tf_cpurt_executor.cc
index 5ce3f13365783d..582deaa6152e61 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/python_binding/tf_cpurt_executor.cc
+++ b/tensorflow/compiler/mlir/tfrt/jit/python_binding/tf_cpurt_executor.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include <string>
 #include <utility>
 
+#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
 #include "mlir/ExecutionEngine/CRunnerUtils.h"
-#include "mlir/Transforms/Bufferize.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h"
 #include "tensorflow/compiler/mlir/tfrt/jit/tf_cpurt_pipeline.h"
@@ -90,7 +90,7 @@ TfCpurtExecutor::Handle TfCpurtExecutor::Compile(const std::string& mlir_module,
     tensorflow::CreateTfCpuRtPipeline(pm, opts);
   };
   opts.specialization = specialization;
-  opts.type_converter = mlir::BufferizeTypeConverter();
+  opts.type_converter = mlir::bufferization::BufferizeTypeConverter();
 
   // Instantiate new JitExecutable from the MLIR source.
   llvm::Expected<JitExecutable> jit_executable =
diff --git a/tensorflow/compiler/mlir/tfrt/jit/tf_cpurt_kernels.cc b/tensorflow/compiler/mlir/tfrt/jit/tf_cpurt_kernels.cc
index 3c23052912a09d..406e1c2808e52e 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/tf_cpurt_kernels.cc
+++ b/tensorflow/compiler/mlir/tfrt/jit/tf_cpurt_kernels.cc
@@ -15,12 +15,13 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
+#include <memory>
 #include <string>
 #include <utility>
 
 #include "mlir/Dialect/Async/IR/AsyncTypes.h"
+#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
 #include "mlir/ExecutionEngine/AsyncRuntime.h"
-#include "mlir/Transforms/Bufferize.h"
 #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h"
 #include "tensorflow/compiler/mlir/tfrt/jit/tf_cpurt.h"
 #include "tensorflow/compiler/mlir/tfrt/jit/tf_cpurt_pipeline.h"
@@ -29,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
+#include "tensorflow/core/platform/threadpool.h"
 #include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h"
 #include "tensorflow/core/tfrt/utils/fallback_tensor.h"
@@ -44,6 +46,7 @@ limitations under the License.
 #include "tfrt/host_context/host_context.h"  // from @tf_runtime
 #include "tfrt/host_context/kernel_registry.h"  // from @tf_runtime
 #include "tfrt/host_context/kernel_utils.h"  // from @tf_runtime
+#include "tfrt/host_context/shared_context.h"  // from @tf_runtime
 #include "tfrt/support/error_util.h"  // from @tf_runtime
 #include "tfrt/support/forward_decls.h"  // from @tf_runtime
 #include "tfrt/support/rc_array.h"  // from @tf_runtime
@@ -70,11 +73,11 @@ using ::tfrt::Attribute;
 using ::tfrt::Chain;
 using ::tfrt::CompilationUnitAttribute;
 using ::tfrt::DType;
-using ::tfrt::EnqueueWork;
 using ::tfrt::ExecutionContext;
+using ::tfrt::HostContext;
 using ::tfrt::IndirectAsyncValue;
 using ::tfrt::KernelRegistry;
-using ::tfrt::MakeConstructedAsyncValueRef;
+using ::tfrt::MakeAvailableAsyncValueRef;
 using ::tfrt::MakeErrorAsyncValueRef;
 using ::tfrt::MakeStringError;
 using ::tfrt::RCArray;
@@ -82,6 +85,7 @@ using ::tfrt::RCReference;
 using ::tfrt::RemainingResults;
 using ::tfrt::RepeatedArguments;
 using ::tfrt::RequestContext;
+using ::tfrt::SharedContext;
 using ::tfrt::StrCat;
 using ::tfrt::StringAttribute;
 using ::tfrt::TaskFunction;
@@ -97,11 +101,43 @@ using ::tfrt::cpu::jit::ReturnAsyncStridedMemref;
 using ::tfrt::cpu::jit::ReturnStridedMemref;
 using ::tfrt::cpu::jit::ReturnValueConverter;
 
+using ::tensorflow::Env;
+using ::tensorflow::thread::ThreadPool;
+
 using ::tensorflow::profiler::TraceMe;
 using ::tensorflow::profiler::TraceMeEncode;
 using ::tensorflow::tfd::KernelFallbackCompatRequestState;
 using ::tensorflow::tfrt_stub::FallbackTensor;
 
+// -------------------------------------------------------------------------- //
+// Dedicated thread pool for running compilation tasks.
+// -------------------------------------------------------------------------- //
+
+class CompilationThreadPool : public SharedContext {
+ public:
+  explicit CompilationThreadPool(HostContext* host)
+      : thread_pool_(Env::Default(), "tf-cpurt-compiler", /*num_threads=*/16) {}
+
+  static CompilationThreadPool& Get(const ExecutionContext& exec_ctx) {
+    return exec_ctx.host()->GetOrCreateSharedContext<CompilationThreadPool>();
+  }
+
+  template <typename Task>
+  void Schedule(Task&& task) {
+    // Because compilation tasks can capture move only types, and Tensorflow
+    // thread pool requires std::function tasks, we have to do manual memory
+    // management here.
+    auto ptr = std::make_unique<Task>(std::forward<Task>(task));
+    thread_pool_.Schedule([ptr = ptr.release()]() {
+      (*ptr)();
+      delete ptr;
+    });
+  }
+
+ private:
+  ThreadPool thread_pool_;
+};
+
 // -------------------------------------------------------------------------- //
 // JIT compiled kernels use Eigen ThreadPool managed by the kernel fallback as
 // an async runtime worker threads.
@@ -250,8 +286,8 @@ static Expected<AsyncValuePtr<JitExecutable>> CompileImpl(
   // events happen too often, it is a much larger problem than the excessive
   // tracing.
 
-  // Custom runner for compiling specializations that enqueues compilation task
-  // into the host context work queue and adds tracing.
+  // Custom runner for compiling specializations that schedules compilation task
+  // into the dedicated thread pool and adds tracing.
   auto runner = [kernel, request_id](size_t num_specializations,
                                      ArrayRef<OperandConstraint> constraints,
                                      ArrayRef<MemrefDesc> operands,
@@ -276,17 +312,18 @@ static Expected<AsyncValuePtr<JitExecutable>> CompileImpl(
                         AsTensorContent(operands[i]));
     }
 
-    // TODO(ezhulenev): BEF file that owns the CompilationUnitAttribute in
-    // theory can be unloaded before the completion of the compilation task.
-    // It can't happen right now, because we require specialized compilation to
-    // finish before returning the response, however for safety tracing
-    // attributes that require the kernel attribute should be constructed in the
-    // caller thread.
-
-    // Run the actual compilation asynchronously without blocking the caller.
-    EnqueueWork(exec_ctx, [request_id, kernel, num_specializations,
-                           compile = std::move(compile),
-                           args = std::move(args)]() mutable {
+    // Schedule specialization compilation task into the dedicated thread pool.
+    CompilationThreadPool& thread_pool = CompilationThreadPool::Get(exec_ctx);
+
+    thread_pool.Schedule([request_id, kernel, num_specializations,
+                          compile = std::move(compile),
+                          args = std::move(args)]() mutable {
+      // TODO(ezhulenev): BEF file that owns the CompilationUnitAttribute in
+      // theory can be unloaded before the completion of the compilation task.
+      // It can't happen right now, because we require specialized compilation
+      // to finish before returning the response, however for safety tracing
+      // attributes that require the `kernel` attribute should be constructed in
+      // the caller thread.
       absl::string_view name(kernel.root_symbol().data(),
                              kernel.root_symbol().size());
       TraceMe trace_me([&] {
@@ -314,9 +351,11 @@ static Expected<AsyncValuePtr<JitExecutable>> CompileImpl(
     });
   };
 
-  // Compile kernel asynchronously in the host context thread pool.
-  EnqueueWork(exec_ctx, [kernel, request_id, runner, workers = *worker_threads,
-                         ptr = entry.ptr, tf_cpurt_opts = opts]() {
+  // Compile kernel asynchronously in the compilation thread pool.
+  CompilationThreadPool& thread_pool = CompilationThreadPool::Get(exec_ctx);
+
+  thread_pool.Schedule([kernel, request_id, runner, workers = *worker_threads,
+                        ptr = entry.ptr, tf_cpurt_opts = opts]() {
     TraceMe trace_me([&] {
       absl::string_view name(kernel.root_symbol().data(),
                              kernel.root_symbol().size());
@@ -334,7 +373,7 @@ static Expected<AsyncValuePtr<JitExecutable>> CompileImpl(
     // All entry memrefs must have alignment compatible with Tensorflow.
     opts.alignment = EIGEN_MAX_ALIGN_BYTES;  // Eigen included by tensor.h
     opts.num_worker_threads = workers->NumThreads();
-    opts.type_converter = mlir::BufferizeTypeConverter();
+    opts.type_converter = mlir::bufferization::BufferizeTypeConverter();
     opts.register_dialects = mlir::RegisterAllTensorFlowDialects;
 
     // Register a custom pipeline for lowering from Tensorflow dialect.
@@ -378,21 +417,12 @@ static AsyncValueRef<Chain> Compile(StringAttribute device,
   Expected<AsyncValuePtr<JitExecutable>> executable =
       CompileImpl(kernel, exec_ctx);
 
-  // Return immediately if can't compile the kernel.
+  // Return error if can't schedule the compilation task.
   if (auto err = executable.takeError())
     return MakeErrorAsyncValueRef(StrCat(err));
 
-  // Signal compilation completion using an async chain.
-  auto compiled = MakeConstructedAsyncValueRef<Chain>();
-
-  executable->AndThen([executable = *executable, res = compiled.CopyRef()]() {
-    if (executable.IsError())
-      res.SetError(executable.GetError());
-    else
-      res.SetStateConcrete();
-  });
-
-  return compiled;
+  // Immediately return an available chain once we schedule the compilation.
+  return MakeAvailableAsyncValueRef<Chain>();
 }
 
 // -------------------------------------------------------------------------- //
@@ -441,7 +471,7 @@ struct DebugListener : public JitExecutable::Listener {
     std::string message;
     llvm::raw_string_ostream os(message);
     os << "Specialized operands:\n";
-    for (auto tuple : llvm::enumerate(llvm::zip(operands, attrs))) {
+    for (auto& tuple : llvm::enumerate(llvm::zip(operands, attrs))) {
       mlir::Type type = std::get<0>(tuple.value());
       mlir::Attribute attr = std::get<1>(tuple.value());
       os << "%arg" << tuple.index() << ": " << type << " " << attr << "\n";
@@ -469,8 +499,12 @@ static void ExecuteImpl(Executable& executable,
   TraceMe trace_me([&] {
     int64_t id = exec_ctx.request_ctx()->id();
     absl::string_view name(executable.name().data(), executable.name().size());
-    return TraceMeEncode("tf_cpurt.Execute",
-                         {{"id", id}, {"executable", name}});
+    return TraceMeEncode(
+        "tf_cpurt.Execute",
+        {{"id", id},
+         {"executable", name},
+         {"specialized", executable.specialized() ? "true" : "false"},
+         {"num_worker_threads", executable.num_worker_threads()}});
   });
 
   // Keep track of memory address to tensor mapping for result conversion.
diff --git a/tensorflow/compiler/mlir/tfrt/jit/tf_cpurt_pipeline.cc b/tensorflow/compiler/mlir/tfrt/jit/tf_cpurt_pipeline.cc
index 88b5103f7eaa76..8982c48f72ffa9 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/tf_cpurt_pipeline.cc
+++ b/tensorflow/compiler/mlir/tfrt/jit/tf_cpurt_pipeline.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "mlir/Conversion/ShapeToStandard/ShapeToStandard.h"
 #include "mlir/Conversion/VectorToSCF/VectorToSCF.h"
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"
 #include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/Shape/Transforms/Passes.h"
@@ -153,7 +154,8 @@ void CreateTfCpuRtPipeline(mlir::OpPassManager& pm,
   pm.addPass(mlir::createCanonicalizerPass());
 
   // Deallocate all temporary buffers.
-  pm.addNestedPass<mlir::FuncOp>(mlir::createBufferDeallocationPass());
+  pm.addNestedPass<mlir::FuncOp>(
+      mlir::bufferization::createBufferDeallocationPass());
 
   // Do trivial buffer forwarding across linalg.generic operations.
   pm.addNestedPass<mlir::FuncOp>(CreateLinalgTrivialBufferForwardingPass());
@@ -161,14 +163,6 @@ void CreateTfCpuRtPipeline(mlir::OpPassManager& pm,
   // Remove trivial copy operations.
   pm.addNestedPass<mlir::FuncOp>(CreateLinalgTrivialCopyRemovalPass());
 
-  // Specilize linalg.matmul to linalg.dot, linalg.matvec or linalg.vecmat, and
-  // immediately canonicalize to clean up not taken branches.
-  pm.addNestedPass<mlir::FuncOp>(CreateLinalgMatmulSpecializationPass());
-  pm.addPass(mlir::createCanonicalizerPass());
-
-  // Tile and vectorize linalg operation using Linalg Codegen Strategy.
-  pm.addNestedPass<mlir::FuncOp>(CreateCodegenStrategyForMatMulPass());
-
   if (options.vectorize) {
     pm.addNestedPass<mlir::FuncOp>(
         mlir::createConvertLinalgTiledLoopsToSCFPass());
diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/BUILD b/tensorflow/compiler/mlir/tfrt/jit/transforms/BUILD
index 26f4477adcd48d..3a0ddcdc42d610 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/transforms/BUILD
+++ b/tensorflow/compiler/mlir/tfrt/jit/transforms/BUILD
@@ -43,7 +43,6 @@ cc_library(
         "tf_cpurt_buffer_forwarding.cc",
         "tf_cpurt_clustering_pass.cc",
         "tf_cpurt_codegen_cwise.cc",
-        "tf_cpurt_codegen_matmul.cc",
         "tf_cpurt_codegen_reduction.cc",
         "tf_cpurt_copy_removal.cc",
         "tf_cpurt_detensorize_linalg.cc",
@@ -51,7 +50,6 @@ cc_library(
         "tf_cpurt_fusion.cc",
         "tf_cpurt_legalize_i1_type.cc",
         "tf_cpurt_math_approximation.cc",
-        "tf_cpurt_matmul_specialization.cc",
         "tf_cpurt_pad_tiled_ops.cc",
         "tf_cpurt_passes.cc",
         "tf_cpurt_peel_tiled_loops.cc",
@@ -72,6 +70,7 @@ cc_library(
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
         "//tensorflow/compiler/mlir/xla:xla_legalize_tf",
         "@llvm-project//llvm:Support",
+        "@llvm-project//mlir:Analysis",
         "@llvm-project//mlir:ArithmeticDialect",
         "@llvm-project//mlir:DialectUtils",
         "@llvm-project//mlir:IR",
diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_clustering.h b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_clustering.h
index 3a9a966aa242f0..efea538a11f584 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_clustering.h
+++ b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_clustering.h
@@ -33,6 +33,9 @@ enum class CpurtClusteringTier : uint8_t {
   kMetadata = 0x4,    // shape, reshape, ...
   kReductions = 0x8,  // all, any, min, max, mean, prod, sum
 
+  // Only cwise operations (unary, binary, ternary).
+  kTier0 = kCwise,
+
   // All cwise operations (unary, binary, ternary) plus a tf.Transpose.
   kTier1 = kCwise | kTranspose,
 
diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_clustering_pass.cc b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_clustering_pass.cc
index 39167145dd0ce8..edfa6483c0a4f0 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_clustering_pass.cc
+++ b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_clustering_pass.cc
@@ -48,7 +48,9 @@ struct ClusteringPass : public ClusteringBase<ClusteringPass> {
     llvm::Optional<CpurtClusteringTier> tier;
 
     for (const auto& op : oplist) {
-      if (op == "tier1") {
+      if (op == "tier0") {
+        tier = CpurtClusteringTier::kTier0;
+      } else if (op == "tier1") {
         tier = CpurtClusteringTier::kTier1;
       } else if (op == "tier1metadata") {
         tier = CpurtClusteringTier::kTier1Metadata;
diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_codegen_matmul.cc b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_codegen_matmul.cc
deleted file mode 100644
index b1ab8f6dc6fea4..00000000000000
--- a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_codegen_matmul.cc
+++ /dev/null
@@ -1,103 +0,0 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
-#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
-#include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.h"
-
-namespace tensorflow {
-namespace {
-
-#define GEN_PASS_CLASSES
-#include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.h.inc"
-
-// Use Linalg CodegenStrategy to tile linalg.matmul, linalg.matvec and
-// linalg.vecmat operations.
-struct CodegenStrategyForMatMulPass
-    : public CodegenMatmulBase<CodegenStrategyForMatMulPass> {
-  void runOnFunction() override {
-    // Promote tiles to full buffers allocated on the stack.
-    mlir::linalg::LinalgPromotionOptions full_alloca_promotion;
-    full_alloca_promotion.setUseFullTileBuffersByDefault(true).setUseAlloca(
-        true);
-
-    // TODO(ezhulenev): Set up tiling options depending on the target machine.
-
-    // Tile and vectorize linalg.matmul operations.
-    mlir::linalg::LinalgTilingOptions matmul_tiling;
-    matmul_tiling.setTileSizes({12, 32, 16});
-
-    mlir::linalg::CodegenStrategy matmul_strategy;
-    matmul_strategy
-        .tile(mlir::linalg::MatmulOp::getOperationName(), matmul_tiling)
-        .promote(mlir::linalg::MatmulOp::getOperationName(),
-                 full_alloca_promotion)
-        .vectorize(mlir::linalg::MatmulOp::getOperationName());
-    // Created a nested OpPassManager, populate the strategy and run.
-    mlir::FuncOp f = getFunction();
-    mlir::OpPassManager dynamicPM("builtin.func");
-    matmul_strategy.configurePassPipeline(dynamicPM, f.getContext());
-    if (failed(runPipeline(dynamicPM, f))) return signalPassFailure();
-
-    // Tile and vectorize linalg.vecmat operations. Interchange loop order to
-    // linearly read from the matrix memref.
-    mlir::linalg::LinalgTilingOptions vecmat_tiling;
-    vecmat_tiling.setTileSizes({16, 8}).setInterchange({1, 0});
-
-    mlir::linalg::CodegenStrategy vecmat_strategy;
-    vecmat_strategy
-        .tile(mlir::linalg::VecmatOp::getOperationName(), vecmat_tiling)
-        .promote(mlir::linalg::VecmatOp::getOperationName(),
-                 full_alloca_promotion)
-        .vectorize(mlir::linalg::VecmatOp::getOperationName());
-    // Created a nested OpPassManager, populate the strategy and run.
-    mlir::OpPassManager dynamicPM2("builtin.func");
-    vecmat_strategy.configurePassPipeline(dynamicPM2, f.getContext());
-    if (failed(runPipeline(dynamicPM2, f))) return signalPassFailure();
-
-    // Vector contraction options.
-    mlir::vector::VectorTransformsOptions vector_transforms_ops;
-    vector_transforms_ops.setVectorTransformsOptions(
-        mlir::vector::VectorContractLowering::OuterProduct);
-
-    // Vector transfer options.
-    mlir::VectorTransferToSCFOptions vector_transfer_opts;
-    vector_transfer_opts.enableFullUnroll();
-
-    mlir::linalg::CodegenStrategy vector_lowering_strategy;
-    vector_lowering_strategy.vectorLowering(
-        mlir::linalg::LinalgVectorLoweringOptions()
-            .enableTransferPartialRewrite()
-            .enableContractionLowering()
-            .enableTransferToSCFConversion()
-            .setVectorTransformsOptions(vector_transforms_ops)
-            .setVectorTransferToSCFOptions(vector_transfer_opts));
-    // Created a nested OpPassManager, populate the strategy and run.
-    mlir::OpPassManager dynamicPM3("builtin.func");
-    vector_lowering_strategy.configurePassPipeline(dynamicPM3, f.getContext());
-    if (failed(runPipeline(dynamicPM3, f))) return signalPassFailure();
-  }
-
-  void getDependentDialects(mlir::DialectRegistry& registry) const override {
-    registry.insert<mlir::vector::VectorDialect>();
-  }
-};
-}  // namespace
-
-std::unique_ptr<mlir::FunctionPass> CreateCodegenStrategyForMatMulPass() {
-  return std::make_unique<CodegenStrategyForMatMulPass>();
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_codegen_reduction.cc b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_codegen_reduction.cc
index e53d873a0837fb..26f8c5b0ee3dd5 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_codegen_reduction.cc
+++ b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_codegen_reduction.cc
@@ -15,14 +15,17 @@ limitations under the License.
 
 #include <utility>
 
+#include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
 #include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h"
 #include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.h"
 
@@ -38,7 +41,9 @@ using mlir::BlockArgument;
 using mlir::cast;
 using mlir::dyn_cast;
 using mlir::failure;
+using mlir::FailureOr;
 using mlir::Identifier;
+using mlir::Location;
 using mlir::LogicalResult;
 using mlir::MLIRContext;
 using mlir::OpBuilder;
@@ -46,6 +51,8 @@ using mlir::Operation;
 using mlir::OpRewritePattern;
 using mlir::PatternRewriter;
 using mlir::RankedTensorType;
+using mlir::ShapedType;
+using mlir::SmallVector;
 using mlir::success;
 using mlir::Value;
 using mlir::ValueRange;
@@ -65,6 +72,16 @@ using mlir::linalg::YieldOp;
 using mlir::tensor::ExtractSliceOp;
 using mlir::tensor::InsertSliceOp;
 
+// Detects the combiner in the body of LinalgOp if any. Currently, only
+// ops with a single combiner are supported.
+FailureOr<Operation *> DetectCombiner(LinalgOp linalg_op) {
+  SmallVector<Operation *, 4> combiners;
+  if (!matchReduction(linalg_op.getRegionOutputArgs(), 0, combiners) ||
+      combiners.size() != 1)
+    return failure();
+  return combiners.front();
+}
+
 // Tiles a GenericOp that models a reduction and then fuses its inputs and
 // outputs. Currently, only the FillOp that initializes the output is fused into
 // the TiledLoopOp.
@@ -82,6 +99,7 @@ struct RowOrColumnReductionTilingPattern : public OpRewritePattern<GenericOp> {
     if (failed(filter.checkAndNotify(rewriter, linalg_op))) return failure();
 
     if (linalg_op.getNumOutputs() != 1) return failure();
+    if (linalg_op.getNumLoops() != 2) return failure();
 
     auto tiled_op = tileLinalgOp(rewriter, linalg_op, options);
     if (failed(tiled_op)) return failure();
@@ -222,29 +240,36 @@ struct RowOrColumnReductionTilingPattern : public OpRewritePattern<GenericOp> {
   //
   //   linalg.yield %insert_output_slice, %update_cloned_output
   // }
-  void CombineReducedTileWithOutput(PatternRewriter &rewriter,
-                                    LinalgOp tiled_op, Value partial_result,
-                                    ExtractSliceOp extract_output_slice,
-                                    InsertSliceOp insert_output_slice) const {
+  LogicalResult CombineReducedTileWithOutput(
+      PatternRewriter &rewriter, LinalgOp tiled_op, Value partial_result,
+      ExtractSliceOp extract_output_slice,
+      InsertSliceOp insert_output_slice) const {
     rewriter.setInsertionPointAfter(tiled_op);
     auto num_parallel_loops = tiled_op.getNumParallelLoops();
-    mlir::SmallVector<mlir::StringRef, 3> parallel_iter_types(
+    SmallVector<mlir::StringRef, 3> parallel_iter_types(
         num_parallel_loops, mlir::getParallelIteratorTypeName());
     auto id_map = rewriter.getMultiDimIdentityMap(num_parallel_loops);
 
+    auto combiner_or = DetectCombiner(tiled_op);
+    if (failed(combiner_or)) return failure();
+    Operation *combiner = combiner_or.getValue();
+
     auto accumulator = rewriter.create<GenericOp>(
         tiled_op.getLoc(), partial_result.getType(),
         makeArrayRef(partial_result),
         makeArrayRef(extract_output_slice.result()),
-        makeArrayRef({id_map, id_map}), parallel_iter_types);
+        makeArrayRef({id_map, id_map}), parallel_iter_types,
+        [&](OpBuilder &b, Location nested_loc, ValueRange args) {
+          BlockAndValueMapping bvm;
+          bvm.map(combiner->getOperands(), args);
+          Value result_val = b.clone(*combiner, bvm)->getResult(0);
+          b.create<YieldOp>(nested_loc, result_val);
+        });
 
-    auto reduce_tile = mlir::cast<GenericOp>(tiled_op);
-    BlockAndValueMapping bvm;
-    rewriter.cloneRegionBefore(reduce_tile.region(), accumulator.region(),
-                               accumulator.region().end(), bvm);
     rewriter.updateRootInPlace(insert_output_slice, [&]() {
       insert_output_slice.sourceMutable().assign(accumulator.getResult(0));
     });
+    return success();
   }
 
   // Unfortunaly, there is no way to modify the results of the loop inplace. So
@@ -297,8 +322,10 @@ struct RowOrColumnReductionTilingPattern : public OpRewritePattern<GenericOp> {
         CloneAndAppendInitTensorToTiledLoop(rewriter, fill, tiled_loop);
     FuseFill(rewriter, tiled_op, fill, loop_output_bb_arg, cloned_output_bb_arg,
              extract_output_slice, insert_output_slice);
-    CombineReducedTileWithOutput(rewriter, tiled_op, tiled_op_result,
-                                 extract_output_slice, insert_output_slice);
+    if (mlir::failed(CombineReducedTileWithOutput(
+            rewriter, tiled_op, tiled_op_result, extract_output_slice,
+            insert_output_slice)))
+      return failure();
 
     // Update the results.
     TiledLoopOp updated_loop =
@@ -365,13 +392,19 @@ struct OneDimReductionTilingPattern : public OpRewritePattern<GenericOp> {
   LogicalResult matchAndRewrite(GenericOp linalg_op,
                                 PatternRewriter &rewriter) const override {
     if (failed(filter.checkAndNotify(rewriter, linalg_op))) return failure();
-    if (linalg_op.getNumLoops() != 1) return failure();
+    if (linalg_op.getNumOutputs() != 1) return failure();
 
-    // This condition has to be relaxed to support fused inputs.
-    if (linalg_op.getNumInputs() != 1) return failure();
+    // Check if all inputs have a 1D identity map.
+    if (linalg_op.getNumLoops() != 1) return failure();
+    auto indexing_maps = linalg_op.getIndexingMaps();
+    for (auto affine_map : makeArrayRef(indexing_maps).drop_back()) {
+      if (!affine_map.isIdentity()) return failure();
+    }
 
-    mlir::Location loc = linalg_op.getLoc();
+    Location loc = linalg_op.getLoc();
     Value input = linalg_op.getInputOperand(0)->get();
+    // All inputs have the same size because of identity maps for indexing.
+    SmallVector<Value> inputs = linalg_op.inputs();
     Value input_size = rewriter.create<mlir::tensor::DimOp>(loc, input, 0);
 
     auto fill_op = linalg_op.outputs().front().getDefiningOp<FillOp>();
@@ -391,42 +424,28 @@ struct OneDimReductionTilingPattern : public OpRewritePattern<GenericOp> {
     GenericOp tiled_reduction;
     auto tiled_loop_op = rewriter.create<TiledLoopOp>(
         loc, makeArrayRef(zero), makeArrayRef(input_size),
-        makeArrayRef(vector_size_value), makeArrayRef(input),
-        makeArrayRef(new_fill),
+        makeArrayRef(vector_size_value), inputs, makeArrayRef(new_fill),
         rewriter.getStrArrayAttr(mlir::getReductionIteratorTypeName()),
-        [&](OpBuilder &b, mlir::Location nested_loc, ValueRange ivs,
+        [&](OpBuilder &b, Location nested_loc, ValueRange ivs,
             ValueRange inputs, ValueRange outputs) {
-          auto tile_sizes = mlir::linalg::computeTileSizes(
-              b, nested_loc, ivs, vector_size_value, input_size);
-
-          // Extract slice of input.
-          Value slice = mlir::linalg::makeTiledShape(
-              b, nested_loc, inputs[0], vector_size_value,
-              rewriter.getMultiDimIdentityMap(1), ivs[0], input_size,
-              tile_sizes);
-
-          // Pad input tile.
-          Value pad = PadTensorOp::createPadHighOp(
-              RankedTensorType::get({vector_size}, element_type), slice,
-              neutral_value, false, nested_loc, b);
-
-          // Reshape input tile to tensor<1xVECTOR_SIZExELEM_TYPE>.
-          llvm::SmallVector<mlir::ReassociationIndices> indices = {{0, 1}};
-          Value expand_shape = b.create<TensorExpandShapeOp>(
-              nested_loc, RankedTensorType::get({1, vector_size}, element_type),
-              pad, indices);
-
-          // Create `linalg.generic` to reduce
-          // tensor<1xVECTOR_SIZExELEM_TYPE>->tensor<VECTOR_SIZExELEM_TYPE>.
-          mlir::SmallVector<mlir::StringRef, 2> iter_types{
+          SmallVector<Value, 2> reshaped_tiled_inputs =
+              TileAndReshapeInputTensors(b, nested_loc, ivs, inputs,
+                                         neutral_value, input_size,
+                                         vector_size_value);
+          // Create `linalg.generic` to combine
+          // `tensor<1xVECTOR_SIZExELEM_TYPE>1 input with the
+          // `tensor<VECTOR_SIZExELEM_TYPE>` output.
+          SmallVector<mlir::StringRef, 2> iter_types{
               mlir::getReductionIteratorTypeName(),
               mlir::getParallelIteratorTypeName()};
+          SmallVector<mlir::AffineMap, 2> indexing_maps(
+              inputs.size(), rewriter.getMultiDimIdentityMap(2));
+          indexing_maps.push_back(
+              mlir::AffineMap::get(2, 0, b.getAffineDimExpr(1)));
           tiled_reduction = b.create<GenericOp>(
-              nested_loc, outputs[0].getType(), makeArrayRef({expand_shape}),
-              makeArrayRef({outputs[0]}),
-              makeArrayRef({b.getMultiDimIdentityMap(2),
-                            mlir::AffineMap::get(2, 0, b.getAffineDimExpr(1))}),
-              iter_types, /*bodyBuild=*/nullptr);
+              nested_loc, outputs[0].getType(), reshaped_tiled_inputs,
+              makeArrayRef({outputs[0]}), indexing_maps, iter_types,
+              /*bodyBuild=*/nullptr);
           mlir::Region &region = tiled_reduction.region();
           OpBuilder::InsertionGuard g(rewriter);
           rewriter.cloneRegionBefore(linalg_op.region(), region, region.end());
@@ -434,9 +453,10 @@ struct OneDimReductionTilingPattern : public OpRewritePattern<GenericOp> {
         });
     // Create `linalg.generic` to reduce
     // tensor<VECTOR_SIZExELEM_TYPE>->tensor<ELEM_TYPE>.
-    BlockAndValueMapping bvm;
-    bvm.map(input, tiled_loop_op.getResult(0));
-    auto final_reduction = rewriter.clone(*linalg_op.getOperation(), bvm);
+    auto final_reduction_or =
+        ReduceVectorIntoOutput(rewriter, linalg_op, tiled_loop_op.getResult(0));
+    if (failed(final_reduction_or)) return failure();
+    auto final_reduction = final_reduction_or.getValue();
     rewriter.replaceOp(linalg_op, final_reduction->getResults());
 
     tiled_loop_op->walk([&](GenericOp op) {
@@ -446,6 +466,69 @@ struct OneDimReductionTilingPattern : public OpRewritePattern<GenericOp> {
     return success();
   }
 
+  // Tiles, pads and reshapes every input argument of type tensor<?xELEM_TYPE>
+  // into tensor<1xVECTOR_SIZExELEM_TYPE>.
+  SmallVector<Value, 2> TileAndReshapeInputTensors(
+      OpBuilder &b, Location nested_loc, ValueRange ivs, ValueRange inputs,
+      Value neutral_value, Value input_size, Value vector_size_value) const {
+    SmallVector<Value, 2> reshaped_tiled_inputs;
+
+    SmallVector<mlir::ReassociationIndices> indices = {{0, 1}};
+    auto identity_1d_map = b.getMultiDimIdentityMap(1);
+    auto iv = ivs.front();
+
+    auto tile_sizes = mlir::linalg::computeTileSizes(
+        b, nested_loc, ivs, vector_size_value, input_size);
+    for (auto input : inputs) {
+      // Extract slice of input.
+      Value slice = mlir::linalg::makeTiledShape(
+          b, nested_loc, input, vector_size_value, identity_1d_map, iv,
+          input_size, tile_sizes);
+      auto element_type = slice.getType().cast<ShapedType>().getElementType();
+
+      // Pad input tile.
+      Value pad = PadTensorOp::createPadHighOp(
+          RankedTensorType::get({vector_size}, element_type), slice,
+          neutral_value, false, nested_loc, b);
+
+      // Reshape input tile to tensor<1xVECTOR_SIZExELEM_TYPE>.
+      Value expand_shape = b.create<TensorExpandShapeOp>(
+          nested_loc, RankedTensorType::get({1, vector_size}, element_type),
+          pad, indices);
+      reshaped_tiled_inputs.push_back(expand_shape);
+    }
+    return reshaped_tiled_inputs;
+  }
+
+  // Creates `linalg.generic` to reduce
+  // tensor<VECTOR_SIZExELEM_TYPE>->tensor<ELEM_TYPE>. To perform that we match
+  // the combiner in the original "untiled" linalg_op.
+  FailureOr<GenericOp> ReduceVectorIntoOutput(PatternRewriter &rewriter,
+                                              LinalgOp linalg_op,
+                                              Value partial_result) const {
+    SmallVector<mlir::StringRef, 3> reduction_iter_type(
+        1, mlir::getReductionIteratorTypeName());
+    auto map = mlir::AffineMap::get(1, 0, llvm::None, rewriter.getContext());
+
+    auto combiner_or = DetectCombiner(linalg_op);
+    if (failed(combiner_or)) return failure();
+    Operation *combiner = combiner_or.getValue();
+
+    auto accumulator = rewriter.create<GenericOp>(
+        linalg_op.getLoc(), linalg_op->getResultTypes(),
+        makeArrayRef(partial_result),
+        makeArrayRef(linalg_op.getOutputOperand(0)->get()),
+        makeArrayRef({rewriter.getMultiDimIdentityMap(1), map}),
+        reduction_iter_type,
+        [&](OpBuilder &b, Location nested_loc, ValueRange args) {
+          BlockAndValueMapping bvm;
+          bvm.map(combiner->getOperands(), args);
+          Value result_val = b.clone(*combiner, bvm)->getResult(0);
+          b.create<YieldOp>(nested_loc, result_val);
+        });
+    return accumulator;
+  }
+
  private:
   LinalgTransformationFilter filter;
   int64_t vector_size;
@@ -456,8 +539,7 @@ bool isCanonicalizedReduction(Operation *op) {
   auto reduction = mlir::dyn_cast<GenericOp>(op);
   if (!reduction) return false;
 
-  if (reduction.getNumOutputs() != 1 || reduction.getNumLoops() > 2)
-    return false;
+  if (reduction.getNumLoops() > 2) return false;
   return reduction.getNumReductionLoops() == 1;
 }
 
diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_math_approximation.cc b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_math_approximation.cc
index f6d6a541d430ef..02259dcb83944d 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_math_approximation.cc
+++ b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_math_approximation.cc
@@ -189,7 +189,7 @@ struct EigenExpApproximation : public OpRewritePattern<math::ExpOp> {
 
 LogicalResult EigenExpApproximation::matchAndRewrite(
     math::ExpOp op, PatternRewriter &rewriter) const {
-  auto shape = vectorShape(op.operand().getType(), isF32);
+  auto shape = vectorShape(op.getOperand().getType(), isF32);
   if (!shape.hasValue())
     return rewriter.notifyMatchFailure(op, "unsupported operand type");
   ImplicitLocOpBuilder builder(op->getLoc(), rewriter);
@@ -221,7 +221,7 @@ LogicalResult EigenExpApproximation::matchAndRewrite(
   Value cstCephesExpP4 = bcast(f32Cst(builder, 1.6666665459E-1f));
   Value cstCephesExpP5 = bcast(f32Cst(builder, 5.0000001201E-1f));
 
-  Value x = clamp(builder, op.operand(), cstExpLo, cstExpHi);
+  Value x = clamp(builder, op.getOperand(), cstExpLo, cstExpHi);
   Value m = floor(fma(x, cstCephesLog2E, cstHalf));
 
   Value cstCephesExpC1 = bcast(f32Cst(builder, -0.693359375f));
@@ -239,7 +239,7 @@ LogicalResult EigenExpApproximation::matchAndRewrite(
   y1 = fma(y1, r, cstCephesExpP5);
   y = fma(y, r3, y1);
   y = fma(y, r2, y2);
-  Value ret = max(builder, ldexp(builder, y, m), op.operand());
+  Value ret = max(builder, ldexp(builder, y, m), op.getOperand());
   rewriter.replaceOp(op, ret);
   return mlir::success();
 }
diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_matmul_specialization.cc b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_matmul_specialization.cc
deleted file mode 100644
index 12e20f9f34cd85..00000000000000
--- a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_matmul_specialization.cc
+++ /dev/null
@@ -1,211 +0,0 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
-#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h"
-#include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.h"
-
-namespace tensorflow {
-namespace {
-
-#define GEN_PASS_CLASSES
-#include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.h.inc"
-
-// Convert 2D memref into a 0D memref (scalar).
-mlir::Value MemrefToScalar(mlir::OpBuilder& builder, mlir::Location loc,
-                           mlir::Value memref) {
-  auto memref_type = memref.getType().cast<mlir::MemRefType>();
-  auto scalar_type = mlir::MemRefType::get({}, memref_type.getElementType());
-
-  std::array<int64_t, 0> empty;
-  return builder.create<mlir::memref::ReinterpretCastOp>(
-      loc, scalar_type, memref, /*offset=*/0,
-      /*sizes=*/empty, /*strides=*/empty);
-}
-
-// Convert 2D memref into a 1D memref (vector).
-mlir::Value MemrefToVector(mlir::OpBuilder& builder, mlir::Location loc,
-                           mlir::Value memref, mlir::Value size,
-                           int64_t static_size) {
-  assert(static_size >= 0 || static_size == mlir::ShapedType::kDynamicSize);
-  auto memref_type = memref.getType().cast<mlir::MemRefType>();
-  auto vec_type =
-      mlir::MemRefType::get({static_size}, memref_type.getElementType());
-
-  auto static_offsets = builder.getI64ArrayAttr({0});
-  auto static_sizes = builder.getI64ArrayAttr({static_size});
-  auto static_strided = builder.getI64ArrayAttr({1});
-
-  auto empty = mlir::ValueRange();
-  auto sizes = static_size == mlir::ShapedType::kDynamicSize
-                   ? mlir::ValueRange(size)
-                   : mlir::ValueRange();
-
-  return builder.create<mlir::memref::ReinterpretCastOp>(
-      loc, vec_type, memref, /*offsets=*/empty,
-      /*sizes=*/sizes, /*strides=*/empty, static_offsets, static_sizes,
-      static_strided);
-}
-
-struct LinalgMatmulSpecializationPattern
-    : public mlir::OpRewritePattern<mlir::linalg::MatmulOp> {
-  using OpRewritePattern<mlir::linalg::MatmulOp>::OpRewritePattern;
-  mlir::LogicalResult matchAndRewrite(
-      mlir::linalg::MatmulOp matmul,
-      mlir::PatternRewriter& rewriter) const override;
-};
-mlir::LogicalResult LinalgMatmulSpecializationPattern::matchAndRewrite(
-    mlir::linalg::MatmulOp matmul, mlir::PatternRewriter& rewriter) const {
-  if (matmul->hasAttr("__tf_cpurt_specialized")) {
-    return rewriter.notifyMatchFailure(matmul,
-                                       "operation was already specialized");
-  }
-
-  auto rhs = matmul.getInputOperand(1)->get();
-  auto lhs = matmul.getInputOperand(0)->get();
-  auto out = matmul.getOutputOperand(0)->get();
-
-  // We do not support inputs or outputs that are not contiguous in memory.
-  if (!IsContiguousMemref(lhs) || !IsContiguousMemref(rhs) ||
-      !IsContiguousMemref(out)) {
-    return rewriter.notifyMatchFailure(
-        matmul, "inputs and output must be contiguous memrefs");
-  }
-
-  auto loc = matmul.getLoc();
-
-  // Matmul dimensions: [m, k] x [k, n]
-  mlir::Value m = rewriter.create<mlir::memref::DimOp>(loc, lhs, 0);
-  mlir::Value k = rewriter.create<mlir::memref::DimOp>(loc, lhs, 1);
-  mlir::Value n = rewriter.create<mlir::memref::DimOp>(loc, rhs, 1);
-
-  // Matmul static dimensions if they are known (can be ShapedType::kDynamicSize
-  // if not known statically).
-  int64_t m_static = lhs.getType().cast<mlir::MemRefType>().getDimSize(0);
-  int64_t k_static = lhs.getType().cast<mlir::MemRefType>().getDimSize(1);
-  int64_t n_static = rhs.getType().cast<mlir::MemRefType>().getDimSize(1);
-
-  auto one = rewriter.create<mlir::arith::ConstantOp>(
-      loc, rewriter.getIndexType(), rewriter.getIndexAttr(1));
-  auto m_is_one = rewriter.create<mlir::arith::CmpIOp>(
-      loc, mlir::arith::CmpIPredicate::eq, m, one);
-  auto n_is_one = rewriter.create<mlir::arith::CmpIOp>(
-      loc, mlir::arith::CmpIPredicate::eq, n, one);
-
-  auto m_not_one = rewriter.create<mlir::arith::CmpIOp>(
-      loc, mlir::arith::CmpIPredicate::ne, m, one);
-  auto n_not_one = rewriter.create<mlir::arith::CmpIOp>(
-      loc, mlir::arith::CmpIPredicate::ne, n, one);
-
-  // linalg.dot: n == 1 && m == 1
-  auto is_dot_product =
-      rewriter.create<mlir::arith::AndIOp>(loc, m_is_one, n_is_one);
-  // linalg.vecmat m == 1 && n != 1
-  auto is_vecmat =
-      rewriter.create<mlir::arith::AndIOp>(loc, m_is_one, n_not_one);
-  // linalg.matvec n == 1 && m != 1
-  auto is_matvec =
-      rewriter.create<mlir::arith::AndIOp>(loc, n_is_one, m_not_one);
-
-  // Build a linalg.dot operation casting inputs to vectors.
-  auto dot = [&](mlir::OpBuilder& builder, mlir::Location nestedLoc) {
-    auto lhs_vec = MemrefToVector(builder, nestedLoc, lhs, k, k_static);
-    auto rhs_vec = MemrefToVector(builder, nestedLoc, rhs, k, k_static);
-    auto out_scalar = MemrefToScalar(builder, nestedLoc, out);
-
-    builder.create<mlir::linalg::DotOp>(nestedLoc,
-                                        mlir::ValueRange({lhs_vec, rhs_vec}),
-                                        mlir::ValueRange({out_scalar}));
-    builder.create<mlir::scf::YieldOp>(nestedLoc);
-  };
-
-  // Build a linalg.vecmat operation casting lhs to vector.
-  auto vecmat = [&](mlir::OpBuilder& builder, mlir::Location nestedLoc) {
-    auto lhs_vec = MemrefToVector(builder, nestedLoc, lhs, k, k_static);
-    auto out_vec = MemrefToVector(builder, nestedLoc, out, n, n_static);
-
-    builder.create<mlir::linalg::VecmatOp>(nestedLoc,
-                                           mlir::ValueRange({lhs_vec, rhs}),
-                                           mlir::ValueRange({out_vec}));
-    builder.create<mlir::scf::YieldOp>(nestedLoc);
-  };
-
-  // Build a linalg.matvec operation casting rhs to vector.
-  auto matvec = [&](mlir::OpBuilder& builder, mlir::Location nestedLoc) {
-    auto rhs_vec = MemrefToVector(builder, nestedLoc, rhs, k, k_static);
-    auto out_vec = MemrefToVector(builder, nestedLoc, out, m, m_static);
-
-    builder.create<mlir::linalg::MatvecOp>(nestedLoc,
-                                           mlir::ValueRange({lhs, rhs_vec}),
-                                           mlir::ValueRange({out_vec}));
-    builder.create<mlir::scf::YieldOp>(nestedLoc);
-  };
-
-  // Build a generic linalg.matmul operation when it can't be matched to any of
-  // the specializations.
-  auto generic = [&](mlir::OpBuilder& builder, mlir::Location nestedLoc) {
-    llvm::SmallVector<mlir::Value> inputs = matmul.getInputOperands();
-    llvm::SmallVector<mlir::Value> outputs = matmul.getOutputOperands();
-    auto specialized =
-        builder.create<mlir::linalg::MatmulOp>(nestedLoc, inputs, outputs);
-    specialized->setAttr("__tf_cpurt_specialized", rewriter.getUnitAttr());
-    builder.create<mlir::scf::YieldOp>(nestedLoc);
-  };
-
-  // TODO(ezhulenev): Simplify to scf.switch operation.
-  // if (is_dot_product) ===>>> linalg.dot    ------------------------------- //
-  auto dispatch = rewriter.create<mlir::scf::IfOp>(
-      loc, is_dot_product, dot,
-      [&](mlir::OpBuilder& builder, mlir::Location nestedLoc) {
-        // else if (is_vecmat)  ===>>> linalg.vecmat    --------------------- //
-        rewriter.create<mlir::scf::IfOp>(
-            nestedLoc, is_vecmat, vecmat,
-            [&](mlir::OpBuilder& builder, mlir::Location nestedLoc) {
-              // else if (is_matvec)  ===>>> linalg.matvec    --------------- //
-              // else                 ===>>> linalg.matmul    --------------- //
-              rewriter.create<mlir::scf::IfOp>(nestedLoc, is_matvec, matvec,
-                                               generic);
-              builder.create<mlir::scf::YieldOp>(nestedLoc);
-            });
-        builder.create<mlir::scf::YieldOp>(nestedLoc);
-      });
-
-  rewriter.replaceOp(matmul, dispatch.results());
-  return mlir::success();
-}
-
-// -------------------------------------------------------------------------- //
-// Dispatch linalg.matmul to one of the more specialized operations at runtime.
-// -------------------------------------------------------------------------- //
-struct LinalgMatmulSpecializationPass
-    : public LinalgMatmulSpecializationBase<LinalgMatmulSpecializationPass> {
-  void runOnFunction() override {
-    mlir::FuncOp function = getFunction();
-    mlir::MLIRContext* ctx = function.getContext();
-
-    mlir::RewritePatternSet patterns(ctx);
-    patterns.insert<LinalgMatmulSpecializationPattern>(ctx);
-
-    (void)mlir::applyPatternsAndFoldGreedily(function, std::move(patterns));
-  }
-};
-
-}  // namespace
-
-std::unique_ptr<mlir::FunctionPass> CreateLinalgMatmulSpecializationPass() {
-  return std::make_unique<LinalgMatmulSpecializationPass>();
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.h b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.h
index 72b609a5b38e63..b6c4f00917b908 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.h
+++ b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.h
@@ -35,9 +35,6 @@ std::unique_ptr<mlir::FunctionPass> CreateLinalgTrivialBufferForwardingPass();
 // Pass for trivial copy removal of linalg.copy operations.
 std::unique_ptr<mlir::FunctionPass> CreateLinalgTrivialCopyRemovalPass();
 
-// Pass to tile, promote and vectorize linalg.matmul on buffers.
-std::unique_ptr<mlir::FunctionPass> CreateCodegenStrategyForMatMulPass();
-
 // Pass to optimize padding in tiled loops by peeling the final loop iteration.
 std::unique_ptr<mlir::FunctionPass> CreatePeelTiledLoopsPass();
 
@@ -62,9 +59,6 @@ std::unique_ptr<mlir::FunctionPass> CreateVectorizeTiledOpsPass();
 // Pass to tile elementwise ops on tensors.
 std::unique_ptr<mlir::FunctionPass> CreateCodegenStrategyForCWisePass();
 
-// Pass to specialize linalg.matmul to dot, matvec or vecmat.
-std::unique_ptr<mlir::FunctionPass> CreateLinalgMatmulSpecializationPass();
-
 // Pass to split _Fused Tensorflow kernels into primitives.
 std::unique_ptr<mlir::FunctionPass> CreateFissionPass();
 
diff --git a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.td b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.td
index 093bcbf955a369..bb995f13bac27b 100644
--- a/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.td
+++ b/tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.td
@@ -52,24 +52,6 @@ def LinalgTrivialCopyRemoval
   }];
 }
 
-def LinalgMatmulSpecialization
-    : FunctionPass<"tf-cpurt-linalg-matmul-specialization"> {
-  let summary = "Specialize linalg.matmul to dot, matvec or vecmat at runtime";
-  let constructor = "tensorflow::CreateLinalgMatmulSpecializationPass()";
-  let dependentDialects = [
-    "mlir::linalg::LinalgDialect",
-    "mlir::memref::MemRefDialect",
-    "mlir::scf::SCFDialect"
-  ];
-
-  let description = [{
-    Specialize linalg.matmul at runtime to:
-      1. linalg.dot    for vector x vector multiplication
-      2. linalg.matvec for matrix x vector multiplication
-      3. linalg.vecmat for vector x matrix multiplication
-  }];
-}
-
 def CodegenCWise : FunctionPass<"tf-cpurt-codegen-cwise"> {
   let summary = "Tile elementwise ops on tensors";
   let constructor = "tensorflow::CreateCodegenStrategyForCWisePass()";
@@ -86,14 +68,6 @@ def PeelTiledLoops : FunctionPass<"tf-cpurt-peel-tiled-loops"> {
   ];
 }
 
-def CodegenMatmul : FunctionPass<"tf-cpurt-codegen-matmul"> {
-  let summary = "Tile-promote-vectorize linalg.matmul on buffers";
-  let constructor = "tensorflow::CreateCodegenStrategyForMatMulPass()";
-  let dependentDialects = [
-    "mlir::linalg::LinalgDialect",
-  ];
-}
-
 def CodegenReduction : FunctionPass<"tf-cpurt-codegen-reduction"> {
   let summary = "Tile and fuse linalg.generic reduction on tensors.";
   let constructor = "tensorflow::CreateCodegenStrategyForReductionPass()";
diff --git a/tensorflow/compiler/mlir/tfrt/python_tests/regression_tests/floor_div_of_prod_and_oneslike.mlir b/tensorflow/compiler/mlir/tfrt/python_tests/regression_tests/floor_div_of_prod_and_oneslike.mlir
new file mode 100644
index 00000000000000..0a83e6282a79b7
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/python_tests/regression_tests/floor_div_of_prod_and_oneslike.mlir
@@ -0,0 +1,8 @@
+builtin.func @test(%V__0 : tensor<?x?x?xi64> { python_test_attrs.static_type = tensor<34x84x49xi64> }) -> tensor<?x?xi64> {
+  %0 = "tf.OnesLike"(%V__0) : (tensor<?x?x?xi64>) -> tensor<?x?x?xi64>
+  %dims1 = "tf.Const"() { value = dense<[2]> : tensor<1xi32> } : () -> tensor<1xi32>
+  %1 = "tf.Prod"(%0, %dims1) { keep_dims = false } : (tensor<?x?x?xi64>, tensor<1xi32>) -> tensor<?x?xi64>
+  %2 = "tf.OnesLike"(%1) : (tensor<?x?xi64>) -> tensor<?x?xi64>
+  %3 = "tf.FloorDiv"(%1, %2) : (tensor<?x?xi64>, tensor<?x?xi64>) -> tensor<?x?xi64>
+  return %3 : tensor<?x?xi64>
+}
diff --git a/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir b/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir
index a41f739d91cf24..9b8aa33afd70fd 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/fuse_tpu_compile_and_execute_ops.mlir
@@ -69,18 +69,18 @@ func private @test_fuse_dynamic_dimension_ops(%arg0: tensor<*xi32>, %arg1: tenso
   // CHECK: [[read_result:%.*]] = "tf.ReadVariableOp"(%arg1)
   // CHECK: [[shape_result_1:%.*]] = "tf.Shape"(%arg0) {device = "/CPU:0"} : (tensor<*xi32>) -> tensor<?xi64>
   // CHECK: [[shape_result_2:%.*]] = "tf.Shape"([[read_result]]) {device = "/CPU:0"} : (tensor<*xi32>) -> tensor<?xi64>
-  // CHECK: [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, [[shape_result_2]], %0, [[shape_result_1]], %arg2, %arg4, %arg3) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = dense<[4, 3]> : vector<2xi32>, operands_with_static_shape = [0 : i32, 1 : i32, 3 : i32]} : (tensor<*xi32>, tensor<?xi64>, tensor<*xi32>, tensor<?xi64>, tensor<*xi32>, tensor<*xi32>, tensor<*xi32>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
+  // CHECK: [[key:%.*]], [[exec_result:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, [[shape_result_2]], %0, %0, %arg2, %arg4, %arg3) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = dense<[4, 3]> : vector<2xi32>, operands_with_static_shape = [0 : i32, 1 : i32, 3 : i32]} : (tensor<*xi32>, tensor<?xi64>, tensor<*xi32>, tensor<*xi32>, tensor<*xi32>, tensor<*xi32>, tensor<*xi32>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
   // CHECK: [[key_1:%.*]], [[exec_result_1:%.*]] = "tf.TPUCompileMlirAndExecute"(%arg0, %2, %0, %1) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = dense<[4, 0]> : vector<2xi32>, operands_with_static_shape = []} : (tensor<*xi32>, tensor<?xi64>, tensor<*xi32>, tensor<?xi64>) -> (tensor<3x!tf_type.string>, tensor<*xi32>)
   // CHECK-NEXT: return [[exec_result]] : tensor<*xi32>
-  %arg0_dyn = "tf.SetStaticDimensionBounds" (%arg0, %arg2) :(tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32>
   %0 = "tf.ReadVariableOp"(%arg1) {device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<*xi32>
-  %1 = "tf.Shape"(%arg0) {device = "/CPU:0"} : (tensor<*xi32>) -> tensor<?xi64>
-  %dyn_1 = "tf.SetStaticDimensionBounds" (%1, %arg3) :(tensor<?xi64>, tensor<*xi32>) -> tensor<?xi64>
+  %dyn_arg0 = "tf.SetStaticDimensionBounds" (%arg0, %arg2) :(tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32>
+  %dyn_0 = "tf.SetStaticDimensionBounds" (%0, %arg3) :(tensor<*xi32>, tensor<*xi32>) -> tensor<?xi64>
+  %1 = "tf.Shape"(%dyn_arg0) {device = "/CPU:0"} : (tensor<*xi32>) -> tensor<?xi64>
   %2 = "tf.Shape"(%0) {device = "/CPU:0"} : (tensor<*xi32>) -> tensor<?xi64>
   %dyn_2 = "tf.SetStaticDimensionBounds" (%2, %arg4) :(tensor<?xi64>, tensor<*xi32>) -> tensor<?xi64>
   %compilation_status, %program = "tf._TPUCompileMlir"(%1, %2) {device = "/CPU:0", metadata = "metadata", mlir_module = "mlir_module"} : (tensor<?xi64>, tensor<?xi64>) -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
   "tf.TPUCompileSucceededAssert"(%compilation_status) {device = "/CPU:0"} : (tensor<!tf_type.string>) -> ()
-  %3 = "tf.TPUExecute"(%arg0_dyn, %dyn_2, %0, %dyn_1, %program) {device = "/TPU:0"} : (tensor<*xi32>, tensor<?xi64>, tensor<*xi32>, tensor<?xi64>, tensor<3x!tf_type.string>) -> tensor<*xi32>
+  %3 = "tf.TPUExecute"(%dyn_arg0, %dyn_2, %0, %dyn_0, %program) {device = "/TPU:0"} : (tensor<*xi32>, tensor<?xi64>, tensor<*xi32>, tensor<?xi64>, tensor<3x!tf_type.string>) -> tensor<*xi32>
   %compilation_status_2, %program_2 = "tf._TPUCompileMlir"(%1, %2) {device = "/CPU:0", metadata = "metadata", mlir_module = "mlir_module"} : (tensor<?xi64>, tensor<?xi64>) -> (tensor<!tf_type.string>, tensor<3x!tf_type.string>)
   "tf.TPUCompileSucceededAssert"(%compilation_status) {device = "/CPU:0"} : (tensor<!tf_type.string>) -> ()
   %4 = "tf.TPUExecute"(%arg0, %2, %0, %1, %program_2) {device = "/TPU:0"} : (tensor<*xi32>, tensor<?xi64>, tensor<*xi32>, tensor<?xi64>, tensor<3x!tf_type.string>) -> tensor<*xi32>
diff --git a/tensorflow/compiler/mlir/tfrt/tests/jit/matmul_specialization.mlir b/tensorflow/compiler/mlir/tfrt/tests/jit/matmul_specialization.mlir
deleted file mode 100644
index 48835bb10963c5..00000000000000
--- a/tensorflow/compiler/mlir/tfrt/tests/jit/matmul_specialization.mlir
+++ /dev/null
@@ -1,90 +0,0 @@
-// RUN: tf-tfrt-opt %s -tf-cpurt-linalg-matmul-specialization | FileCheck %s
-
-// CHECK-LABEL: @matmul_dynamic
-func @matmul_dynamic(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>,
-                     %arg2: memref<?x?xf32>) {
-  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-
-  // CHECK: %[[M:.*]] = memref.dim %arg0, %[[C0]]
-  // CHECK: %[[K:.*]] = memref.dim %arg0, %[[C1]]
-  // CHECK: %[[N:.*]] = memref.dim %arg1, %[[C1]]
-
-  // CHECK: %[[M_ONE:.*]] = arith.cmpi eq, %[[M]], %[[C1]]
-  // CHECK: %[[N_ONE:.*]] = arith.cmpi eq, %[[N]], %[[C1]]
-  // CHECK: %[[M_NOT_ONE:.*]] = arith.cmpi ne, %[[M]], %[[C1]]
-  // CHECK: %[[N_NOT_ONE:.*]] = arith.cmpi ne, %[[N]], %[[C1]]
-
-  // CHECK: %[[IS_DOT:.*]] = arith.andi %[[M_ONE]], %[[N_ONE]]
-  // CHECK: %[[IS_VECMAT:.*]] = arith.andi %[[M_ONE]], %[[N_NOT_ONE]]
-  // CHECK: %[[IS_MATVEC:.*]] = arith.andi %[[N_ONE]], %[[M_NOT_ONE]]
-
-  // CHECK: scf.if %[[IS_DOT]] {
-  // CHECK: memref.reinterpret_cast %arg0 {{.*}} to memref<?xf32>
-  // CHECK: memref.reinterpret_cast %arg1 {{.*}} to memref<?xf32>
-  // CHECK: memref.reinterpret_cast %arg2 {{.*}} to memref<f32>
-  // CHECK: linalg.dot
-  // CHECK: } else
-
-  // CHECK: scf.if %[[IS_VECMAT]] {
-  // CHECK: memref.reinterpret_cast %arg0 {{.*}} to memref<?xf32>
-  // CHECK: memref.reinterpret_cast %arg2 {{.*}} to memref<?xf32>
-  // CHECK: linalg.vecmat
-  // CHECK: } else
-
-  // CHECK: scf.if %[[IS_MATVEC]] {
-  // CHECK: memref.reinterpret_cast %arg1 {{.*}} to memref<?xf32>
-  // CHECK: memref.reinterpret_cast %arg2 {{.*}} to memref<?xf32>
-  // CHECK: linalg.matvec
-  // CHECK: } else
-
-  // CHECK: linalg.matmul {__tf_cpurt_specialized}
-  // CHECK: }
-  linalg.matmul ins(%arg0, %arg1: memref<?x?xf32>, memref<?x?xf32>)
-                outs(%arg2: memref<?x?xf32>)
-  return
-}
-
-// CHECK-LABEL: @matmul_static_k
-func @matmul_static_k(%arg0: memref<?x4xf32>, %arg1: memref<4x?xf32>,
-                      %arg2: memref<?x?xf32>) {
-  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
-
-  // CHECK: %[[M:.*]] = memref.dim %arg0, %[[C0]]
-  // CHECK: %[[N:.*]] = memref.dim %arg1, %[[C1]]
-
-  // CHECK: %[[M_ONE:.*]] = arith.cmpi eq, %[[M]], %[[C1]]
-  // CHECK: %[[N_ONE:.*]] = arith.cmpi eq, %[[N]], %[[C1]]
-  // CHECK: %[[M_NOT_ONE:.*]] = arith.cmpi ne, %[[M]], %[[C1]]
-  // CHECK: %[[N_NOT_ONE:.*]] = arith.cmpi ne, %[[N]], %[[C1]]
-
-  // CHECK: %[[IS_DOT:.*]] = arith.andi %[[M_ONE]], %[[N_ONE]]
-  // CHECK: %[[IS_VECMAT:.*]] = arith.andi %[[M_ONE]], %[[N_NOT_ONE]]
-  // CHECK: %[[IS_MATVEC:.*]] = arith.andi %[[N_ONE]], %[[M_NOT_ONE]]
-
-  // CHECK: scf.if %[[IS_DOT]] {
-  // CHECK: memref.reinterpret_cast %arg0 {{.*}} to memref<4xf32>
-  // CHECK: memref.reinterpret_cast %arg1 {{.*}} to memref<4xf32>
-  // CHECK: memref.reinterpret_cast %arg2 {{.*}} to memref<f32>
-  // CHECK: linalg.dot
-  // CHECK: } else
-
-  // CHECK: scf.if %[[IS_VECMAT]] {
-  // CHECK: memref.reinterpret_cast %arg0 {{.*}} to memref<4xf32>
-  // CHECK: memref.reinterpret_cast %arg2 {{.*}} to memref<?xf32>
-  // CHECK: linalg.vecmat
-  // CHECK: } else
-
-  // CHECK: scf.if %[[IS_MATVEC]] {
-  // CHECK: memref.reinterpret_cast %arg1 {{.*}} to memref<4xf32>
-  // CHECK: memref.reinterpret_cast %arg2 {{.*}} to memref<?xf32>
-  // CHECK: linalg.matvec
-  // CHECK: } else
-
-  // CHECK: linalg.matmul {__tf_cpurt_specialized}
-  // CHECK: }
-  linalg.matmul ins(%arg0, %arg1: memref<?x4xf32>, memref<4x?xf32>)
-                outs(%arg2: memref<?x?xf32>)
-  return
-}
diff --git a/tensorflow/compiler/mlir/tfrt/tests/jit/reduction_codegen.mlir b/tensorflow/compiler/mlir/tfrt/tests/jit/reduction_codegen.mlir
index fb5d67f50a6661..e256c790d2d359 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/jit/reduction_codegen.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/jit/reduction_codegen.mlir
@@ -1,61 +1,69 @@
 // RUN: tf-tfrt-opt -tf-cpurt-codegen-reduction %s --split-input-file |\
 // RUN: FileCheck %s
 
-func @reduce_row_sum_2d(%input: tensor<?x?xf32>) -> tensor<?xf32> {
+func @reduce_row_sum_2d(%lhs: tensor<?x?xf32>,
+                        %rhs: tensor<?x?xf32>) -> tensor<?xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
-  %0 = tensor.dim %input, %c0 : tensor<?x?xf32>
+  %0 = tensor.dim %lhs, %c0 : tensor<?x?xf32>
 
   %init = linalg.init_tensor [%0] : tensor<?xf32>
   %fill = linalg.fill(%cst, %init) : f32, tensor<?xf32> -> tensor<?xf32>
-  %sum = linalg.generic {
+  %sum_of_prod = linalg.generic {
     indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>,
                      affine_map<(d0, d1) -> (d0)>],
     iterator_types = ["parallel", "reduction"]}
-    ins(%input : tensor<?x?xf32>)
+    ins(%lhs, %rhs : tensor<?x?xf32>, tensor<?x?xf32>)
     outs(%fill : tensor<?xf32>) {
-  ^bb0(%in: f32, %out: f32):
-    %add = arith.addf %in, %out : f32
+  ^bb0(%l: f32, %r: f32, %o: f32):
+    %prod = arith.mulf %l, %r : f32
+    %add = arith.addf %prod, %o : f32
     linalg.yield %add : f32
   } -> tensor<?xf32>
-  return %sum : tensor<?xf32>
+  return %sum_of_prod : tensor<?xf32>
 }
 // CHECK-LABEL: func @reduce_row_sum_2d(
-// CHECK-SAME:    %[[INPUT:.*]]: tensor<?x?xf32>) -> tensor<?xf32>
+// CHECK-SAME:    %[[LHS:.*]]: tensor<?x?xf32>,
+// CHECK-SAME:    %[[RHS:.*]]: tensor<?x?xf32>) -> tensor<?xf32>
 
 // CHECK-DAG:  %[[C0_F32:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG:  %[[C0:.*]] = arith.constant 0 : index
 // CHECK-DAG:  %[[C4:.*]] = arith.constant 4 : index
 // CHECK-DAG:  %[[C1:.*]] = arith.constant 1 : index
 
-// CHECK:      %[[DIM_0:.*]] = tensor.dim %[[INPUT]], %[[C0]] : [[TY_2D:.*]]
+// CHECK:      %[[DIM_0:.*]] = tensor.dim %[[LHS]], %[[C0]] : [[TY_2D:.*]]
 // CHECK:      %[[INIT:.*]] = linalg.init_tensor [%[[DIM_0]]] : [[TY_1D:.*]]
 // CHECK:      %[[CLONE:.*]] = linalg.init_tensor [%[[DIM_0]]] : [[TY_1D:.*]]
 // CHECK:      %[[FILL:.*]] = linalg.fill(%[[C0_F32]], %[[INIT]])
-// CHECK:      %[[DIM_0_:.*]] = tensor.dim %[[INPUT]], %[[C0]] : [[TY_2D]]
-// CHECK:      %[[DIM_1:.*]] = tensor.dim %[[INPUT]], %[[C1]] : [[TY_2D]]
+// CHECK:      %[[DIM_0_:.*]] = tensor.dim %[[LHS]], %[[C0]] : [[TY_2D]]
+// CHECK:      %[[DIM_1:.*]] = tensor.dim %[[LHS]], %[[C1]] : [[TY_2D]]
 
 // CHECK:      linalg.tiled_loop (%[[I:.*]], %[[J:.*]]) = (%[[C0]], %[[C0]])
 // CHECK-SAME:   to (%[[DIM_0_]], %[[DIM_1]]) step (%[[C4]], %[[C4]])
-// CHECK-SAME:   ins (%[[IN_:.*]] = %[[INPUT]]: [[TY_2D]])
+// CHECK-SAME:   ins (%[[LHS_:.*]] = %[[LHS]]: [[TY_2D]],
+// CHECK-SAME:        %[[RHS_:.*]] = %[[RHS]]: [[TY_2D]])
 // CHECK-SAME:   outs (%[[OUT_:.*]] = %[[FILL]]: [[TY_1D]],
 // CHECK-SAME:         %[[CLONE_:.*]] = %[[CLONE]]: [[TY_1D]])
 
-// CHECK:      %[[IN_SUB:.*]] = tensor.extract_slice %[[IN_]][%[[I]], %[[J]]]
+// CHECK:      %[[LHS_SUB:.*]] = tensor.extract_slice %[[LHS_]][%[[I]], %[[J]]]
+// CHECK:      %[[RHS_SUB:.*]] = tensor.extract_slice %[[RHS_]][%[[I]], %[[J]]]
 // CHECK:      %[[OUT_SUB:.*]] = tensor.extract_slice %[[OUT_]][%[[I]]]
 // CHECK:      %[[CLONE_SUB:.*]] = tensor.extract_slice %[[CLONE_]][%[[I]]]
 
 // CHECK:      %[[FILL_SUB:.*]] = linalg.fill(%[[C0_F32]], %[[CLONE_SUB]])
 
-// CHECK:      %[[SUM_SUB:.*]] = linalg.generic
-// CHECK-SAME:   ins(%[[IN_SUB]] : [[TY_2D]])
+// CHECK:      %[[SUM_OF_PROD_SUB:.*]] = linalg.generic
+// CHECK-SAME:   ins(%[[LHS_SUB]], %[[RHS_SUB]] : [[TY_2D]], [[TY_2D]])
 // CHECK-SAME:   outs(%[[FILL_SUB]] : [[TY_1D]])
+// CHECK:          mulf
 // CHECK:          addf
 // CHECK-NEXT:     linalg.yield
 
 // CHECK:      %[[ACC:.*]] = linalg.generic
-// CHECK-SAME:   ins(%[[SUM_SUB]] : [[TY_1D]])
+// CHECK-SAME:   ins(%[[SUM_OF_PROD_SUB]] : [[TY_1D]])
 // CHECK-SAME:   outs(%[[OUT_SUB]] : [[TY_1D]]) {
+// CHECK-NOT:      mulf
 // CHECK:          addf
 // CHECK-NEXT:     linalg.yield
 
@@ -181,57 +189,70 @@ func @abs(%input: tensor<?x?xf32>) -> tensor<?x?xf32> {
 
 // -----
 
-func @reduce_sum_1d(%input: tensor<?xf32>) -> tensor<f32> {
+func @reduce_sum_1d(%lhs: tensor<?xf32>, %rhs: tensor<?xf32>) -> tensor<f32> {
   %cst = arith.constant 0.000000e+00 : f32
   %c0 = arith.constant 0 : index
-  %0 = tensor.dim %input, %c0 : tensor<?xf32>
+  %0 = tensor.dim %lhs, %c0 : tensor<?xf32>
 
   %init = linalg.init_tensor [] : tensor<f32>
   %fill = linalg.fill(%cst, %init) : f32, tensor<f32> -> tensor<f32>
   %sum = linalg.generic {
     indexing_maps = [affine_map<(d0) -> (d0)>,
+                     affine_map<(d0) -> (d0)>,
                      affine_map<(d0) -> ()>],
     iterator_types = ["reduction"]}
-    ins(%input : tensor<?xf32>)
+    ins(%lhs, %rhs : tensor<?xf32>, tensor<?xf32>)
     outs(%fill : tensor<f32>) {
-  ^bb0(%in: f32, %out: f32):
-    %add = arith.addf %in, %out : f32
+  ^bb0(%l: f32, %r: f32, %out: f32):
+    %prod = arith.mulf %l, %r : f32
+    %add = arith.addf %prod, %out : f32
     linalg.yield %add : f32
   } -> tensor<f32>
   return %sum : tensor<f32>
 }
 
 // CHECK-LABEL: func @reduce_sum_1d(
-// CHECK-SAME:    %[[INPUT:.*]]: tensor<?xf32>) -> tensor<f32> {
+// CHECK-SAME:    %[[LHS:.*]]: tensor<?xf32>, %[[RHS:.*]]: tensor<?xf32>)
      // CHECK: %[[C0_F32:.*]] = arith.constant 0.000000e+00 : f32
      // CHECK: %[[C0:.*]] = arith.constant 0 : index
      // CHECK: %[[C8:.*]] = arith.constant 8 : index
 
      // CHECK: %[[INIT:.*]] = linalg.init_tensor [] : tensor<f32>
      // CHECK: %[[FILL:.*]] = linalg.fill(%[[C0_F32]], %[[INIT]])
-     // CHECK: %[[INPUT_SIZE:.*]] = tensor.dim %[[INPUT]], %[[C0]]
+     // CHECK: %[[INPUT_SIZE:.*]] = tensor.dim %[[LHS]], %[[C0]]
 
      // CHECK: %[[TMP_INIT:.*]] = linalg.init_tensor [8] : tensor<8xf32>
      // CHECK: %[[TMP_FILL:.*]] = linalg.fill(%[[C0_F32]], %[[TMP_INIT]])
      // CHECK: %[[TMP_SUM:.*]] = linalg.tiled_loop (%[[I:.*]]) = (%[[C0]])
 // CHECK-SAME:   to (%[[INPUT_SIZE]]) step (%[[C8]])
-// CHECK-SAME:   ins (%[[INPUT_:.*]] = %[[INPUT]]: tensor<?xf32>)
+// CHECK-SAME:   ins (%[[LHS_:.*]] = %[[LHS]]: tensor<?xf32>,
+// CHECK-SAME:        %[[RHS_:.*]] = %[[RHS]]: tensor<?xf32>)
 // CHECK-SAME:   outs (%[[TMP_INIT_:.*]] = %[[TMP_FILL]]: tensor<8xf32>)
 
-     // CHECK: %[[IN_SUB:.*]] = tensor.extract_slice %[[INPUT_]][%[[I]]]
-     // CHECK: %[[PAD:.*]] = linalg.pad_tensor %[[IN_SUB]]
-     // CHECK: %[[RESHAPE:.*]] = linalg.tensor_expand_shape %[[PAD]]
+     // CHECK: %[[LHS_SUB:.*]] = tensor.extract_slice %[[LHS_]][%[[I]]]
+     // CHECK: %[[LHS_PAD:.*]] = linalg.pad_tensor %[[LHS_SUB]]
+     // CHECK: %[[LHS_RESHAPE:.*]] = linalg.tensor_expand_shape %[[LHS_PAD]]
+// CHECK-SAME:   {{\[\[}}0, 1]]
+// CHECK-SAME:   : tensor<8xf32> into tensor<1x8xf32>
+
+     // CHECK: %[[RHS_SUB:.*]] = tensor.extract_slice %[[RHS_]][%[[I]]]
+     // CHECK: %[[RHS_PAD:.*]] = linalg.pad_tensor %[[RHS_SUB]]
+     // CHECK: %[[RHS_RESHAPE:.*]] = linalg.tensor_expand_shape %[[RHS_PAD]]
 // CHECK-SAME:   {{\[\[}}0, 1]]
 // CHECK-SAME:   : tensor<8xf32> into tensor<1x8xf32>
 
-     // CHECK: %[[SUM:.*]] = linalg.generic
-// CHECK-SAME:   ins(%[[RESHAPE]] : tensor<1x8xf32>)
+     // CHECK: %[[SUM_OF_PROD:.*]] = linalg.generic
+// CHECK-SAME:   ins(%[[LHS_RESHAPE]], %[[RHS_RESHAPE]]
+// CHECK-SAME:       tensor<1x8xf32>, tensor<1x8xf32>)
 // CHECK-SAME:   outs(%[[TMP_INIT_]] : tensor<8xf32>) {
-     // CHECK:   ^bb0(%[[A:.*]]: f32, %[[B:.*]]: f32):
-     // CHECK:     %[[ADD:.*]] = arith.addf %[[A]], %[[B]] : f32
+     // CHECK:   ^bb0(%[[L:.*]]: f32, %[[R:.*]]: f32, %[[O:.*]]: f32):
+     // CHECK:     %[[MUL:.*]] = arith.mulf %[[L]], %[[R]] : f32
+     // CHECK:     %[[ADD:.*]] = arith.addf %[[MUL]], %[[O]] : f32
      // CHECK:       linalg.yield %[[ADD]] : f32
      // CHECK:     } -> tensor<8xf32>
-     // CHECK:   linalg.yield %[[SUM]] : tensor<8xf32>
+     // CHECK:   linalg.yield %[[SUM_OF_PROD]] : tensor<8xf32>
      // CHECK: }
      // CHECK: linalg.generic
 // CHECK-SAME: ins(%[[TMP_SUM]] : tensor<8xf32>) outs(%[[FILL]] : tensor<f32>)
+//  CHECK-NOT:  mulf
+//      CHECK:  addf
diff --git a/tensorflow/compiler/mlir/tfrt/tests/jit/tf_cpurt_pipeline.mlir b/tensorflow/compiler/mlir/tfrt/tests/jit/tf_cpurt_pipeline.mlir
index 5a68e2bbfcb218..598386d42d6147 100644
--- a/tensorflow/compiler/mlir/tfrt/tests/jit/tf_cpurt_pipeline.mlir
+++ b/tensorflow/compiler/mlir/tfrt/tests/jit/tf_cpurt_pipeline.mlir
@@ -251,27 +251,6 @@ func @tf_binary_with_bcast_symbolic_shapes(
 
 // -----
 
-// CHECK-LABEL: @tf_lower_matmul
-// CHECK-SAME: %[[ARG0:.*]]: memref<?x?xf32>,
-// CHECK-SAME: %[[ARG1:.*]]: memref<?x?xf32>
-func @tf_lower_matmul(%arg0: tensor<?x?xf32>,
-                      %arg1: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  // CHECK-NOT: linalg.copy
-  // CHECK: %[[DIM_M:.*]] = memref.dim %[[ARG0]], %c0 : memref<?x?xf32>
-  // CHECK: %[[DIM_N:.*]] = memref.dim %[[ARG1]], %c1 : memref<?x?xf32>
-  // CHECK-NOT: linalg.copy
-  // Tiling for register reuse.
-  // CHECK: scf.for %[[M:.*]] = %c0 to %[[DIM_M]] step %c[[MR:[0-9]+]]
-  // CHECK: scf.for %[[N:.*]] = %c0 to %[[DIM_N]] step %c[[NR:[0-9]+]]
-  // Unrolled tile matmul in vector dialect goes here. It is too large to match.
-  // CHECK: scf.yield %[[TILE:.*]] : vector<[[MR]]x[[NR]]xf32>
-  %0 = "tf.MatMul"(%arg0, %arg1) { transpose_a = false, transpose_b = false}
-       : (tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32>
-}
-
-// -----
-
 // CHECK-LABEL: @cast_sub
 func @cast_sub(%arg0: tensor<?x32xi16>, %arg1: tensor<?x?x32xf16>)
     -> tensor<?x?x32xf16> {
diff --git a/tensorflow/compiler/mlir/tfrt/tests/jit/tf_cpurt_stream_analysis.mlir b/tensorflow/compiler/mlir/tfrt/tests/jit/tf_cpurt_stream_analysis.mlir
new file mode 100644
index 00000000000000..a6a325726f574a
--- /dev/null
+++ b/tensorflow/compiler/mlir/tfrt/tests/jit/tf_cpurt_stream_analysis.mlir
@@ -0,0 +1,68 @@
+// RUN: tf-tfrt-opt -tfrt-print-stream -verify-diagnostics %s
+
+module @rsqrt_m attributes { tfrt.compiled } {
+  func @compute(%arg0: tensor<512xf32>) -> tensor<512xf32> {
+    %0 = "tf.Rsqrt"(%arg0): (tensor<512xf32>) -> tensor<512xf32>
+    return %0 : tensor<512xf32>
+  }
+}
+
+module @add_m attributes { tfrt.compiled } {
+  func @compute(%arg0: tensor<512x512xf32>) -> tensor<512x512xf32> {
+    %0 = "tf.Rsqrt"(%arg0): (tensor<512x512xf32>) -> tensor<512x512xf32>
+    return %0 : tensor<512x512xf32>
+  }
+}
+
+module @fusion_m attributes { tfrt.compiled } {
+  func @compute(%arg0: tensor<?x512xf32>) -> tensor<?x512xf32> {
+    %0 = "tf.Rsqrt"(%arg0): (tensor<?x512xf32>) -> tensor<?x512xf32>
+    %1 = "tf.Rsqrt"(%0): (tensor<?x512xf32>) -> tensor<?x512xf32>
+    %2 = "tf.Rsqrt"(%1): (tensor<?x512xf32>) -> tensor<?x512xf32>
+    %3 = "tf.Rsqrt"(%2): (tensor<?x512xf32>) -> tensor<?x512xf32>
+    %4 = "tf.Rsqrt"(%3): (tensor<?x512xf32>) -> tensor<?x512xf32>
+    return %4 : tensor<?x512xf32>
+  }
+}
+
+// expected-remark@+1 {{stream id: 0, stream cost: 514, parent stream: -1}}
+func @rsqrt(%arg0: !tfrt_fallback.tf_tensor) -> !tfrt_fallback.tf_tensor {
+  // stream 0 cost = 1 (root) + 1 (%arg0) + 512 * log2(2) (cost @rsqrt_m)
+  //               = 514
+  // expected-remark@+1 {{stream id: 0, stream cost: 514, parent stream: -1}}
+  %res = tf_cpurt.fallback.execute @rsqrt_m::@compute (%arg0)
+           device("/device:CPU:0")
+           :  (!tfrt_fallback.tf_tensor)
+           -> (!tfrt_fallback.tf_tensor)
+
+  // expected-remark@+1 {{stream id: 0, stream cost: 514, parent stream: -1}}
+  tfrt.return %res : !tfrt_fallback.tf_tensor
+}
+
+// expected-remark@+1 {{stream id: 0, stream cost: 262146, parent stream: -1}}
+func @add(%arg0: !tfrt_fallback.tf_tensor) -> !tfrt_fallback.tf_tensor {
+  // stream 0 cost = 1 (root) + 1 (%arg0) + 512 * 512 * log2(2) (cost @add_m)
+  //               = 262146
+  // expected-remark@+1 {{stream id: 0, stream cost: 262146, parent stream: -1}}
+  %res = tf_cpurt.fallback.execute @add_m::@compute (%arg0, %arg0)
+           device("/device:CPU:0")
+           :  (!tfrt_fallback.tf_tensor, !tfrt_fallback.tf_tensor)
+           -> (!tfrt_fallback.tf_tensor)
+
+  // expected-remark@+1 {{stream id: 0, stream cost: 262146, parent stream: -1}}
+  tfrt.return %res : !tfrt_fallback.tf_tensor
+}
+
+// expected-remark@+1 {{stream id: 0, stream cost: 1325, parent stream: -1}}
+func @fusion(%arg0: !tfrt_fallback.tf_tensor) -> !tfrt_fallback.tf_tensor {
+  // stream 0 cost = 1 (root) + 1 (%arg0) + 512 * log2(6) (cost @fusion_m)
+  //               = 2 + 512 * 2.58 = 1325
+  // expected-remark@+1 {{stream id: 0, stream cost: 1325, parent stream: -1}}
+  %res = tf_cpurt.fallback.execute @fusion_m::@compute (%arg0)
+           device("/device:CPU:0")
+           :  (!tfrt_fallback.tf_tensor)
+           -> (!tfrt_fallback.tf_tensor)
+
+  // expected-remark@+1 {{stream id: 0, stream cost: 1325, parent stream: -1}}
+  tfrt.return %res : !tfrt_fallback.tf_tensor
+}
diff --git a/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc b/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc
index 261082d9db30e0..def574cd13b022 100644
--- a/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc
+++ b/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
 #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h"
 #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h"
+#include "tensorflow/compiler/mlir/tfrt/jit/opdefs/tf_cpurt_ops.h"
 #include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_passes.h"
 #include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_cpurt_test_passes.h"
 #include "tensorflow/compiler/mlir/tfrt/transforms/passes.h"
@@ -45,6 +46,7 @@ int main(int argc, char **argv) {
   registry.insert<mlir::shape::ShapeDialect>();
   registry.insert<mlir::mhlo::MhloDialect>();
   registry.insert<mlir::TFL::TensorFlowLiteDialect>();
+  registry.insert<mlir::tf_cpurt::CpuRuntimeDialect>();
   registry.insert<tfrt::fallback::FallbackDialect>();
   registry.insert<tfrt::fallback_async::FallbackAsyncDialect>();
   tensorflow::RegisterTPUDialects(&registry);
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc b/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc
index e1a17ba6b512d8..d6d59251afed9f 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/fuse_tpu_compile_and_execute_ops.cc
@@ -96,24 +96,18 @@ class FuseTpuCompileAndExecutePass
       llvm::SmallVector<mlir::Value> exec_op_args;
       exec_op_args.resize(exec_op.args().size());
 
-      const auto &static_shaped_operands =
+      auto &static_shaped_operands =
           exec_to_static_shaped_operands_map[exec_op];
       for (int i = 0; i < exec_op.args().size(); ++i) {
         auto iter = static_shaped_operands.find(i);
         if (iter != static_shaped_operands.end()) {
           static_shaped_operand_indices_attr.push_back(iter->first);
-          static_shape_tensors.push_back(iter->second->getOperand(1));
-          exec_op_args[i] = iter->second->getOperand(0);
-          // There should be only one user of this op.
-          if (!iter->second->hasOneUse()) {
-            iter->second->emitOpError(
-                "there should be only one user of the "
-                "tf.SetStaticDimensionBounds op");
-            signalPassFailure();
-            return;
-          }
-          iter->second->dropAllDefinedValueUses();
-          iter->second->dropAllReferences();
+          static_shape_tensors.push_back(iter->second.static_shape());
+          exec_op_args[i] = iter->second.input();
+          // The first operand is the input tensor, while the second operand is
+          // the static shape tensor, hence the drop_back here.
+          iter->second->replaceAllUsesWith(
+              mlir::ValueRange({iter->second.input()}));
           iter->second->erase();
         } else {
           exec_op_args[i] = exec_op->getOperand(i);
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/convolution_pattern.cc b/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/convolution_pattern.cc
index a07c011503792a..3d1819a733d5c1 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/convolution_pattern.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/convolution_pattern.cc
@@ -184,7 +184,7 @@ Value CreateBuildFusedConvOp(Value input, Value output, Value bias,
                              const xla::gpu::GpuConvConfig& config,
                              cudnnBackendDescriptorType_t backend_type,
                              ConversionPatternRewriter& rewriter) {
-  se::dnn::BatchDescriptor bias_descriptor;
+  se::dnn::BatchDescriptor bias_descriptor(config.output_descriptor.ndims());
   bias_descriptor.set_count(1)
       .set_height(1)
       .set_width(1)
@@ -210,6 +210,9 @@ Value CreateBuildFusedConvOp(Value input, Value output, Value bias,
             return layout;
         }
       }());
+  if (bias_descriptor.ndims() == 3) {
+    bias_descriptor.set_spatial_dim(se::dnn::DimIndex::Z, 1);
+  }
 
   auto get_element_type = [](Value value) {
     return value.getType().cast<mlir::MemRefType>().getElementType();
@@ -312,10 +315,10 @@ Value CreateBuildConvOp(lmhlo_gpu::ConvForwardOp op, Value handle,
 }
 Value CreateRunConvolutionOp(lmhlo_gpu::ConvForwardOpAdaptor adaptor,
                              mlir::Location loc, Value handle, Value conv_plan,
-                             ConversionPatternRewriter& rewriter) {
+                             Value chain, ConversionPatternRewriter& rewriter) {
   return rewriter.create<tfrt::gpu::DnnRunConvolutionOp>(
       loc, handle, conv_plan, adaptor.input(), adaptor.output(),
-      adaptor.filter(), adaptor.scratch());
+      adaptor.filter(), adaptor.scratch(), chain);
 }
 
 // Specialization for convolution backward input
@@ -334,10 +337,10 @@ Value CreateBuildConvOp(lmhlo_gpu::ConvBackwardInputOp op, Value handle,
 }
 Value CreateRunConvolutionOp(lmhlo_gpu::ConvBackwardInputOpAdaptor adaptor,
                              mlir::Location loc, Value handle, Value conv_plan,
-                             ConversionPatternRewriter& rewriter) {
+                             Value chain, ConversionPatternRewriter& rewriter) {
   return rewriter.create<tfrt::gpu::DnnRunConvolutionOp>(
       loc, handle, conv_plan, adaptor.d_input(), adaptor.d_output(),
-      adaptor.filter(), adaptor.scratch());
+      adaptor.filter(), adaptor.scratch(), chain);
 }
 
 // Specialization for convolution backward filter
@@ -356,10 +359,10 @@ Value CreateBuildConvOp(lmhlo_gpu::ConvBackwardFilterOp op, Value handle,
 }
 Value CreateRunConvolutionOp(lmhlo_gpu::ConvBackwardFilterOpAdaptor adaptor,
                              mlir::Location loc, Value handle, Value conv_plan,
-                             ConversionPatternRewriter& rewriter) {
+                             Value chain, ConversionPatternRewriter& rewriter) {
   return rewriter.create<tfrt::gpu::DnnRunConvolutionOp>(
       loc, handle, conv_plan, adaptor.input(), adaptor.d_output(),
-      adaptor.d_filter(), adaptor.scratch());
+      adaptor.d_filter(), adaptor.scratch(), chain);
 }
 
 // Specialization for convolution forward fused
@@ -386,10 +389,11 @@ Value CreateBuildConvOp(lmhlo_gpu::ConvForwardFusedOp op, Value handle,
 }
 Value CreateRunConvolutionOp(lmhlo_gpu::ConvForwardFusedOpAdaptor adaptor,
                              mlir::Location loc, Value handle, Value conv_plan,
-                             ConversionPatternRewriter& rewriter) {
+                             Value chain, ConversionPatternRewriter& rewriter) {
   return rewriter.create<tfrt::gpu::DnnRunFusedConvolutionOp>(
       loc, handle, conv_plan, adaptor.input(), adaptor.output(),
-      adaptor.filter(), adaptor.output(), adaptor.bias(), adaptor.scratch());
+      adaptor.filter(), adaptor.output(), adaptor.bias(), adaptor.scratch(),
+      chain);
 }
 
 // Specialization for convolution forward fused side input
@@ -420,11 +424,12 @@ Value CreateBuildConvOp(lmhlo_gpu::ConvForwardFusedSideInputOp op, Value handle,
 }
 Value CreateRunConvolutionOp(
     lmhlo_gpu::ConvForwardFusedSideInputOpAdaptor adaptor, mlir::Location loc,
-    Value handle, Value conv_plan, ConversionPatternRewriter& rewriter) {
+    Value handle, Value conv_plan, Value chain,
+    ConversionPatternRewriter& rewriter) {
   return rewriter.create<tfrt::gpu::DnnRunFusedConvolutionOp>(
       loc, handle, conv_plan, adaptor.input(), adaptor.output(),
-      adaptor.filter(), adaptor.side_input(), adaptor.bias(),
-      adaptor.scratch());
+      adaptor.filter(), adaptor.side_input(), adaptor.bias(), adaptor.scratch(),
+      chain);
 }
 
 template <class ConvolutionOpType>
@@ -505,8 +510,8 @@ struct ConvolutionRewritePattern
         op.getLoc(), conv_plan_func.getType().getResults(), handle,
         conv_plan_func.getName());
 
-    Value out_chain = CreateRunConvolutionOp(adaptor, op.getLoc(), handle,
-                                             once_op.getResult(0), rewriter);
+    Value out_chain = CreateRunConvolutionOp(
+        adaptor, op.getLoc(), handle, once_op.getResult(0), chain, rewriter);
     rewriter.eraseOp(op);
     return out_chain;
   }
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/gemm_pattern.cc b/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/gemm_pattern.cc
index fc87c81eae4aa9..da137088676d46 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/gemm_pattern.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/gemm_pattern.cc
@@ -66,6 +66,18 @@ Value GetBias(lmhlo_gpu::GEMMOpAdaptor op) { return nullptr; }
 FloatAttr GetBeta(lmhlo_gpu::GEMM_BiasOp op) { return op.betaAttr(); }
 Value GetBias(lmhlo_gpu::GEMM_BiasOpAdaptor op) { return op.bias(); }
 
+// Match GEMM auto-tuning, see ComputationTypeFromPrimitive()
+Type MlirComputationType(Type element_type,
+                         ConversionPatternRewriter& rewriter) {
+  if (element_type.isF16()) {
+    return rewriter.getF32Type();
+  } else if (auto complex_type = element_type.dyn_cast<mlir::ComplexType>()) {
+    return complex_type.getElementType();
+  } else {
+    return element_type;
+  }
+}
+
 // Create all the Ops necessary for the GEMM operation, including the GEMM
 // operation itself.
 template <class GemmOp>
@@ -84,10 +96,7 @@ Value CreateTfrtOps(GemmOp op, typename GemmOp::Adaptor adaptor, Value chain,
 
   auto k_val = lhs_matrix.transpose ? lhs_matrix.num_rows : lhs_matrix.num_cols;
 
-  // Use mixed precision for fp16 to match GEMM auto-tuning, see
-  // ComputationTypeFromPrimitive().
-  Type mlir_compute_type =
-      element_type.isF16() ? rewriter.getF32Type() : element_type;
+  const Type mlir_compute_type = MlirComputationType(element_type, rewriter);
 
   auto m = rewriter.create<tfrt::compiler::ConstantI32Op>(
       loc, output_matrix.num_rows);
@@ -95,7 +104,12 @@ Value CreateTfrtOps(GemmOp op, typename GemmOp::Adaptor adaptor, Value chain,
       loc, output_matrix.num_cols);
   auto k = rewriter.create<tfrt::compiler::ConstantI32Op>(loc, k_val);
 
-  auto const_alpha = MakeScalingFactorConstant(rewriter, loc, mlir_compute_type,
+  // Scale type must match compute type, except for complex types, where
+  // it must match the element type
+  const Type mlir_scale_type =
+      element_type.isa<mlir::ComplexType>() ? element_type : mlir_compute_type;
+
+  auto const_alpha = MakeScalingFactorConstant(rewriter, loc, mlir_scale_type,
                                                alpha_real, alpha_imaginary);
 
   auto lda =
@@ -104,7 +118,7 @@ Value CreateTfrtOps(GemmOp op, typename GemmOp::Adaptor adaptor, Value chain,
       rewriter.create<tfrt::compiler::ConstantI32Op>(loc, rhs_matrix.num_rows);
 
   llvm::APFloat fp_zero = APFloat::getZero(alpha_imaginary.getSemantics());
-  auto const_beta = MakeScalingFactorConstant(rewriter, loc, mlir_compute_type,
+  auto const_beta = MakeScalingFactorConstant(rewriter, loc, mlir_scale_type,
                                               beta_real, fp_zero);
 
   auto ldc = rewriter.create<tfrt::compiler::ConstantI32Op>(
diff --git a/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/pattern_utils.cc b/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/pattern_utils.cc
index 971e8e31e0c987..352e5b45810178 100644
--- a/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/pattern_utils.cc
+++ b/tensorflow/compiler/mlir/tfrt/transforms/lmhlo_to_gpu/pattern_utils.cc
@@ -32,8 +32,6 @@ cudaDataType_t MlirTypeToCudaDataType(mlir::Type type) {
 }
 
 cublasComputeType_t MlirTypeToCublasComputeType(mlir::Type type) {
-  if (auto complexType = type.dyn_cast<mlir::ComplexType>())
-    return MlirTypeToCublasComputeType(complexType.getElementType());
   if (type.isF16()) return CUBLAS_COMPUTE_16F;
   if (type.isF32()) return CUBLAS_COMPUTE_32F;
   if (type.isF64()) return CUBLAS_COMPUTE_64F;
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
index 5424071627b939..31aafe8708b82d 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD
@@ -66,6 +66,7 @@ cc_library(
         "@llvm-project//mlir:AffineToStandard",
         "@llvm-project//mlir:AllPassesAndDialects",
         "@llvm-project//mlir:ArithmeticTransforms",
+        "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:ComplexToStandard",
         "@llvm-project//mlir:GPUDialect",
         "@llvm-project//mlir:GPUToGPURuntimeTransforms",
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
index 6a7af3272dd0dd..93bec92d59fa7c 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"  // from @llvm-project
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"  // from @llvm-project
 #include "mlir/Dialect/Arithmetic/Transforms/Passes.h"  // from @llvm-project
+#include "mlir/Dialect/Bufferization/Transforms/Passes.h"  // from @llvm-project
 #include "mlir/Dialect/GPU/GPUDialect.h"  // from @llvm-project
 #include "mlir/Dialect/GPU/ParallelLoopMapper.h"  // from @llvm-project
 #include "mlir/Dialect/GPU/Passes.h"  // from @llvm-project
@@ -53,7 +54,6 @@ limitations under the License.
 #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"  // from @llvm-project
 #include "mlir/Target/LLVMIR/Dialect/NVVM/NVVMToLLVMIRTranslation.h"  // from @llvm-project
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"  // from @llvm-project
-#include "mlir/Transforms/Bufferize.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
 #include "mlir/Transforms/LoopUtils.h"  // from @llvm-project
@@ -355,7 +355,8 @@ Status LowerLoopsToGPUorCPU(mlir::ModuleOp module, bool embed_memref_prints,
   pm.addNestedPass<mlir::FuncOp>(mlir::createPromoteBuffersToStackPass(
       [](Value alloc) { return IsSmallAlloc(alloc); }));
   // Free all temporaries,
-  pm.addNestedPass<mlir::FuncOp>(::mlir::createBufferDeallocationPass());
+  pm.addNestedPass<mlir::FuncOp>(
+      ::mlir::bufferization::createBufferDeallocationPass());
   pm.addPass(mlir::createCanonicalizerPass());
 
   // Apply the mapping and go to GPU. We cannot do this earlier due to missing
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc
index d40221e275fe16..bf07fa48e87ecf 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc
@@ -262,9 +262,11 @@ extern "C" void* _mlir_ciface_tf_jit_compile(
       ctx->op_device_context()->stream()->GetCudaComputeCapability();
   architectures.push_back(absl::StrCat("sm_", cc.major, cc.minor));
 #elif defined(TENSORFLOW_USE_ROCM)
-  architectures.push_back(
-      ctx->op_device_context()->stream()->parent()
-          ->GetDeviceDescription().rocm_amdgpu_gcn_arch_name());
+  architectures.push_back(ctx->op_device_context()
+                              ->stream()
+                              ->parent()
+                              ->GetDeviceDescription()
+                              .rocm_amdgpu_gcn_arch_name());
 #endif
 
   // Construct `SmallVector`s from arguments.
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD
index b0b991d5e35a17..2e6a41eff9a2cb 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD
@@ -39,6 +39,7 @@ cc_library(
     deps = [
         "//tensorflow/compiler/mlir/hlo",
         "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops",
+        "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LinalgOps",
         "@llvm-project//mlir:MemRefDialect",
@@ -57,6 +58,7 @@ cc_library(
         "//tensorflow/compiler/mlir/hlo",
         "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops",
         "@llvm-project//mlir:BufferizationDialect",
+        "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LinalgOps",
         "@llvm-project//mlir:LinalgTransforms",
@@ -136,6 +138,7 @@ cc_library(
         "@llvm-project//mlir:ArithmeticTransforms",
         "@llvm-project//mlir:ArithmeticToLLVM",
         "@llvm-project//mlir:MathToLibm",
+        "@llvm-project//mlir:BufferizationTransforms",
         "@llvm-project//mlir:MathToLLVM",
         "@llvm-project//mlir:MemRefDialect",
         "@llvm-project//mlir:BufferizationDialect",
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc
index e6fda751c25a4d..23f918153d21a5 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // This file implements logic for translating mixed IR to buffer form.
 
-#include "mlir/Transforms/Bufferize.h"  // from @llvm-project
+#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"  // from @llvm-project
 
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"  // from @llvm-project
 #include "mlir/Dialect/MemRef/IR/MemRef.h"  // from @llvm-project
@@ -434,9 +434,9 @@ class BufferizeRankOp : public OpConversionPattern<RankOp> {
 
 }  // namespace
 
-void populateExtraBufferizePatterns(MLIRContext *context,
-                                    BufferizeTypeConverter *converter,
-                                    RewritePatternSet *patterns) {
+void populateExtraBufferizePatterns(
+    MLIRContext *context, bufferization::BufferizeTypeConverter *converter,
+    RewritePatternSet *patterns) {
   // clang-format off
   patterns->insert<
       BufferizeAndConvertMinimumBroadcastShapesOp,
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc
index 0d7a0541c9ef5d..2789bceb281af9 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"  // from @llvm-project
 #include "mlir/Dialect/Arithmetic/Transforms/Passes.h"  // from @llvm-project
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"  // from @llvm-project
+#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"  // from @llvm-project
 #include "mlir/Dialect/Complex/IR/Complex.h"  // from @llvm-project
 #include "mlir/Dialect/Linalg/IR/LinalgTypes.h"  // from @llvm-project
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"  // from @llvm-project
@@ -46,7 +47,6 @@ limitations under the License.
 #include "mlir/IR/Operation.h"  // from @llvm-project
 #include "mlir/IR/PatternMatch.h"  // from @llvm-project
 #include "mlir/IR/Visitors.h"  // from @llvm-project
-#include "mlir/Transforms/Bufferize.h"  // from @llvm-project
 #include "mlir/Transforms/DialectConversion.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/lhlo/IR/lhlo_ops.h"
 #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h"
@@ -76,7 +76,8 @@ static Value materializeToTensor(OpBuilder& builder, TensorType type,
 }
 
 // TODO(pifon): Remove as soon as https://reviews.llvm.org/D93126 is landed.
-class CustomBufferizeTypeConverter : public BufferizeTypeConverter {
+class CustomBufferizeTypeConverter
+    : public bufferization::BufferizeTypeConverter {
  public:
   CustomBufferizeTypeConverter() {
     // Keep all types unchanged.
@@ -151,7 +152,7 @@ struct ComputeOpAndFuncBufferizePass
     populateReturnOpTypeConversionPattern(patterns, converter);
 
     // Configure legality and structural patterns.
-    populateBufferizeMaterializationLegality(target);
+    bufferization::populateBufferizeMaterializationLegality(target);
     linalg::populateLinalgBufferizePatterns(converter, patterns);
     populateShapeStructuralTypeConversionsAndLegality(converter, patterns,
                                                       target);
@@ -209,7 +210,7 @@ struct TiledLoopBufferizePass
     populateCallOpTypeConversionPattern(patterns, converter);
     populateBranchOpInterfaceTypeConversionPattern(patterns, converter);
     populateReturnOpTypeConversionPattern(patterns, converter);
-    populateBufferizeMaterializationLegality(target);
+    bufferization::populateBufferizeMaterializationLegality(target);
     populateTiledLoopBufferizePattern(&getContext(), &converter, &patterns);
     populateShapeStructuralTypeConversionsAndLegality(converter, patterns,
                                                       target);
@@ -254,7 +255,7 @@ struct FinalBufferizePass : public FinalBufferizePassBase<FinalBufferizePass> {
         tensor::CastOp, tensor::DimOp, chlo::MinimumBroadcastShapesOp,
         bufferization::ToTensorOp, bufferization::ToMemrefOp,
         linalg::TensorExpandShapeOp, linalg::TensorCollapseShapeOp>();
-    BufferizeTypeConverter converter;
+    bufferization::BufferizeTypeConverter converter;
     auto typesAreLegal = [&converter](Operation* op) {
       return converter.isLegal(op->getOperandTypes()) &&
              converter.isLegal(op->getResultTypes());
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_tiled_loop.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_tiled_loop.cc
index 645cba2bdeaceb..c7240bc23f1c76 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_tiled_loop.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_tiled_loop.cc
@@ -16,6 +16,7 @@ limitations under the License.
 // This file implements conversion of `linalg.tiled_loop` to buffer form.
 
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"  // from @llvm-project
+#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"  // from @llvm-project
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"  // from @llvm-project
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"  // from @llvm-project
 #include "mlir/Dialect/MemRef/IR/MemRef.h"  // from @llvm-project
@@ -291,9 +292,9 @@ struct BufferizeVectorTransferWriteOp
 
 }  // namespace
 
-void populateTiledLoopBufferizePattern(MLIRContext *context,
-                                       BufferizeTypeConverter *converter,
-                                       RewritePatternSet *patterns) {
+void populateTiledLoopBufferizePattern(
+    MLIRContext *context, bufferization::BufferizeTypeConverter *converter,
+    RewritePatternSet *patterns) {
   // clang-format off
   patterns->insert<
     BufferizeExtractSliceOp,
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
index bf0dbcf00a613e..912cefcda48d05 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc
@@ -172,7 +172,6 @@ class GpuKernelToBlobPass
 
       // Compile PTX code with ptxas if requested and possible and fall back to
       // a compute image, otherwise.
-      bool include_compute_profile = is_compute_profile;
       if (!is_compute_profile) {
         auto gpu_asm = tensorflow::se::CompileGpuAsm(cc_major, cc_minor,
                                                      ptx.c_str(), gpu_asm_opts);
@@ -180,12 +179,18 @@ class GpuKernelToBlobPass
           images.push_back(
               {absl::StrCat("sm_", arch), std::move(gpu_asm.ValueOrDie())});
         } else {
-          LOG(WARNING)
-              << "Failed to compile PTX code, falling back to compute profile.";
-          include_compute_profile = true;
+#ifdef PLATFORM_GOOGLE
+          // Require compilation with ptxas.
+          return gpu_asm;
+#else
+          // Fall back to compilation by driver in OSS.
+          LOG(WARNING) << "Failed to compile generated PTX with ptxas. Falling "
+                          "back to compilation by driver.";
+          is_compute_profile = true;
+#endif
         }
       }
-      if (include_compute_profile) {
+      if (is_compute_profile) {
         std::vector<uint8_t> ptx_bytes;
         ptx_bytes.reserve(ptx.size() + 1);
         std::copy(ptx.begin(), ptx.end(), std::back_inserter(ptx_bytes));
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h
index 762f88cda424d5..c8339cb06388df 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewriters.h
@@ -19,8 +19,9 @@ limitations under the License.
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
 
 namespace mlir {
-
+namespace bufferization {
 class BufferizeTypeConverter;
+}
 class LLVMTypeConverter;
 class MLIRContext;
 class RewritePatternSet;
@@ -44,14 +45,14 @@ namespace transforms {
 
 /// Collects a set of patterns that bufferize operations from the standard and
 /// other dialects.
-void populateExtraBufferizePatterns(MLIRContext *context,
-                                    BufferizeTypeConverter *converter,
-                                    RewritePatternSet *patterns);
+void populateExtraBufferizePatterns(
+    MLIRContext *context, bufferization::BufferizeTypeConverter *converter,
+    RewritePatternSet *patterns);
 
 /// Populate pattern to bufferize `linalg.tiled_loop`.
-void populateTiledLoopBufferizePattern(MLIRContext *context,
-                                       BufferizeTypeConverter *converter,
-                                       RewritePatternSet *patterns);
+void populateTiledLoopBufferizePattern(
+    MLIRContext *context, bufferization::BufferizeTypeConverter *converter,
+    RewritePatternSet *patterns);
 
 /// Populate patterns to rewrite TF operations to TF framework JIT invocations.
 void PopulateTFToJITInvocationPatterns(MLIRContext *ctx,
diff --git a/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir
index 28f990e4d38511..d49acf736ece43 100644
--- a/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir
+++ b/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir
@@ -10,9 +10,9 @@
 // CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<16xf32>}
 // CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<[3, 0, 1, 2]> : tensor<4xi32>}
 // CHECK-DAG: %[[VAR2:.*]] = "tosa.transpose"(%arg1, %[[VAR1]])
-// CHECK: %[[VAR3:.*]] = "tosa.conv2d"(%arg0, %[[VAR2]], %[[VAR0]]) {dilation = [1, 1], pad = [0, 0, 0, 0], stride = [1, 1]}
-func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<1x1x8x16xf32>) -> tensor<1x32x32x16xf32> {
-  %3 = "tf.Conv2D"(%arg0, %arg1)  {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}  : (tensor<1x32x32x8xf32>, tensor<1x1x8x16xf32>) -> tensor<1x32x32x16xf32>
+// CHECK: %[[VAR3:.*]] = "tosa.conv2d"(%arg0, %[[VAR2]], %[[VAR0]]) {dilation = [1, 1], pad = [0, 1, 0, 1], stride = [1, 1]}
+func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<2x2x8x16xf32>) -> tensor<1x32x32x16xf32> {
+  %3 = "tf.Conv2D"(%arg0, %arg1)  {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}  : (tensor<1x32x32x8xf32>, tensor<2x2x8x16xf32>) -> tensor<1x32x32x16xf32>
   return %3 : tensor<1x32x32x16xf32>
 }
 
diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir
index bcef454ea48e0b..8f1605696561d3 100644
--- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir
+++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir
@@ -12,10 +12,10 @@
 
 // CHECK-LABEL: test_conv2d
 // CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<0.000000e+00> : tensor<16xf32>}
-// CHECK: %[[VAR1:.*]] = "tosa.conv2d"(%arg0, %arg1, %[[VAR0]]) {dilation = [1, 1], pad = [0, 0, 0, 0], stride = [1, 1]}
-func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>) -> tensor<*xf32> {
+// CHECK: %[[VAR1:.*]] = "tosa.conv2d"(%arg0, %arg1, %[[VAR0]]) {dilation = [1, 1], pad = [0, 1, 0, 1], stride = [1, 1]}
+func @test_conv2d(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>) -> tensor<*xf32> {
   %cst = arith.constant dense<0.000000e+00> : tensor<16xf32>
-  %0 = "tfl.conv_2d"(%arg0, %arg1, %cst)  {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}  : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<*xf32>
+  %0 = "tfl.conv_2d"(%arg0, %arg1, %cst)  {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}  : (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<*xf32>
   return %0 : tensor<*xf32>
 }
 
@@ -33,10 +33,10 @@ func @test_conv2d_dynamic(%arg0: tensor<?x32x32x8xf32>, %arg1: tensor<16x1x1x8xf
 // -----
 
 // CHECK-LABEL: test_conv2d_bias
-// CHECK: %[[VAR0:.*]] = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = [1, 1], pad = [0, 0, 0, 0], stride = [1, 1]}
+// CHECK: %[[VAR0:.*]] = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = [1, 1], pad = [0, 1, 0, 1], stride = [1, 1]}
 // CHECK-SAME: tensor<1x32x32x16xf32>
-func @test_conv2d_bias(%arg0: tensor<1x32x32x8xf32>, %cst: tensor<16x1x1x8xf32>, %cst_0: tensor<16xf32>) -> tensor<*xf32> {
-  %0 = "tfl.conv_2d"(%arg0, %cst, %cst_0)  {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}  : (tensor<1x32x32x8xf32>, tensor<16x1x1x8xf32>, tensor<16xf32>) -> tensor<*xf32>
+func @test_conv2d_bias(%arg0: tensor<1x32x32x8xf32>, %cst: tensor<16x2x2x8xf32>, %cst_0: tensor<16xf32>) -> tensor<*xf32> {
+  %0 = "tfl.conv_2d"(%arg0, %cst, %cst_0)  {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}  : (tensor<1x32x32x8xf32>, tensor<16x2x2x8xf32>, tensor<16xf32>) -> tensor<*xf32>
   return %0 : tensor<*xf32>
 }
 
@@ -55,14 +55,14 @@ func @test_transpose_conv2d(%arg0: tensor<1x32x32x8xf32>, %cst_0: tensor<16x1x1x
 // -----
 
 // CHECK-LABEL: test_conv2d_qi8
-// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<{{.*}}> : tensor<16x1x1x8xi8>}
+// CHECK-DAG: %[[VAR0:.*]] = "tosa.const"() {value = dense<{{.*}}> : tensor<16x2x2x8xi8>}
 // CHECK-DAG: %[[VAR1:.*]] = "tosa.const"() {value = dense<0> : tensor<16xi32>}
-// CHECK-DAG: %[[VAR2:.*]] = "tosa.conv2d"(%arg0, %[[VAR0]], %[[VAR1]]) {dilation = [1, 1], pad = [0, 0, 0, 0], quantization_info = {input_zp = 0 : i32, weight_zp = 0 : i32}, stride = [1, 1]}
+// CHECK-DAG: %[[VAR2:.*]] = "tosa.conv2d"(%arg0, %[[VAR0]], %[[VAR1]]) {dilation = [1, 1], pad = [0, 1, 0, 1], quantization_info = {input_zp = 0 : i32, weight_zp = 0 : i32}, stride = [1, 1]}
 // CHECK: %[[VAR3:.*]] = "tosa.rescale"(%[[VAR2]])
 func @test_conv2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform<i8:f32, 0.015684768557548523>>) -> tensor<1x32x32x16x!quant.uniform<i8:f32, 0.078431375324726104>> {
-  %0 = "tfl.pseudo_qconst"() {qtype = tensor<16x1x1x8x!quant.uniform<i8<-127:127>:f32:0, {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1}>>, value = dense<"0x851F811ED39B1160E8BFD11A44C8815EC054BEB7658131420857498B9B7FA28499818C7AB44894E64B81C6C350A581E8042F48DB13B85A81EEE481FD28A43BBBC381A70384A46F47811C2A4D64D8D285DEDCE37F1FFC6B5BB0A3794EED7F98D9060BA5ED5EC6A37F7FF4E67364062F078AE9DDDF778155794C54AE536D7FAC05"> : tensor<16x1x1x8xi8>} : () -> tensor<16x1x1x8x!quant.uniform<i8<-127:127>:f32:0,  {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1} >>
+  %0 = "tfl.pseudo_qconst"() {qtype = tensor<16x2x2x8x!quant.uniform<i8<-127:127>:f32:0, {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1}>>, value = dense<42> : tensor<16x2x2x8xi8>} : () -> tensor<16x2x2x8x!quant.uniform<i8<-127:127>:f32:0,  {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1} >>
   %1 = "tfl.pseudo_qconst"() {qtype = tensor<16x!quant.uniform<i32:f32:0, {2.0,2.0,1.0,1.0,1.0,2.0,2.4,1.7,2.3,2.4,2.4,2.3,2.1,2.4,2.1,2.4}>>, value = dense<0> : tensor<16xi32>} : () -> tensor<16x!quant.uniform<i32:f32:0,  {2.0,2.0,1.0,1.0,1.0,2.0,2.4,1.7,2.3,2.4,2.4,2.3,2.1,2.4,2.1,2.4} >>
-  %2 = "tfl.conv_2d"(%arg0, %0, %1) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8x!quant.uniform<i8:f32, 0.015684768557548523>>, tensor<16x1x1x8x!quant.uniform<i8<-127:127>:f32:0, {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1}>>, tensor<16x!quant.uniform<i32:f32:0, {2.0,2.0,1.0,1.0,1.0,2.0,2.4,1.7,2.3,2.4,2.4,2.3,2.1,2.4,2.1,2.4} >>) -> tensor<1x32x32x16x!quant.uniform<i8:f32, 0.078431375324726104>>
+  %2 = "tfl.conv_2d"(%arg0, %0, %1) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8x!quant.uniform<i8:f32, 0.015684768557548523>>, tensor<16x2x2x8x!quant.uniform<i8<-127:127>:f32:0, {0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1}>>, tensor<16x!quant.uniform<i32:f32:0, {2.0,2.0,1.0,1.0,1.0,2.0,2.4,1.7,2.3,2.4,2.4,2.3,2.1,2.4,2.1,2.4} >>) -> tensor<1x32x32x16x!quant.uniform<i8:f32, 0.078431375324726104>>
   return %2 : tensor<1x32x32x16x!quant.uniform<i8:f32, 0.078431375324726104>>
 }
 
@@ -1493,12 +1493,12 @@ func @test_fullyconnected_hybrid(%arg0: tensor<14x19xf32>) -> tensor<*xf32> {
 
 // CHECK-LABEL: @test_conv2d_infer
 // CHECK: -> tensor<*xf32>
-func @test_conv2d_infer(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x1x1x8xf32>) -> tensor<*xf32> {
+func @test_conv2d_infer(%arg0: tensor<1x32x32x8xf32>, %arg1: tensor<16x2x2x8xf32>) -> tensor<*xf32> {
   %cst = arith.constant dense<0.000000e+00> : tensor<16xf32>
   // CHECK: tosa.add
   // CHECK: tosa.conv2d
   // CHECK: tensor.cast
-  %0 = "tfl.add"(%arg1, %arg1) { fused_activation_function = "NONE" } : (tensor<16x1x1x8xf32>, tensor<16x1x1x8xf32>) -> tensor<*xf32>
+  %0 = "tfl.add"(%arg1, %arg1) { fused_activation_function = "NONE" } : (tensor<16x2x2x8xf32>, tensor<16x2x2x8xf32>) -> tensor<*xf32>
   %1 = "tfl.conv_2d"(%arg0, %0, %cst)  {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32}  : (tensor<1x32x32x8xf32>, tensor<*xf32>, tensor<16xf32>) -> tensor<*xf32>
   return %1 : tensor<*xf32>
 }
diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD
index e10ebe5fa2c78c..39a28a3818d5a0 100644
--- a/tensorflow/compiler/mlir/xla/BUILD
+++ b/tensorflow/compiler/mlir/xla/BUILD
@@ -714,6 +714,28 @@ tf_cc_binary(
         "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
         "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes",
         "//tensorflow/compiler/xla/service:cpu_plugin",
+        "//tensorflow/core/ir/types:Dialect",
+        "@llvm-project//mlir:AllPassesAndDialects",
+        "@llvm-project//mlir:MlirOptLib",
+    ],
+)
+
+tf_cc_binary(
+    name = "xla-opt-gpu",
+    testonly = True,
+    srcs = ["xla_opt_main.cc"],
+    deps = [
+        ":adjust_layout",  # buildcleaner: keep
+        ":mhlo_to_lhlo_with_xla",  # buildcleaner: keep
+        ":tf_xla_passes",  # buildcleaner: keep
+        ":xla_legalize_tf",  # buildcleaner: keep
+        ":xla_legalize_tf_no_fallback",  # buildcleaner: keep
+        ":xla_passes",  # buildcleaner: keep
+        "//tensorflow/compiler/mlir:init_mlir",
+        "//tensorflow/compiler/mlir/hlo:all_passes",
+        "//tensorflow/compiler/mlir/hlo:hlo_dialect_registration",
+        "//tensorflow/compiler/mlir/tensorflow:tensorflow_types",
+        "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes",
         "//tensorflow/compiler/xla/service:gpu_plugin",
         "//tensorflow/core/ir/types:Dialect",
         "@llvm-project//mlir:AllPassesAndDialects",
diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc
index 2365021bcbba54..3baa5ad786a010 100644
--- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc
+++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc
@@ -60,6 +60,8 @@ namespace xla {
 
 namespace {
 
+constexpr char kShardingAttr[] = "mhlo.sharding";
+
 // Note: This sanitization function causes an irreversible many-to-one mapping
 // and any solution to mitigate this would cause issues with the reverse
 // direction. Longterm solution is to add a function attribute to maintain the
@@ -248,6 +250,13 @@ StatusOr<mlir::Operation*> HloFunctionImporter::ImportInstructionImpl(
   mlir::Location loc = GenerateInstructionLocation(instruction, func_builder);
 
   llvm::SmallVector<NamedAttribute, 10> attributes;
+  if (instruction->has_sharding()) {
+    attributes.push_back(builder_->getNamedAttr(
+        kShardingAttr,
+        builder_->getStringAttr(
+            instruction->sharding().ToProto().SerializeAsString())));
+  }
+
   switch (instruction->opcode()) {
     case HloOpcode::kParameter: {
       return nullptr;
@@ -347,6 +356,19 @@ StatusOr<mlir::Operation*> HloFunctionImporter::ImportInstructionImpl(
     }
     case HloOpcode::kCustomCall: {
       auto custom_call = Cast<HloCustomCallInstruction>(instruction);
+      const auto& called_computations = custom_call->called_computations();
+      if (!called_computations.empty()) {
+        llvm::SmallVector<mlir::Attribute> callees;
+        callees.reserve(called_computations.size());
+        for (HloComputation* callee : called_computations) {
+          TF_ASSIGN_OR_RETURN(FuncOp function, ImportAsFunc(*callee));
+          callees.push_back(mlir::FlatSymbolRefAttr::get(builder_->getContext(),
+                                                         function.getName()));
+        }
+        attributes.push_back(builder_->getNamedAttr(
+            "called_computations",
+            mlir::ArrayAttr::get(builder_->getContext(), callees)));
+      }
       if (custom_call->layout_constrained()) {
         TF_ASSIGN_OR_RETURN(
             mlir::ArrayAttr operand_layouts,
@@ -600,7 +622,7 @@ StatusOr<mlir::Operation*> HloFunctionImporter::ImportInstructionImpl(
       int num_branches = instruction->branch_count();
       auto op = func_builder->create<mlir::mhlo::CaseOp>(
           loc, rets, operands, attributes, num_branches);
-      for (auto index_and_computation :
+      for (const auto& index_and_computation :
            llvm::enumerate(instruction->branch_computations())) {
         auto index = index_and_computation.index();
         HloComputation* computation = index_and_computation.value();
@@ -624,7 +646,9 @@ StatusOr<mlir::Operation*> HloFunctionImporter::ImportInstructionImpl(
           builder_->getI64IntegerAttr(all_gather->all_gather_dimension())));
       attributes.push_back(
           ConvertReplicaGroups(all_gather->replica_groups(), builder_));
-      attributes.push_back(ConvertChannelHandle(all_gather->channel_id()));
+      if (all_gather->channel_id().has_value())
+        attributes.push_back(
+            ConvertChannelHandle(all_gather->channel_id().value()));
       return func_builder
           ->create<mlir::mhlo::AllGatherOp>(loc, result_type, operands,
                                             attributes)
@@ -634,13 +658,48 @@ StatusOr<mlir::Operation*> HloFunctionImporter::ImportInstructionImpl(
       auto all_reduce = Cast<HloAllReduceInstruction>(instruction);
       attributes.push_back(
           ConvertReplicaGroups(all_reduce->replica_groups(), builder_));
-      attributes.push_back(ConvertChannelHandle(all_reduce->channel_id()));
+      if (all_reduce->channel_id().has_value())
+        attributes.push_back(
+            ConvertChannelHandle(all_reduce->channel_id().value()));
       auto all_reduce_op = func_builder->create<mlir::mhlo::AllReduceOp>(
           loc, result_type, operands, attributes);
       TF_RETURN_IF_ERROR(ImportAsRegion(*all_reduce->to_apply(),
                                         &all_reduce_op.computation()));
       return all_reduce_op.getOperation();
     }
+    case HloOpcode::kAllToAll: {
+      // TODO(b/207152612): all-to-all HLO can either have pre-split operands
+      // (and returns a tuple) or a single operand that is split across
+      // `split_dimension` into the number of replicas in a group. Only the
+      // latter case (array all-to-all) is supported in importer right now and
+      // the former (tuple all-to-all) is not supported yet.
+      auto all_to_all = Cast<HloAllToAllInstruction>(instruction);
+      if (all_to_all->shape().IsTuple())
+        return tensorflow::errors::Unimplemented(
+            "Importing tuple all-to-all HLO is not supported yet");
+
+      // Check invariants of array all-to-all. This is a sanity check and is
+      // verified by the HLO verifier.
+      if (!all_to_all->split_dimension().has_value() || operands.size() != 1 ||
+          all_to_all->replica_groups().empty())
+        return tensorflow::errors::InvalidArgument(
+            "Array all-to-all should have a split dimension, one operand and "
+            "non-empty replica groups");
+
+      auto replica_groups_attr =
+          ConvertReplicaGroups(all_to_all->replica_groups(), builder_)
+              .getValue()
+              .cast<DenseIntElementsAttr>();
+      uint64_t split_dim = all_to_all->split_dimension().value();
+      uint64_t concat_dim = split_dim;
+      uint64_t split_count = all_to_all->replica_groups()[0].replica_ids_size();
+
+      return func_builder
+          ->create<mlir::mhlo::AllToAllOp>(loc, result_type, operands[0],
+                                           split_dim, concat_dim, split_count,
+                                           replica_groups_attr)
+          .getOperation();
+    }
     case HloOpcode::kReduce: {
       // Operands in the first half are reduction inputs and the remaining
       // operands are corresponding initial values.
@@ -758,6 +817,24 @@ StatusOr<mlir::Operation*> HloFunctionImporter::ImportInstructionImpl(
                                                   attributes)
           .getOperation();
     }
+    case HloOpcode::kReduceScatter: {
+      auto reduce_scatter = Cast<HloReduceScatterInstruction>(instruction);
+      attributes.push_back(builder_->getNamedAttr(
+          "scatter_dimension",
+          builder_->getI64IntegerAttr(reduce_scatter->scatter_dimension())));
+      attributes.push_back(
+          ConvertReplicaGroups(reduce_scatter->replica_groups(), builder_));
+      if (reduce_scatter->channel_id().has_value())
+        attributes.push_back(
+            ConvertChannelHandle(reduce_scatter->channel_id().value()));
+      auto reduce_scatter_op =
+          func_builder->create<mlir::mhlo::ReduceScatterOp>(
+              loc, result_type, operands, attributes);
+      TF_RETURN_IF_ERROR(ImportAsRegion(*reduce_scatter->to_apply(),
+                                        &reduce_scatter_op.computation()));
+
+      return reduce_scatter_op.getOperation();
+    }
     case HloOpcode::kReduceWindow: {
       llvm::SmallVector<Type, 4> return_types = {result_type};
       if (mlir::TupleType tuple_ty = result_type.dyn_cast<mlir::TupleType>()) {
@@ -1114,7 +1191,7 @@ mlir::NamedAttribute HloFunctionImporter::ConvertSourceTargetPairs(
     const std::vector<std::pair<int64_t, int64_t>>& source_target_pairs,
     mlir::Builder* builder) {
   std::vector<int64_t> attr(source_target_pairs.size() * 2);
-  for (auto p : llvm::enumerate(source_target_pairs)) {
+  for (const auto& p : llvm::enumerate(source_target_pairs)) {
     attr[2 * p.index()] = p.value().first;
     attr[2 * p.index() + 1] = p.value().second;
   }
diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc
index 084ae21fab7336..3fe755f5baf803 100644
--- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc
+++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc
@@ -139,12 +139,26 @@ StatusOr<XlaOp> MlirHloBuilder::CustomCallInternal(
     const Literal* literal, absl::optional<Window> window,
     absl::optional<ConvolutionDimensionNumbers> dnums,
     CustomCallSchedule schedule, CustomCallApiVersion api_version) {
-  mlir::ArrayAttr operand_layouts;
-  mlir::ArrayAttr result_layouts;
+  TF_RET_CHECK(output_operand_aliasing.empty())
+      << "MLIR CustomCallOp does not support output_operand_aliasing yet";
+  TF_RET_CHECK(literal == nullptr)
+      << "MLIR CustomCallOp does not support literal yet";
+  TF_RET_CHECK(!window.has_value())
+      << "MLIR CustomCallOp does not support ConvolutionDimensionNumbers yet";
+  TF_RET_CHECK(!dnums.has_value())
+      << "MLIR CustomCallOp does not support ConvolutionDimensionNumbers yet";
+  TF_RET_CHECK(schedule == CustomCallSchedule::SCHEDULE_NONE)
+      << "MLIR CustomCallOp does not support custom-call-schedule yet";
+
+  llvm::SmallVector<mlir::NamedAttribute> attributes;
   if (operand_shapes_with_layout.has_value()) {
-    TF_ASSIGN_OR_RETURN(operand_layouts,
+    TF_ASSIGN_OR_RETURN(mlir::ArrayAttr operand_layouts,
                         ExtractLayoutsFromShapes(
                             operand_shapes_with_layout.value(), &builder_));
+    attributes.push_back(
+        builder_.getNamedAttr("operand_layouts", operand_layouts));
+
+    mlir::ArrayAttr result_layouts;
     if (shape.IsTuple()) {
       TF_ASSIGN_OR_RETURN(result_layouts,
                           ExtractLayoutsFromTuple(shape, &builder_));
@@ -152,29 +166,26 @@ StatusOr<XlaOp> MlirHloBuilder::CustomCallInternal(
       TF_ASSIGN_OR_RETURN(result_layouts,
                           ExtractLayoutsFromShapes({shape}, &builder_));
     }
+    attributes.push_back(
+        builder_.getNamedAttr("result_layouts", result_layouts));
   }
   TF_ASSIGN_OR_RETURN(mlir::Type ty, ConvertShapeToType<mlir::RankedTensorType>(
                                          shape, builder_));
   TF_ASSIGN_OR_RETURN(auto mlir_api_version,
                       ConvertCustomCallApiVersion(api_version));
-  TF_RET_CHECK(output_operand_aliasing.empty())
-      << "MLIR CustomCallOp does not support output_operand_aliasing yet";
-  TF_RET_CHECK(literal == nullptr)
-      << "MLIR CustomCallOp does not support literal yet";
-  TF_RET_CHECK(!window.has_value())
-      << "MLIR CustomCallOp does not support ConvolutionDimensionNumbers yet";
-  TF_RET_CHECK(!dnums.has_value())
-      << "MLIR CustomCallOp does not support ConvolutionDimensionNumbers yet";
-  TF_RET_CHECK(schedule == CustomCallSchedule::SCHEDULE_NONE)
-      << "MLIR CustomCallOp does not support custom-call-schedule yet";
+  attributes.push_back(builder_.getNamedAttr(
+      "api_version", mlir::mhlo::CustomCallApiVersionAttr::get(
+                         builder_.getContext(), mlir_api_version)));
+
+  attributes.push_back(builder_.getNamedAttr(
+      "call_target_name", builder_.getStringAttr(call_target_name)));
+  attributes.push_back(builder_.getNamedAttr(
+      "has_side_effect", builder_.getBoolAttr(has_side_effect)));
+  attributes.push_back(
+      builder_.getNamedAttr("backend_config", builder_.getStringAttr(opaque)));
+
   auto op = builder_.create<mlir::mhlo::CustomCallOp>(
-      loc_, ty, GetValues(operands), builder_.getStringAttr(call_target_name),
-      /*has_side_effect=*/builder_.getBoolAttr(has_side_effect),
-      builder_.getStringAttr(opaque),
-      /*api_version=*/
-      mlir::mhlo::CustomCallApiVersionAttr::get(builder_.getContext(),
-                                                mlir_api_version),
-      operand_layouts, result_layouts);
+      loc_, ty, GetValues(operands), attributes);
   return MakeXlaOp(op.getResult(0));
 }
 
@@ -573,9 +584,7 @@ StatusOr<XlaOp> MlirHloBuilder::PadInternal(
   TF_ASSIGN_OR_RETURN(
       mlir::Type result_type,
       ConvertShapeToType<mlir::RankedTensorType>(shape, builder_));
-  std::vector<int64_t> low;
-  std::vector<int64_t> high;
-  std::vector<int64_t> internal;
+  llvm::SmallVector<int64_t> low, high, internal;
   for (auto& dimension : padding_config.dimensions()) {
     low.push_back(dimension.edge_padding_low());
     high.push_back(dimension.edge_padding_high());
diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc
index 535d8b56659b32..ae1ee95854acdc 100644
--- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc
+++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc
@@ -577,6 +577,17 @@ class ConvertToHloModule {
       mlir::CallOp call_op, xla::XlaBuilder* builder,
       ConvertToHloModule::ValueLoweringMap* value_lowering);
 
+  // Look up a symbol with the specified name, returning null if no such name
+  // exists.
+  FuncOp LookUpSymbol(FlatSymbolRefAttr symbol) {
+    return module_.lookupSymbol<mlir::FuncOp>(symbol);
+  }
+
+  // Get Reference to lowered XLA computation for a function.
+  xla::XlaComputation& GetLoweredComputation(FuncOp func) {
+    return lowered_computation_[func];
+  }
+
   LogicalResult Lower(
       mlir::Operation* inst, bool is_entry_function,
       llvm::ArrayRef<absl::optional<xla::OpSharding>> ret_shardings,
@@ -903,32 +914,63 @@ LogicalResult ExportXlaOp(ConvertOp op, OpLoweringContext ctx) {
 LogicalResult ExportXlaOp(CustomCallOp op, OpLoweringContext ctx) {
   if (op.getNumResults() != 1)
     return op.emitOpError() << "with multiple results cannot be exported";
+
+  if (op.called_computations().size() > 1)
+    return op.emitOpError()
+           << "cannot export with more than one called computations";
+
+  // Custom call can be exported either with called computation or with layout
+  // attributes. The XlaBuilder API does not allow both.
+  if (!op.called_computations().empty() && op.operand_layouts() &&
+      op.result_layouts()) {
+    return op.emitOpError() << "cannot export if both called computation and "
+                               "layouts are specified";
+  }
+
   Value result = op.getResult(0);
   llvm::SmallVector<xla::XlaOp> args;
   if (failed(GetTuple(op, op.args(), ctx, args))) return failure();
   auto xla_api_version = xla::ConvertCustomCallApiVersion(op.api_version());
   if (!xla_api_version.ok()) return failure();
   auto& value_map = *ctx.values;
-  if (!op.operand_layouts().hasValue() || !op.result_layouts().hasValue()) {
-    value_map[result] = xla::CustomCall(
-        ctx.builder, std::string(op.call_target_name()), args,
+
+  if (op.called_computations().size() == 1) {
+    mlir::FuncOp callee = ctx.converter->LookUpSymbol(
+        op.called_computations()[0].cast<FlatSymbolRefAttr>());
+    if (failed(ctx.converter->RunOnFunction(callee))) return failure();
+    xla::XlaComputation& computation =
+        ctx.converter->GetLoweredComputation(callee);
+    value_map[result] = xla::CustomCallWithComputation(
+        ctx.builder, std::string(op.call_target_name()), args, computation,
         xla::TypeToShape(result.getType()), std::string(op.backend_config()),
-        op.has_side_effect(), /*output_operand_aliasing=*/{},
+        op.has_side_effect(),
+        /*output_operand_aliasing=*/{},
+        /*literal=*/nullptr,
+        /*schedule=*/xla::CustomCallSchedule::SCHEDULE_NONE,
+        /*api_version=*/*xla_api_version);
+    return success();
+  }
+
+  if (op.operand_layouts() && op.result_layouts()) {
+    auto operand_shapes_with_layout = ConvertTypesToShapesWithLayout(
+        op.getOperandTypes(), op.operand_layouts().getValue());
+    xla::Shape result_shape_with_layout = GetCustomCallResultShapeWithLayout(
+        result.getType(), op.result_layouts().getValue());
+    value_map[result] = xla::CustomCallWithLayout(
+        ctx.builder, std::string(op.call_target_name()), args,
+        result_shape_with_layout, operand_shapes_with_layout,
+        std::string(op.backend_config()), op.has_side_effect(),
+        /*output_operand_aliasing=*/{},
         /*literal=*/nullptr,
         /*schedule=*/xla::CustomCallSchedule::SCHEDULE_NONE,
         /*api_version=*/*xla_api_version);
     return success();
   }
 
-  auto operand_shapes_with_layout = ConvertTypesToShapesWithLayout(
-      op.getOperandTypes(), op.operand_layouts().getValue());
-  xla::Shape result_shape_with_layout = GetCustomCallResultShapeWithLayout(
-      result.getType(), op.result_layouts().getValue());
-  value_map[result] = xla::CustomCallWithLayout(
+  value_map[result] = xla::CustomCall(
       ctx.builder, std::string(op.call_target_name()), args,
-      result_shape_with_layout, operand_shapes_with_layout,
-      std::string(op.backend_config()), op.has_side_effect(),
-      /*output_operand_aliasing=*/{},
+      xla::TypeToShape(result.getType()), std::string(op.backend_config()),
+      op.has_side_effect(), /*output_operand_aliasing=*/{},
       /*literal=*/nullptr,
       /*schedule=*/xla::CustomCallSchedule::SCHEDULE_NONE,
       /*api_version=*/*xla_api_version);
@@ -1582,10 +1624,9 @@ LogicalResult ConvertToHloModule::Lower(
     // Construct the return value for the function. If there is a single value
     // returned, then return it directly, else create a tuple and return.
     unsigned num_return_values = inst->getNumOperands();
+    const bool has_ret_shardings =
+        !ret_shardings.empty() && AllOptionalShardingsAreSet(ret_shardings);
     if ((return_tuple_ && is_entry_function) || num_return_values != 1) {
-      const bool has_ret_shardings =
-          !ret_shardings.empty() && AllOptionalShardingsAreSet(ret_shardings);
-
       std::vector<xla::XlaOp> returns(num_return_values);
       for (OpOperand& ret : inst->getOpOperands()) {
         unsigned index = ret.getOperandNumber();
@@ -1623,7 +1664,14 @@ LogicalResult ConvertToHloModule::Lower(
       if (failed(GetXlaOp(inst->getOperand(0), value_map, &operand, inst)))
         return failure();
 
-      *return_value = operand;
+      if (has_ret_shardings) {
+        auto tuple = Tuple(builder, {operand});
+        builder->SetSharding(*ret_shardings[0]);
+        *return_value = GetTupleElement(tuple, 0);
+        builder->ClearSharding();
+      } else {
+        *return_value = operand;
+      }
     }
 
     return success();
@@ -1695,12 +1743,22 @@ LogicalResult ConvertToHloModule::RunOnFunction(mlir::FuncOp f) {
       auto aliasing_output =
           f.getArgAttrOfType<mlir::IntegerAttr>(i, "tf.aliasing_output");
       if (!aliasing_output) continue;
+      xla::ShapeIndex output_index;
+      if ((return_tuple_ && entry_function) || f.getNumResults() != 1) {
+        output_index = {aliasing_output.getInt()};
+      } else {
+        if (aliasing_output.getInt() != 0) {
+          return f.emitError(
+              "Aliasing output must be 0 if only one output exists");
+        }
+        output_index = {};
+      }
       if (use_tuple_args_) {
-        builder.SetUpAlias(/*output_index=*/{aliasing_output.getInt()},
-                           /*param_number=*/0, /*param_index=*/{i});
+        builder.SetUpAlias(output_index, /*param_number=*/0,
+                           /*param_index=*/{i});
       } else {
-        builder.SetUpAlias(/*output_index=*/{aliasing_output.getInt()},
-                           /*param_number=*/i, /*param_index=*/{});
+        builder.SetUpAlias(output_index, /*param_number=*/i,
+                           /*param_index=*/{});
       }
     }
     // Do not populate this field when nothing is replicated, since empty field
@@ -1826,6 +1884,9 @@ LogicalResult ConvertToHloModule::LowerBasicBlockAsFunction(
     for (BlockArgument& arg : block->getArguments()) {
       auto num = arg.getArgNumber();
       xla::Shape shape = xla::TypeToShape(arg.getType());
+      if (!arg_shardings.empty() && arg_shardings[num]) {
+        builder->SetSharding(*arg_shardings[num]);
+      }
       if (entry_args_same_across_replicas.empty()) {
         lowering[arg] =
             xla::Parameter(builder, num, shape, absl::StrCat("Arg_", num));
@@ -1835,6 +1896,7 @@ LogicalResult ConvertToHloModule::LowerBasicBlockAsFunction(
             std::vector<bool>(entry_args_same_across_replicas[num],
                               xla::ShapeUtil::GetLeafCount(shape)));
       }
+      builder->ClearSharding();
     }
   }
 
diff --git a/tensorflow/compiler/mlir/xla/tests/BUILD b/tensorflow/compiler/mlir/xla/tests/BUILD
index 48d4baa2ef7d18..bae69f1315a7fd 100644
--- a/tensorflow/compiler/mlir/xla/tests/BUILD
+++ b/tensorflow/compiler/mlir/xla/tests/BUILD
@@ -1,60 +1,12 @@
 load("//tensorflow:tensorflow.bzl", "filegroup")
 load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load(
-    "//tensorflow/core/platform:build_config_root.bzl",
-    "tf_cuda_tests_tags",
-)
 
 package(licenses = ["notice"])
 
 glob_lit_tests(
     data = [":test_utilities"],
     driver = "@llvm-project//mlir:run_lit.sh",
-    tags_override = {
-        "adjust-layout.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf-BatchMatMulV2.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf-binary-elementwise.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf-collective.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf-communication.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf-control-flow.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf-full-conversion.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf-include-tf2xla-fallback.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf-no-tf2xla-fallback.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf-prefer-tf2xla.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf-types.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "legalize-tf-with-tf2xla.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-        "prepare-for-export.mlir": tf_cuda_tests_tags() + [
-            "nomsan",  # TODO(b/181135145)
-        ],
-    },
     test_file_exts = [
         "mlir",
         "hlotxt",
diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/BUILD b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/BUILD
index 14802b814c7c36..0b5c5713199113 100644
--- a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/BUILD
+++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/BUILD
@@ -17,18 +17,7 @@ glob_lit_tests(
             "noubsan",
             "no_cuda_asan",
             "no_oss",
-        ],  # b/171751580
-        "ops.mlir": tf_cuda_tests_tags() + [
-            "nomsan",
-            "no_cuda_asan",
-            "no_oss",
-        ],  # b/191025174, b/192521861
-        "passthrough.mlir": tf_cuda_tests_tags() + [
-            "nomsan",
-            "no_cuda_asan",
-            "no_cuda_asan",
-            "no_oss",
-        ],  # b/191025174, b/192521861
+        ],
     },
     test_file_exts = [
         "mlir",
@@ -44,6 +33,7 @@ filegroup(
     data = [
         "//tensorflow/compiler/mlir:tf-mlir-translate",
         "//tensorflow/compiler/mlir/xla:xla-opt",
+        "//tensorflow/compiler/mlir/xla:xla-opt-gpu",
         "@llvm-project//llvm:FileCheck",
         "@llvm-project//llvm:not",
     ],
diff --git a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/gpu_ops.mlir b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/gpu_ops.mlir
index 9a51120665fd60..43fb6b5263c415 100644
--- a/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/gpu_ops.mlir
+++ b/tensorflow/compiler/mlir/xla/tests/hlo_to_lhlo_with_xla/gpu_ops.mlir
@@ -1,4 +1,4 @@
-// RUN: xla-opt -split-input-file "-xla-hlo-to-lhlo-with-xla=platform=CUDA" %s | FileCheck %s
+// RUN: xla-opt-gpu -split-input-file "-xla-hlo-to-lhlo-with-xla=platform=CUDA" %s | FileCheck %s
 
 // CHECK-LABEL: func @main
 // CHECK-SAME: %[[ARG0:.*]]: memref<36xi8> {lmhlo.params = 0
diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-no-tf2xla-fallback.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-no-tf2xla-fallback.mlir
index 785cb2354935de..d87489d2dc8db6 100644
--- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-no-tf2xla-fallback.mlir
+++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-no-tf2xla-fallback.mlir
@@ -5683,7 +5683,7 @@ func @avgpool_grad_bf16(%grad: tensor<10x12x16x64xbf16>) -> tensor<10x24x32x64xb
 
 // CHECK-LABEL: xla_sharding
 func @xla_sharding(%arg0: tensor<4x16xf32>) -> tensor<4x16xf32> {
-  // CHECK-NEXT: "mhlo.custom_call"(%arg0) {api_version = 1 : i32, backend_config = "", call_target_name = "Sharding", has_side_effect = false, mhlo.sharding = ""}
+  // CHECK-NEXT: "mhlo.custom_call"(%arg0) {call_target_name = "Sharding", mhlo.sharding = ""}
   %0 = "tf.XlaSharding"(%arg0) {_XlaSharding = "", sharding = ""} : (tensor<4x16xf32>) -> tensor<4x16xf32>
   return %0 : tensor<4x16xf32>
 }
diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir
index d70b4ce1be3308..d0516cd184bfb8 100644
--- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir
+++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir
@@ -5797,7 +5797,7 @@ func @avgpool_grad_bf16(%grad: tensor<10x12x16x64xbf16>) -> tensor<10x24x32x64xb
 
 // CHECK-LABEL: xla_sharding
 func @xla_sharding(%arg0: tensor<4x16xf32>) -> tensor<4x16xf32> {
-  // CHECK-NEXT: "mhlo.custom_call"(%arg0) {api_version = 1 : i32, backend_config = "", call_target_name = "Sharding", has_side_effect = false, mhlo.sharding = ""}
+  // CHECK-NEXT: "mhlo.custom_call"(%arg0) {call_target_name = "Sharding", mhlo.sharding = ""}
   %0 = "tf.XlaSharding"(%arg0) {_XlaSharding = "", sharding = ""} : (tensor<4x16xf32>) -> tensor<4x16xf32>
   return %0 : tensor<4x16xf32>
 }
diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir
index 352fa309d2ac57..4e067bfe34fd79 100644
--- a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir
+++ b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir
@@ -1236,6 +1236,26 @@ func @main(%arg0: tensor<16x16xf32>) -> tensor<16x16xf32> {
 
 // -----
 
+// CHECK:  HloModule
+// CHECK: %[[FOO:.*]] ([[ARG0:.*]]: f32[2,3], [[ARG1:.*]]: f32[5,5]) -> f32[2,3]
+func @foo (%arg0: tensor<2x3xf32>, %arg1: tensor<5x5xf32>) -> tensor<2x3xf32> {
+  return %arg0 : tensor<2x3xf32>
+}
+
+// CHECK: ENTRY
+func @main(%arg0: tensor<2x3xf32>, %arg1: tensor<5x5xf32>) -> tensor<2x3xf32> {
+  // CHECK:  ROOT
+  // CHECK-SAME:  f32[2,3] custom-call
+  // CHECK-SAME:  called_computations={%[[FOO]]}
+  %0 = "mhlo.custom_call"(%arg0, %arg1) {
+    call_target_name = "foo",
+    called_computations = [@foo]
+  } : (tensor<2x3xf32>, tensor<5x5xf32>) -> tensor<2x3xf32>
+  return %0 : tensor<2x3xf32>
+}
+
+// -----
+
 // CHECK:  HloModule
 func @main(%arg0: tensor<2xcomplex<f32>>, %arg1: tensor<2xcomplex<f64>>) -> (tensor<2xf32>, tensor<2xf64>) {
   %0 = "mhlo.abs"(%arg0) : (tensor<2xcomplex<f32>>) -> (tensor<2xf32>)
diff --git a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt
index 9135962912af8f..81fe6ef7b39ce4 100644
--- a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt
+++ b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt
@@ -50,6 +50,21 @@ ENTRY %dummy_main (Arg_0.1: f32[]) -> f32[] {
   ROOT ag = f32[128,128] all-gather(input), channel_id=1, replica_groups={{0, 2, 4, 6}, {1, 3, 5, 7}}, dimensions={1}
 }
 
+// Test all-to-all
+
+// CHECK-LABEL:  func private @test_all_to_all
+// CHECK-SAME:  ([[ARG:%.*]]: tensor<2x2xi32>)
+%test_all_to_all {
+  %parameter = s32[2,2]{1,0} parameter(0)
+  // CHECK-NEXT: "mhlo.all_to_all"([[ARG]]) {
+  // CHECK-SAME:   concat_dimension = 1 : i64,
+  // CHECK-SAME{LITERAL}:   replica_groups = dense<[[1, 2], [3, 0]]> : tensor<2x2xi64>,
+  // CHECK-SAME:   split_count = 2 : i64,
+  // CHECK-SAME:   split_dimension = 1 : i64
+  // CHECK-SAME: } : (tensor<2x2xi32>) -> tensor<2x2xi32>
+  ROOT %all-to-all = s32[2,2]{1,0} all-to-all(s32[2,2]{1,0} %parameter), replica_groups={{1,2}, {3,0}}, dimensions={1}
+}
+
 // Test all-reduce
 add {
   lhs = f32[] parameter(0)
@@ -335,6 +350,31 @@ add {
   ROOT %custom-call = (f32[1,2,3]{0,2,1}, s32[3,7,9]{2,0,1}) custom-call(f32[2,3] %arg1, f32[5,5] %arg2), custom_call_target="foo", backend_config="bar", custom_call_has_side_effect=true, operand_layout_constraints={f32[2,3]{0,1}, f32[5,5]{1,0}}
 }
 
+// CHECK-LABEL: func private @custom_call_computation_0
+%custom_call_computation_0 (arg_1: s64[]) -> s64[] {
+  %arg_1 = s64[] parameter(0), metadata={op_name="HLO_Args"}
+  ROOT %compare.2 = s64[] add(%arg_1, %arg_1), metadata={op_type="Less" op_name="Less"}
+}
+
+// CHECK-LABEL: func private @custom_call_computation_1
+%custom_call_computation_1 (arg_1: s64[]) -> s64[] {
+  %arg_1 = s64[] parameter(0), metadata={op_name="HLO_Args"}
+  ROOT %compare.2 = s64[] add(%arg_1, %arg_1), metadata={op_type="Less" op_name="Less"}
+}
+
+// CHECK-LABEL:  func private @test_custom_call_with_computations
+// CHECK-SAME:  [[ARG_0:%.*]]: tensor<2x3xf32>, [[ARG_1:%.*]]: tensor<5x5xf32>) -> tensor<1x2x3xf32>
+%test_custom_call_with_computations (arg1: f32[2,3], arg2: f32[5,5]) -> f32[1,2,3] {
+  %arg1 = f32[2,3] parameter(0)
+  %arg2 = f32[5,5] parameter(1)
+  // CHECK:  "mhlo.custom_call"([[ARG_0]], [[ARG_1]]) {
+  // CHECK-SAME: api_version = 1 : i32
+  // CHECK-SAME: call_target_name = "foo"
+  // CHECK-SAME: called_computations = [@custom_call_computation_0, @custom_call_computation_1]
+  // CHECK-SAME: : (tensor<2x3xf32>, tensor<5x5xf32>) -> tensor<1x2x3xf32>
+  ROOT %custom-call = f32[1,2,3]{0,2,1} custom-call(f32[2,3] %arg1, f32[5,5] %arg2), custom_call_target="foo", called_computations={%custom_call_computation_0, %custom_call_computation_1}
+}
+
 // CHECK-LABEL:  func private @test_div(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor<4xf32>
 %test_div (Arg_0.1: f32[4], Arg_1.2: f32[4]) -> f32[4] {
   %Arg_0.1 = f32[4] parameter(0)
@@ -626,6 +666,17 @@ add {
   ROOT %outfeed.3 = token[] outfeed(s32[3] %Arg_0.1, token[] %Arg_1.2), outfeed_config="foobar"
 }
 
+// CHECK-LABEL:  func private @test_outfeed_with_sharding
+// CHECK-SAME: ([[DATA:%.*]]: tensor<3xi32>, [[TOKEN:%.*]]: !mhlo.token) -> !mhlo.token
+%test_outfeed_with_sharding (Arg_0.1: s32[3], Arg_1.2: token[]) -> token[] {
+  %Arg_0.1 = s32[3] parameter(0)
+  %Arg_1.2 = token[] parameter(1)
+  // CHECK-NEXT:  "mhlo.outfeed"([[DATA]], [[TOKEN]])
+  // CHECK-SAME: mhlo.sharding = "\08\03\1A\02\02\01\22\02\00\01"
+  // CHECK-SAME:  outfeed_config = "foobar"
+  ROOT %outfeed.3 = token[] outfeed(s32[3] %Arg_0.1, token[] %Arg_1.2), outfeed_config="foobar", sharding={devices=[2,1]0,1}
+}
+
 // CHECK-LABEL:  func private @test_pad(%arg0: tensor<4xf32>, %arg1: tensor<f32>) -> tensor<4xf32>
 %test_pad (Arg_0.1: f32[4], Arg_1.2: f32[]) -> f32[4] {
   %Arg_0.1 = f32[4] parameter(0)
@@ -756,6 +807,46 @@ add {
   ROOT %tuple.6 = ((f32[], f32[]), f32[]) tuple(%reduce.1, %sub.5)
 }
 
+// Test reduce-scatter
+
+%reduce_helper_add {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+// CHECK-LABEL:  func private @test_reduce_scatter
+// CHECK-SAME: ([[ARG0:%.*]]: tensor<4x8xf32>)
+%test_reduce_scatter {
+  input = f32[4,8] parameter(0)
+  // CHECK-NEXT: "mhlo.reduce_scatter"([[ARG0]]) ( {
+  // CHECK-NEXT:   ^bb0([[BARG0:%.*]]: tensor<f32>, [[BARG1:%.*]]: tensor<f32>):  // no predecessors
+  // CHECK-NEXT:     [[ADD:%.*]] = mhlo.add [[BARG0]], [[BARG1]] : tensor<f32>
+  // CHECK-NEXT:     "mhlo.return"([[ADD]])
+  // CHECK-NEXT:   }) {
+  // CHECK-SAME{LITERAL}:  replica_groups = dense<[[0, 1]]> : tensor<1x2xi64>
+  // CHECK-SAME:           scatter_dimension = 1 : i64
+  // CHECK-SAME: } : (tensor<4x8xf32>) -> tensor<4x4xf32>
+  ROOT ars = f32[4,4] reduce-scatter(input), replica_groups={{0,1}}, dimensions={1}, to_apply=reduce_helper_add
+}
+
+// CHECK-LABEL:  func private @test_reduce_scatter_with_channel
+// CHECK-SAME: ([[ARG0:%.*]]: tensor<4x8xf32>)
+%test_reduce_scatter_with_channel {
+  input = f32[4,8] parameter(0)
+  // CHECK-NEXT: "mhlo.reduce_scatter"([[ARG0]]) ( {
+  // CHECK-NEXT:   ^bb0([[BARG0:%.*]]: tensor<f32>, [[BARG1:%.*]]: tensor<f32>):  // no predecessors
+  // CHECK-NEXT:     [[ADD:%.*]] = mhlo.add [[BARG0]], [[BARG1]] : tensor<f32>
+  // CHECK-NEXT:     "mhlo.return"([[ADD]])
+  // CHECK-NEXT:   }) {
+  // CHECK-SAME:           channel_handle = {handle = 1 : i64, type = 0 : i64}
+  // CHECK-SAME{LITERAL}:  replica_groups = dense<[[0, 1]]> : tensor<1x2xi64>
+  // CHECK-SAME:           scatter_dimension = 1 : i64
+  // CHECK-SAME: } : (tensor<4x8xf32>) -> tensor<4x4xf32>
+  ROOT ars = f32[4,4] reduce-scatter(input), channel_id=1, replica_groups={{0,1}}, dimensions={1}, to_apply=reduce_helper_add
+}
+
+
 // CHECK-LABEL:  func private @test_reduce_window
 // CHECK-SAME: ([[ARG0:%.*]]: tensor<2x17x31x7xf32>, [[ARG1:%.*]]: tensor<f32>)
 %test_reduce_window (Arg_0.1: f32[2,17,31,7], Arg_1.2: f32[]) -> f32[2,5,8,7] {
diff --git a/tensorflow/compiler/mlir/xla/tests/translate/input_output_aliasing.mlir b/tensorflow/compiler/mlir/xla/tests/translate/input_output_aliasing.mlir
index 5e4b0c93a7ee88..69850730f6a604 100644
--- a/tensorflow/compiler/mlir/xla/tests/translate/input_output_aliasing.mlir
+++ b/tensorflow/compiler/mlir/xla/tests/translate/input_output_aliasing.mlir
@@ -1,10 +1,13 @@
 // RUN: tf-mlir-translate -mlir-hlo-to-hlo-text -emit-return-tuple %s | FileCheck %s
 // RUN: tf-mlir-translate -mlir-hlo-to-hlo-text -emit-use-tuple-args -emit-return-tuple %s | FileCheck %s --check-prefix=TUPLE-ARG
+// RUN: tf-mlir-translate -mlir-hlo-to-hlo-text  %s | FileCheck %s --check-prefix=NO-RETURN-TUPLE
 
 // CHECK-LABEL: ENTRY %main
 // CHECK: // OutputIndex {0} aliases with input 0 at {}
 // TUPLE-ARG-LABEL: ENTRY %main
 // TUPLE-ARG: // OutputIndex {0} aliases with input 0 at {0}
+// NO-RETURN-TUPLE-LABEL: ENTRY %main
+// NO-RETURN-TUPLE: // OutputIndex {} aliases with input 0 at {}
 func @main(%arg0: tensor<1xf32> {tf.aliasing_output = 0 : i64}) -> (tensor<1xf32>) {
   %0 = mhlo.constant dense<4.200000e+01> : tensor<1xf32>
   %1 = mhlo.add %arg0, %0 : tensor<1xf32>
diff --git a/tensorflow/compiler/mlir/xla/tests/translate/sharding.mlir b/tensorflow/compiler/mlir/xla/tests/translate/sharding.mlir
new file mode 100644
index 00000000000000..7c0d0f52d41dd4
--- /dev/null
+++ b/tensorflow/compiler/mlir/xla/tests/translate/sharding.mlir
@@ -0,0 +1,13 @@
+// RUN: tf-mlir-translate -mlir-hlo-to-hlo-text %s | FileCheck %s
+
+// CHECK-LABEL: ENTRY %main.{{.*}} (Arg_0.1: f32[], Arg_1.2: f32[4]) -> f32[4,4]
+func public @main(%arg0: tensor<f32> {mhlo.sharding = ""}, %arg1: tensor<4xf32> {mhlo.sharding = "\08\03\1A\01\02\22\02\00\01"}) -> (tensor<4x4xf32> {mhlo.sharding = "\08\03\1A\02\02\01\22\02\00\01"}) {
+  // CHECK-NEXT: %Arg_1.2 = f32[4] parameter(1), sharding={devices=[2]0,1}
+  // CHECK-NEXT: %Arg_0.1 = f32[] parameter(0), sharding={replicated}
+  %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<f32>) -> tensor<4xf32>
+  %1 = mhlo.multiply %arg1, %0 : tensor<4xf32>
+  %2 = "mhlo.broadcast_in_dim"(%1) {broadcast_dimensions = dense<0> : tensor<1xi64>} : (tensor<4xf32>) -> tensor<4x4xf32>
+  // CHECK: ROOT {{.*}}, sharding={devices=[2,1]0,1}
+  return %2 : tensor<4x4xf32>
+}
+
diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
index 7bf1e30c25e669..15fe3cd84680aa 100644
--- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
+++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc
@@ -3069,7 +3069,7 @@ class ConvertSelectOp : public OpRewritePattern<TF::SelectOp> {
         b.create<shape::AssumingOp>(ArrayRef<Type>{result_type}, assumption);
 
     OpBuilder::InsertionGuard guard(b);
-    b.createBlock(&assuming_op.doRegion());
+    b.createBlock(&assuming_op.getDoRegion());
 
     // Broadcast the cond if necessary.
     Value cond = op.condition();
@@ -3259,7 +3259,7 @@ static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc,
       lhs_type.getShape().drop_back(2), rhs_type.getShape().drop_back(2),
       result_batch_shape_compile_time_extents);
   auto result_batch_shape = rewriter->create<shape::BroadcastOp>(
-      loc, shape_type, lhs_splitted.head(), rhs_splitted.head(),
+      loc, shape_type, lhs_splitted.getHead(), rhs_splitted.getHead(),
       /*error=*/nullptr);
   // Lambda which handles the broadcasting of one side to the common
   // leading-batch dimensions.
@@ -3280,8 +3280,8 @@ static void BroadcastBatchMatMulV2Operands(Value lhs, Value rhs, Location loc,
     *out_side = rewriter->create<TF::BroadcastToOp>(loc, result_type, side,
                                                     shape_tensor);
   };
-  broadcast_one_side(lhs, lhs_type, lhs_splitted.tail(), out_lhs);
-  broadcast_one_side(rhs, rhs_type, rhs_splitted.tail(), out_rhs);
+  broadcast_one_side(lhs, lhs_type, lhs_splitted.getTail(), out_lhs);
+  broadcast_one_side(rhs, rhs_type, rhs_splitted.getTail(), out_rhs);
 }
 
 class ConvertBatchMatMulV2Op : public OpRewritePattern<TF::BatchMatMulV2Op> {
@@ -6053,15 +6053,12 @@ class ConvertXlaShardingOp : public OpRewritePattern<TF::XlaShardingOp> {
     // using a string.
     if (!op._XlaSharding().hasValue()) return failure();
 
-    mlir::ArrayAttr empty_layout_attr;
+    NamedAttribute call_target_name = rewriter.getNamedAttr(
+        "call_target_name", rewriter.getStringAttr("Sharding"));
+
     auto custom_call = rewriter.create<mhlo::CustomCallOp>(
         op.getLoc(), op.getType(), op.input(),
-        /*call_target_name=*/"Sharding",
-        /*has_side_effect=*/false,
-        /*backend_config=*/"",
-        /*api_version=*/mhlo::CustomCallApiVersion::API_VERSION_ORIGINAL,
-        /*operand_layouts=*/empty_layout_attr,
-        /*result_layouts=*/empty_layout_attr);
+        ArrayRef<NamedAttribute>{call_target_name});
     custom_call->setAttr(kShardingAttr, op._XlaShardingAttr());
     rewriter.replaceOp(op, custom_call.getResult(0));
 
@@ -6355,7 +6352,7 @@ class ConvertCumOp : public OpRewritePattern<OpT> {
                                       &rewriter);
 
     auto reduce = rewriter.create<ReduceWindowOp>(
-        op.getLoc(), input_type, input, init,
+        op.getLoc(), input.getType(), input, init,
         GetI64ElementsAttr(rewriter.getI64ArrayAttr(window_dims)),
         GetI64ElementsAttr(rewriter.getI64ArrayAttr(window_strides)),
         /*base_dilations=*/DenseIntElementsAttr(),
@@ -6374,7 +6371,7 @@ class ConvertCumOp : public OpRewritePattern<OpT> {
       low_padding[axis] = 1;
       high_padding[axis] = -1;
       result = rewriter.create<PadOp>(
-          op.getLoc(), op.getType(), result, init,
+          op.getLoc(), result.getType(), result, init,
           GetI64ElementsAttr(low_padding, &rewriter),
           GetI64ElementsAttr(high_padding, &rewriter),
           GetI64ElementsAttr(interior_padding, &rewriter));
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 554c5ef3d97e2c..5f58d451929c03 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -2084,16 +2084,12 @@ tf_xla_py_test(
     name = "repeat_op_test",
     size = "medium",
     srcs = ["repeat_op_test.py"],
-    # Where op doesn't yet support these backends.
-    disabled_backends = [
-        "cpu",
-        "cpu_ondemand",
-        "gpu",
-    ],
     enable_mlir_bridge = True,
     shard_count = 5,
     tags = [
-        "no_pip",  # TODO(b/149738646): fix pip install so these tests run on kokoro pip
+        "no_oss",
+        "no_pip",
+        "notap",  # TODO(b/208452143): Fix the test.
         "optonly",
     ],
     deps = [
@@ -2126,16 +2122,13 @@ tf_xla_py_test(
     name = "where_op_test",
     size = "small",
     srcs = ["where_op_test.py"],
-    disabled_backends = [
-        "cpu",
-        "cpu_ondemand",
-        "gpu",
-    ],
     enable_mlir_bridge = False,
+    tags = [
+        "no_pip",
+        "optonly",
+    ],
     deps = [
         ":xla_test",
-        "//tensorflow/compiler/jit",
-        "//tensorflow/contrib/tpu",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
diff --git a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
index e2f9da62c79cc2..8cc8a61353e29b 100644
--- a/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/compiler/tests/matrix_triangular_solve_op_test.py
@@ -28,8 +28,9 @@
 from tensorflow.python.platform import test
 
 
-def MakePlaceholder(x):
-  return array_ops.placeholder(dtypes.as_dtype(x.dtype), shape=x.shape)
+def MakePlaceholder(x, dtype=None):
+  return array_ops.placeholder(
+      dtypes.as_dtype(x.dtype) if dtype is None else dtype, shape=x.shape)
 
 
 class MatrixTriangularSolveOpTest(xla_test.XLATestCase):
@@ -51,12 +52,12 @@ def _VerifyTriangularSolveBase(self, sess, placeholder_a, placeholder_ca,
     broadcasted_b = b + np.zeros(shape=broadcasted_shape, dtype=b.dtype)
     self.assertAllClose(broadcasted_b, verification_np, atol=atol)
 
-  def _VerifyTriangularSolve(self, a, b, lower, adjoint, atol):
+  def _VerifyTriangularSolve(self, a, b, lower, adjoint, atol, dtype=None):
     clean_a = np.tril(a) if lower else np.triu(a)
     with self.session() as sess:
-      placeholder_a = MakePlaceholder(a)
-      placeholder_ca = MakePlaceholder(clean_a)
-      placeholder_b = MakePlaceholder(b)
+      placeholder_a = MakePlaceholder(a, dtype)
+      placeholder_ca = MakePlaceholder(clean_a, dtype)
+      placeholder_b = MakePlaceholder(b, dtype)
       with self.test_scope():
         x = linalg_ops.matrix_triangular_solve(
             placeholder_a, placeholder_b, lower=lower, adjoint=adjoint)
@@ -66,11 +67,11 @@ def _VerifyTriangularSolve(self, a, b, lower, adjoint, atol):
                                       placeholder_b, a, clean_a, b,
                                       verification, atol)
 
-  def _VerifyTriangularSolveCombo(self, a, b, atol=1e-4):
+  def _VerifyTriangularSolveCombo(self, a, b, atol=1e-4, dtype=None):
     transp = lambda x: np.swapaxes(x, -1, -2)
     for lower, adjoint in itertools.product([True, False], repeat=2):
       self._VerifyTriangularSolve(
-          a if lower else transp(a), b, lower, adjoint, atol)
+          a if lower else transp(a), b, lower, adjoint, atol, dtype=dtype)
 
   def testBasic(self):
     rng = np.random.RandomState(0)
@@ -79,6 +80,12 @@ def testBasic(self):
     for dtype in self.float_types:
       self._VerifyTriangularSolveCombo(a.astype(dtype), b.astype(dtype))
 
+  def testBfloat16(self):
+    rng = np.random.RandomState(0)
+    a = np.tril(rng.randn(5, 5))
+    b = rng.randn(5, 7)
+    self._VerifyTriangularSolveCombo(a, b, atol=5e-2, dtype=dtypes.bfloat16)
+
   def testBasicNotActuallyTriangular(self):
     rng = np.random.RandomState(0)
     a = rng.randn(5, 5)  # the `a` matrix is not lower-triangular
diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py
index bc10f4b38ff701..a2e967d8e9a20e 100644
--- a/tensorflow/compiler/tests/scan_ops_test.py
+++ b/tensorflow/compiler/tests/scan_ops_test.py
@@ -18,6 +18,7 @@
 
 from tensorflow.compiler.tests import xla_test
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -126,6 +127,14 @@ def test6D(self):
       for axis in range(-6, 6, 3):
         self._compareAll(x, axis)
 
+  def testMixedPrecision(self):
+    with self.session(), self.test_scope():
+      y = math_ops.cumsum(
+          constant_op.constant([1., 2., 3., 4.], dtypes.bfloat16),
+          -1,
+          exclusive=True).eval()
+    self.assertAllEqual(y, [0., 1., 3., 6.])
+
   @test_util.disable_mlir_bridge("Error handling")
   def testInvalidAxis(self):
     x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py
index 0ec7ae97fc38f1..8b4e38fed5aa66 100644
--- a/tensorflow/compiler/tests/xla_ops_test.py
+++ b/tensorflow/compiler/tests/xla_ops_test.py
@@ -20,6 +20,7 @@
 import numpy as np
 
 from tensorflow.compiler.tests import xla_test
+from tensorflow.compiler.tf2xla.ops import gen_xla_ops
 from tensorflow.compiler.tf2xla.python import xla
 from tensorflow.compiler.xla import xla_data_pb2
 from tensorflow.python.eager import def_function
@@ -385,7 +386,10 @@ def fn(x):
           args=(np.arange(12, dtype=np.int32).astype(dtype).reshape([3, 4]),),
           expected=np.array([0, 45, 120, 231], dtype=dtype))
 
-  def testVariadicReduceKahanSum(self):
+  IS_XLA_VARIADIC_REDUCE_V2 = [True, False]
+
+  @parameterized.parameters(IS_XLA_VARIADIC_REDUCE_V2)
+  def testVariadicReduceKahanSum(self, is_v2):
     for dtype in set(self.numeric_types).intersection(
         set([np.float32, np.complex64])):
 
@@ -405,56 +409,72 @@ def fn(x):
           reducer = kahan_sum_reducer.get_concrete_function(
               (arg, arg), (arg, arg))
 
-          return xla.variadic_reduce((x, array_ops.zeros_like(x)),
-                                     init_values=(arg, arg),
-                                     dimensions_to_reduce=dims,
-                                     reducer=reducer)[output_idx]
+          if is_v2:
+            return xla.variadic_reduce((x, array_ops.zeros_like(x)),
+                                       init_values=(arg, arg),
+                                       dimensions_to_reduce=dims,
+                                       reducer=reducer)[output_idx]
+          else:
+            return gen_xla_ops.xla_variadic_reduce((x, array_ops.zeros_like(x)),
+                                                   init_value=(arg, arg),
+                                                   dimensions_to_reduce=dims,
+                                                   reducer=reducer)[output_idx]
+
         return fn
 
       xs = np.array([1e5, np.pi, -1e5, np.exp(1.)])
       xs = np.array([xs, xs[::-1] / 3, xs / 7], dtype)
       self._assertOpOutputMatchesExpected(
-          kahan_sum_reduction(dims=[], output_idx=0),
-          args=(xs,), expected=xs)
+          kahan_sum_reduction(dims=[], output_idx=0), args=(xs,), expected=xs)
       self._assertOpOutputMatchesExpected(
           kahan_sum_reduction(dims=[], output_idx=1),
-          args=(xs,), expected=np.zeros_like(xs))
+          args=(xs,),
+          expected=np.zeros_like(xs))
       shuffle_indices = np.argsort(np.random.randn(xs.shape[0]))
       self._assertOpOutputMatchesExpected(
           kahan_sum_reduction(dims=[0], output_idx=0),
           args=(xs[shuffle_indices],),
-          expected=np.array([np.exp(1) / 3 + 1e5 * 8 / 7,
-                             np.pi * 8 / 7 - 1e5 / 3,
-                             -1e5 * 8 / 7 + np.pi / 3,
-                             np.exp(1) * 8 / 7 + 1e5 / 3], dtype=dtype))
+          expected=np.array([
+              np.exp(1) / 3 + 1e5 * 8 / 7, np.pi * 8 / 7 - 1e5 / 3,
+              -1e5 * 8 / 7 + np.pi / 3,
+              np.exp(1) * 8 / 7 + 1e5 / 3
+          ],
+                            dtype=dtype))
       error_term_equality = functools.partial(self.assertAllClose, atol=.005)
       self._assertOpOutputMatchesExpected(
           kahan_sum_reduction(dims=[0], output_idx=1),
-          args=(xs[shuffle_indices],), expected=np.zeros_like(xs[0]),
+          args=(xs[shuffle_indices],),
+          expected=np.zeros_like(xs[0]),
           equality_fn=error_term_equality)
       shuffle_indices = np.argsort(np.random.randn(xs.shape[1]))
       self._assertOpOutputMatchesExpected(
           kahan_sum_reduction(dims=[1], output_idx=0),
           args=(xs[:, shuffle_indices],),
-          expected=np.array([np.pi + np.exp(1.),
-                             (np.pi + np.exp(1.)) / 3,
-                             (np.pi + np.exp(1.)) / 7], dtype=dtype))
+          expected=np.array([
+              np.pi + np.exp(1.), (np.pi + np.exp(1.)) / 3,
+              (np.pi + np.exp(1.)) / 7
+          ],
+                            dtype=dtype))
       self._assertOpOutputMatchesExpected(
           kahan_sum_reduction(dims=[1], output_idx=1),
-          args=(xs[:, shuffle_indices],), expected=np.zeros_like(xs[:, 0]),
+          args=(xs[:, shuffle_indices],),
+          expected=np.zeros_like(xs[:, 0]),
           equality_fn=error_term_equality)
       # Now, shuffle both dims.
       xs = xs[np.argsort(np.random.randn(xs.shape[0]))]
       xs = xs[:, np.argsort(np.random.randn(xs.shape[1]))]
       self._assertOpOutputMatchesExpected(
           kahan_sum_reduction(dims=[0, 1], output_idx=0),
-          args=(xs,), expected=dtype((np.pi + np.exp(1.)) * 31 / 21))
+          args=(xs,),
+          expected=dtype((np.pi + np.exp(1.)) * 31 / 21))
       self._assertOpOutputMatchesExpected(
           kahan_sum_reduction(dims=[0, 1], output_idx=1),
-          args=(xs,), expected=dtype(0),
+          args=(xs,),
+          expected=dtype(0),
           equality_fn=error_term_equality)
 
-  def testVariadicReduceV2SingleOp(self):
+  @parameterized.parameters(IS_XLA_VARIADIC_REDUCE_V2)
+  def testVariadicReduceSingleOp(self, is_v2):
 
     @def_function.function
     def reducer_add(op_element, acc_val):
@@ -467,9 +487,19 @@ def reducer_add(op_element, acc_val):
       reducer_func = reducer_add.get_concrete_function(arg_spec, arg_spec)
 
       def reduce(values, *, dimensions_to_reduce):
-        return xla.variadic_reduce((values,), (init_val,),  # pylint: disable=cell-var-from-loop
-                                   dimensions_to_reduce=dimensions_to_reduce,
-                                   reducer=reducer_func)[0]  # pylint: disable=cell-var-from-loop
+        if is_v2:
+          return xla.variadic_reduce(
+              (values,),
+              (init_val,),  # pylint: disable=cell-var-from-loop
+              dimensions_to_reduce=dimensions_to_reduce,
+              reducer=reducer_func)[0]  # pylint: disable=cell-var-from-loop
+        else:
+          return gen_xla_ops.xla_variadic_reduce(
+              (values,),
+              (init_val,),  # pylint: disable=cell-var-from-loop
+              dimensions_to_reduce=dimensions_to_reduce,
+              reducer=reducer_func)[0]  # pylint: disable=cell-var-from-loop
+
       # Reduce dimension 0
       self._assertOpOutputMatchesExpected(
           functools.partial(reduce, dimensions_to_reduce=(0,)),
@@ -507,9 +537,14 @@ def reducer_add(op_element_1, op_element_2, acc_val_1, acc_val_2):
                                                        arg_spec_1, arg_spec_2)  # pylint: disable=cell-var-from-loop
 
       def reduce(*values, dimensions_to_reduce):
-        return xla.variadic_reduce(values, (init_val_1, init_val_2,),  # pylint: disable=cell-var-from-loop
-                                   dimensions_to_reduce=dimensions_to_reduce,
-                                   reducer=reducer_func)  # pylint: disable=cell-var-from-loop
+        return xla.variadic_reduce(
+            values,
+            (
+                init_val_1,  # pylint: disable=cell-var-from-loop
+                init_val_2,  # pylint: disable=cell-var-from-loop
+            ),
+            dimensions_to_reduce=dimensions_to_reduce,
+            reducer=reducer_func)  # pylint: disable=cell-var-from-loop
 
       # Reduce dimension 0
       self._assertOpOutputMatchesExpected(
diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD
index 3b6ffd17dd74d3..7a123af00ce888 100644
--- a/tensorflow/compiler/tf2tensorrt/BUILD
+++ b/tensorflow/compiler/tf2tensorrt/BUILD
@@ -20,7 +20,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
 # buildifier: disable=same-origin-load
-load("//tensorflow:tensorflow.bzl", "pybind_extension")
+load("//tensorflow:tensorflow.bzl", "pybind_ccsharedlib_extension")
 
 # buildifier: disable=same-origin-load
 load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs")
@@ -576,6 +576,7 @@ tf_cuda_library(
     srcs = [
         "convert/convert_graph.cc",
         "convert/convert_nodes.cc",
+        "convert/ops/data_format_vec_permute.cc",
         "convert/ops/slice_ops.cc",
         "convert/trt_optimization_pass.cc",
     ],
@@ -820,11 +821,49 @@ cc_library(
     ]),
 )
 
-pybind_extension(
+pybind_ccsharedlib_extension(
     name = "_pywrap_py_utils",
     srcs = ["utils/py_utils_wrapper.cc"],
     link_in_framework = True,
     module_name = "_pywrap_py_utils",
+    static_deps = [
+        # copybara:comment_begin(oss only)
+        "@bazel_tools//:__subpackages__",
+        "@boringssl//:__subpackages__",
+        "@com_github_googlecloudplatform_tensorflow_gcp_tools//:__subpackages__",
+        "@com_google_absl//:__subpackages__",
+        "@com_google_googleapis//:__subpackages__",
+        "@com_google_protobuf//:__subpackages__",
+        "@com_googlesource_code_re2//:__subpackages__",
+        "@curl//:__subpackages__",
+        "@double_conversion//:__subpackages__",
+        "@eigen_archive//:__subpackages__",
+        "@farmhash_archive//:__subpackages__",
+        "@fft2d//:__subpackages__",
+        "@gif//:__subpackages__",
+        "@highwayhash//:__subpackages__",
+        "@hwloc//:__subpackages__",
+        "@jsoncpp_git//:__subpackages__",
+        "@libjpeg_turbo//:__subpackages__",
+        "@libxsmm_archive//:__subpackages__",
+        "@llvm_openmp//:__subpackages__",
+        "@llvm-project//:__subpackages__",
+        "@llvm_terminfo//:__subpackages__",
+        "@llvm_zlib//:__subpackages__",
+        "@local_config_cuda//:__subpackages__",
+        "@local_config_git//:__subpackages__",
+        "@local_config_python//:__subpackages__",
+        "@local_config_rocm//:__subpackages__",
+        "@local_config_tensorrt//:__subpackages__",
+        "@local_execution_config_platform//:__subpackages__",
+        "@nsync//:__subpackages__",
+        "@platforms//:__subpackages__",
+        "@pybind11//:__subpackages__",
+        "@snappy//:__subpackages__",
+        "//:__subpackages__",
+        "@zlib//:__subpackages__",
+        # copybara:comment_end
+    ],
     deps = [
         ":common_utils",
         ":py_utils",
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
index 0c66daaffecee9..5cc123fdddc31c 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc
@@ -1411,6 +1411,19 @@ class TftrtAlgorithmSelector : public nvinfer1::IAlgorithmSelector {
 };
 #endif  // #if IS_TRT_VERSION_GE(7, 2, 3, 4)
 
+// Returns the value of TF_TRT_ABORT_CUDA_ENGINE_BUILD environment variable.
+// This variable can be used to abort CUDA engine construction, therefore it
+// provides a way to test and debug the native segment fallback of TF-TRT.
+bool AbortCudaEngineBuild() {
+  bool value;
+  Status status = ReadBoolFromEnvVar("TF_TRT_ABORT_CUDA_ENGINE_BUILD",
+                                     /*default_value=*/false, &value);
+  if (!status.ok()) {
+    LOG(ERROR) << status;
+  }
+  return value;
+}
+
 Status Converter::BuildCudaEngine(
     TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, int max_batch_size,
     size_t max_workspace_size_bytes, nvinfer1::IGpuAllocator* allocator,
@@ -1422,6 +1435,11 @@ Status Converter::BuildCudaEngine(
       },
       tensorflow::profiler::TraceMeLevel::kInfo);
 
+  if (AbortCudaEngineBuild()) {
+    return errors::Aborted(
+        "Engine creation aborted by TF_TRT_ABORT_CUDA_ENGINE_BUILD variable");
+  }
+
   VLOG(1) << "Configuring TensorRT builder";
   trt_builder_->setMaxBatchSize(max_batch_size);
   trt_builder_->setGpuAllocator(allocator);
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
index be333c5975918c..d1db78ce7fd2c6 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc
@@ -1763,6 +1763,8 @@ class OpConverter_FP32_FP16_Test : public ParameterizedOpConverterTestBase {};
 // Base class for tests that need to be tested for FP32, FP16, and INT32
 class OpConverter_FP32_FP16_INT32_Test
     : public ParameterizedOpConverterTestBase {};
+// Base class for tests that need to be tested for INT32
+class OpConverter_INT32_Test : public ParameterizedOpConverterTestBase {};
 
 // Instantiate parameter combinations to OpConverter_<DT_X...>_Test
 INSTANTIATE_TEST_CASE_P(
@@ -1783,6 +1785,12 @@ INSTANTIATE_TEST_CASE_P(
                        ::testing::Values(DT_FLOAT, DT_HALF, DT_INT32),
                        ::testing::Values(TrtPrecisionMode::FP32)));
 
+INSTANTIATE_TEST_CASE_P(
+    OpConvTestInstantiation, OpConverter_INT32_Test,
+    ::testing::Combine(::testing::ValuesIn(ValidTrtModes),
+                       ::testing::Values(DT_INT32),
+                       ::testing::Values(TrtPrecisionMode::FP32)));
+
 template <typename T>
 void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField<T>* out) {
   out->Clear();
@@ -5920,6 +5928,180 @@ TEST_P(OpConverter_FP32_FP16_Test, ConvertTopK) {
   }
 }
 
+struct DataFormatVecPermuteTestParams {
+  string dst_format;
+  string src_format;
+  std::vector<int> x_shape;
+  std::vector<int> x;
+  bool x_is_tensor;
+  std::vector<int> expected_output;
+  Status conversion_status;
+};
+
+NodeDef GetDataFormatVecPermuteNodeDef(string dst_format, string src_format,
+                                       std::vector<int>& x_shape) {
+  Scope s = Scope::NewRootScope();
+  PartialTensorShape tensor_shape;
+  auto x = ops::Placeholder(s.WithOpName("x"), DT_INT32);
+  const auto attrs = ops::DataFormatVecPermute::Attrs()
+                         .DstFormat(dst_format)
+                         .SrcFormat(src_format);
+  auto dfvp = ops::DataFormatVecPermute(s.WithOpName("my_dfvp"), x, attrs);
+  return dfvp.operation.node()->def();
+}
+
+TEST_P(OpConverter_INT32_Test, ConvertDataFormatVecPermute) {
+  Status implicit_error = Status{
+      error::UNIMPLEMENTED, "Implicit batch mode not supported, at my_dfvp"};
+
+  std::vector<DataFormatVecPermuteTestParams> test_params = {
+      // 1D case with tensor.
+      DataFormatVecPermuteTestParams{
+          /*dst_format=*/"NCHW",
+          /*src_format=*/"NHWC",
+          /*x_shape=*/{4},
+          /*x=*/{1, 2, 3, 4},
+          /*x_is_tensor=*/true,
+          /*expected_output=*/{1, 4, 2, 3},
+          /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch
+              ? implicit_error
+              : Status::OK()},
+      // 1D case with weights.
+      DataFormatVecPermuteTestParams{
+          /*dst_format=*/"NCHW",
+          /*src_format=*/"NHWC",
+          /*x_shape=*/{4},
+          /*x=*/{1, 2, 3, 4},
+          /*x_is_tensor=*/false,
+          /*expected_output=*/{1, 4, 2, 3},
+          /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch
+              ? implicit_error
+              : Status::OK()},
+      // 2D case with tensor.
+      DataFormatVecPermuteTestParams{
+          /*dst_format=*/"NCHW",
+          /*src_format=*/"NHWC",
+          /*x_shape=*/{4, 2},
+          /*x=*/{1, 2, 3, 4, 5, 6, 7, 8},
+          /*x_is_tensor=*/true,
+          /*expected_output=*/{1, 2, 7, 8, 3, 4, 5, 6},
+          /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch
+              ? implicit_error
+              : Status::OK()},
+      // 2D case with weights.
+      DataFormatVecPermuteTestParams{
+          /*dst_format=*/"NCHW",
+          /*src_format=*/"NHWC",
+          /*x_shape=*/{4, 2},
+          /*x=*/{1, 2, 3, 4, 5, 6, 7, 8},
+          /*x_is_tensor=*/false,
+          /*expected_output=*/{1, 2, 7, 8, 3, 4, 5, 6},
+          /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch
+              ? implicit_error
+              : Status::OK()},
+      // Format of size 5.
+      DataFormatVecPermuteTestParams{
+          /*dst_format=*/"NCDHW",
+          /*src_format=*/"NDHWC",
+          /*x_shape=*/{5, 2},
+          /*x=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
+          /*x_is_tensor=*/true,
+          /*expected_output=*/{1, 2, 9, 10, 3, 4, 5, 6, 7, 8},
+          /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch
+              ? implicit_error
+              : Status::OK()},
+      // Input of size 2: treat the elements as spatial dimensions.
+      DataFormatVecPermuteTestParams{
+          /*dst_format=*/"NCWH",
+          /*src_format=*/"NHWC",
+          /*x_shape=*/{2, 2},
+          /*x=*/{1, 2, 3, 4},
+          /*x_is_tensor=*/true,
+          /*expected_output=*/{3, 4, 1, 2},
+          /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch
+              ? implicit_error
+              : Status::OK()},
+      // Input of size 3: treat the elements as spatial dimensions.
+      DataFormatVecPermuteTestParams{
+          /*dst_format=*/"NCHWD",
+          /*src_format=*/"NDHWC",
+          /*x_shape=*/{3},
+          /*x=*/{1, 2, 3},
+          /*x_is_tensor=*/true,
+          /*expected_output=*/{2, 3, 1},
+          /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch
+              ? implicit_error
+              : Status::OK()},
+      // Invalid rank, should fail.
+      DataFormatVecPermuteTestParams{
+          /*dst_format=*/"NCHW",
+          /*src_format=*/"NHWC",
+          /*x_shape=*/{2, 2, 2},
+          /*x=*/{1, 2, 3, 4, 5, 6, 7, 8},
+          /*x_is_tensor=*/true,
+          /*expected_output=*/{},
+          /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch
+              ? implicit_error
+              : Status{error::INVALID_ARGUMENT,
+                       "Input must be a vector or matrix, but got rank 3, at "
+                       "my_dfvp"}},
+      // Invalid size for 1D input, should fail.
+      DataFormatVecPermuteTestParams{
+          /*dst_format=*/"NCHW",
+          /*src_format=*/"NHWC",
+          /*x_shape=*/{3},
+          /*x=*/{1, 2, 3},
+          /*x_is_tensor=*/true,
+          /*expected_output=*/{},
+          /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch
+              ? implicit_error
+              : Status{error::INVALID_ARGUMENT,
+                       "1D input must be of size 2 or 4, but got size 3, at "
+                       "my_dfvp"}},
+      // Invalid first dim for 2D input, should fail.
+      DataFormatVecPermuteTestParams{
+          /*dst_format=*/"NCDHW",
+          /*src_format=*/"NDHWC",
+          /*x_shape=*/{4, 2},
+          /*x=*/{1, 2, 3, 4, 5, 6, 7, 8},
+          /*x_is_tensor=*/true,
+          /*expected_output=*/{},
+          /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch
+              ? implicit_error
+              : Status{error::INVALID_ARGUMENT,
+                       "First dimension of 2D input must be of size 3 or 5, "
+                       "but got shape (4, 2), at my_dfvp"}},
+      // Invalid second dim for 2D input, should fail.
+      DataFormatVecPermuteTestParams{
+          /*dst_format=*/"NCHW",
+          /*src_format=*/"NHWC",
+          /*x_shape=*/{4, 3},
+          /*x=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
+          /*x_is_tensor=*/true,
+          /*expected_output=*/{},
+          /*conversion_status=*/trt_mode_ == TrtTestMode::kImplicitBatch
+              ? implicit_error
+              : Status{error::INVALID_ARGUMENT,
+                       "Second dimension of 2D input must be of size 2, but "
+                       "got shape (4, 3), at my_dfvp"}},
+  };
+
+  for (auto p : test_params) {
+    Reset();
+    const NodeDef node_def =
+        GetDataFormatVecPermuteNodeDef(p.dst_format, p.src_format, p.x_shape);
+
+    if (p.x_is_tensor) {
+      AddTestTensor("x", p.x_shape, DT_INT32, p.x, p.x_shape);
+    } else {
+      AddTestWeights("x", p.x_shape, p.x, DT_INT32);
+    }
+
+    TestOpConverter("my_dfvp", node_def, p.x_shape, p.conversion_status,
+                    Status::OK(), ElementsAreArray(p.expected_output));
+  }
+}
+
 TEST_P(OpConverter_FP32_FP16_INT32_Test, ConvertGather) {
   // Get the NodeDef for GatherV2.
   Scope s = Scope::NewRootScope();
diff --git a/tensorflow/compiler/tf2tensorrt/convert/ops/data_format_vec_permute.cc b/tensorflow/compiler/tf2tensorrt/convert/ops/data_format_vec_permute.cc
new file mode 100644
index 00000000000000..773c771e5c4b15
--- /dev/null
+++ b/tensorflow/compiler/tf2tensorrt/convert/ops/data_format_vec_permute.cc
@@ -0,0 +1,185 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#if GOOGLE_CUDA && GOOGLE_TENSORRT
+#include "tensorflow/compiler/tf2tensorrt/common/utils.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/op_converter.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/op_converter_registry.h"
+#include "tensorflow/compiler/tf2tensorrt/convert/utils.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "third_party/tensorrt/NvInfer.h"
+#include "third_party/tensorrt/NvInferRuntimeCommon.h"
+
+namespace tensorflow {
+namespace tensorrt {
+namespace convert {
+
+int get_spatial_dim_count(string format) {
+  // Spatial dimensions are the dimensions besides NC, and here we assume NC
+  // always appear in the format string.
+  return format.size() - 2;
+}
+
+class ConvertDataFormatVecPermute
+    : public OpConverterBase<ConvertDataFormatVecPermute> {
+ public:
+  ConvertDataFormatVecPermute(OpConverterParams* params)
+      : OpConverterBase<ConvertDataFormatVecPermute>(params) {}
+
+  struct DataFormatVecPermuteAttributes {
+    string dst_format;
+    string src_format;
+    int x_dim_count;
+  };
+
+  static constexpr std::array<InputArgSpec, 1> InputSpec() {
+    return {InputArgSpec::Create("x", TrtInputArg::kBoth)};
+  }
+
+  static constexpr std::array<DataType, 1> AllowedDataTypes() {
+    return {DataType::DT_INT32};
+  }
+
+  Status Validate() {
+    const auto& inputs = params_->inputs;
+    const auto& node_def = params_->node_def;
+
+    if (params_->use_implicit_batch) {
+      return errors::Unimplemented("Implicit batch mode not supported, at ",
+                                   node_def.name());
+    }
+
+    x_input_ = inputs.at(0);
+
+    // Check input rank.
+    const auto x_dims = x_input_.GetTrtDims();
+    int input_rank = x_dims.nbDims;
+    if (input_rank != 1 && input_rank != 2) {
+      return errors::InvalidArgument(
+          "Input must be a vector or matrix, but got rank ", input_rank,
+          ", at ", node_def.name());
+    }
+
+    // Verify and consume node attributes.
+    StatusOr<string> dst_format = GetAttrValue<string>("dst_format");
+    StatusOr<string> src_format = GetAttrValue<string>("src_format");
+    TRT_ENSURE_OK(dst_format);
+    TRT_ENSURE_OK(src_format);
+
+    // Check input dims.
+    const int full_dim_count = src_format->size();
+    const int spatial_dim_count = get_spatial_dim_count(*src_format);
+    if (input_rank == 1) {
+      if (x_dims.d[0] != spatial_dim_count && x_dims.d[0] != full_dim_count) {
+        return errors::InvalidArgument("1D input must be of size ",
+                                       spatial_dim_count, " or ",
+                                       full_dim_count, ", but got size ",
+                                       x_dims.d[0], ", at ", node_def.name());
+      }
+    } else if (input_rank == 2) {
+      if (x_dims.d[0] != spatial_dim_count && x_dims.d[0] != full_dim_count) {
+        return errors::InvalidArgument(
+            "First dimension of 2D input must be of size ", spatial_dim_count,
+            " or ", full_dim_count, ", but got shape (", x_dims.d[0], ", ",
+            x_dims.d[1], "), at ", node_def.name());
+      }
+      if (x_dims.d[1] != 2) {
+        return errors::InvalidArgument(
+            "Second dimension of 2D input must be of size 2, but got shape (",
+            x_dims.d[0], ", ", x_dims.d[1], "), at ", node_def.name());
+      }
+    }
+
+    // Set custom attributes.
+    attrs_.x_dim_count = x_dims.d[0];
+    attrs_.dst_format = *dst_format;
+    attrs_.src_format = *src_format;
+
+    return Status::OK();
+  }
+
+  Status Convert() {
+    const auto& node_def = params_->node_def;
+
+    // Copy format strings in case they need to be modified.
+    string dst_format = attrs_.dst_format;
+    string src_format = attrs_.src_format;
+    const int& spatial_dim_count = get_spatial_dim_count(src_format);
+
+    // If the input is a vector of size spatial_dim_count, treat the elements
+    // as spatial dimensions.
+    if (attrs_.x_dim_count == spatial_dim_count) {
+      auto keep_only_spatial_dimensions =
+          [spatial_dim_count](string* format_str) -> void {
+        auto new_end = std::remove_if(format_str->begin(), format_str->end(),
+                                      [spatial_dim_count](const char dim) {
+                                        return dim == 'N' || dim == 'C';
+                                      });
+        format_str->erase(new_end, format_str->end());
+      };
+      keep_only_spatial_dimensions(&src_format);
+      keep_only_spatial_dimensions(&dst_format);
+    }
+
+    // Create indices for the gather layer and make weights out of them.
+    std::vector<int32> dst_indices(attrs_.x_dim_count);
+    for (int i = 0; i < attrs_.x_dim_count; ++i) {
+      for (int j = 0; j < attrs_.x_dim_count; ++j) {
+        if (src_format[i] == dst_format[j]) {
+          dst_indices[j] = i;
+          break;
+        }
+      }
+    }
+    nvinfer1::Dims indices_dims = {1, {attrs_.x_dim_count}};
+    auto indices_weights = params_->weight_store->GetTempWeights(
+        nvinfer1::DataType::kINT32, indices_dims);
+    int32* indices_ptr = indices_weights.GetPointer<int32>();
+    std::copy(dst_indices.data(), dst_indices.data() + attrs_.x_dim_count,
+              indices_ptr);
+
+    ITensorProxyPtr x_tensor =
+        x_input_.is_weights() ? params_->converter->CreateConstantLayer(
+                                    x_input_.weights(), x_input_.GetTrtDims())
+                              : x_input_.tensor();
+    ITensorProxyPtr indices_tensor =
+        params_->converter->CreateConstantLayer(indices_weights, indices_dims);
+
+    // Gather layer with 1D indices on axis 0, conserves shape.
+    nvinfer1::IGatherLayer* layer = params_->converter->network()->addGather(
+        *x_tensor->trt_tensor(), *indices_tensor->trt_tensor(), 0);
+    TRT_ENSURE(layer);
+    params_->converter->SetLayerName(layer, node_def);
+
+    ITensorProxyPtr output_tensor = layer->getOutput(0);
+
+    params_->outputs->push_back(TRT_TensorOrWeights(output_tensor));
+    return Status::OK();
+  }
+
+ private:
+  TRT_TensorOrWeights x_input_;
+  DataFormatVecPermuteAttributes attrs_{};
+};
+REGISTER_DEFAULT_TRT_OP_CONVERTER(
+    MakeConverterFunction<ConvertDataFormatVecPermute>(),
+    {"DataFormatVecPermute"});
+
+}  // namespace convert
+}  // namespace tensorrt
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA && GOOGLE_TENSORRT
diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc
index 50f9841dfb0b80..407e6389aab836 100644
--- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc
+++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc
@@ -75,7 +75,11 @@ string DebugString(const nvinfer1::Permutation& permutation, int len) {
 }
 
 string DebugString(const ITensorProxyPtr& tensor) {
-  return DebugString(*tensor->trt_tensor());
+  return StrCat(
+      tensor->is_trt_tensor() ? "nvinfer1::ITensor(@" : "SimpleItensor(@",
+      reinterpret_cast<uintptr_t>(&tensor), ", name=", tensor->getName(),
+      ", dtype=", DebugString(tensor->getType()),
+      ", dims=", DebugString(tensor->getDimensions()), ")");
 }
 
 string DebugString(const nvinfer1::ITensor& tensor) {
diff --git a/tensorflow/compiler/tf2tensorrt/segment/segment.cc b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
index 0f2aa0a736ebaf..419d71c377809e 100644
--- a/tensorflow/compiler/tf2tensorrt/segment/segment.cc
+++ b/tensorflow/compiler/tf2tensorrt/segment/segment.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <algorithm>
 #include <map>
 #include <queue>
+#include <tuple>
 #include <unordered_map>
 #include <utility>
 
@@ -680,8 +681,8 @@ void AddSegmentForNode(const grappler::GraphProperties* graph_properties,
 
 }  // namespace
 
-string GenerateUnconversionReport(
-    std::map<string, std::map<string, int>>& unconverted_ops_map) {
+string GenerateNonConversionReport(
+    std::map<string, std::map<string, int>>& nonconverted_ops_map) {
   // Fetch whether to print a detailed version of the TF-TRT conversion report.
   bool show_detailed_conversion_report;
   TF_CHECK_OK(ReadBoolFromEnvVar("TF_TRT_SHOW_DETAILED_REPORT",
@@ -690,60 +691,62 @@ string GenerateUnconversionReport(
 
   string unsupported_op_report =
       StrCat("\n\n", string(80, '#'), "\n",
-             "TensorRT unsupported/unconverted OP Report:");
-  int total_unconverted_ops{0};
+             "TensorRT unsupported/non-converted OP Report:");
+  int total_nonconverted_ops{0};
 
-  using OPCounterVector = std::vector<std::pair<string, int>>;
+  // <Reason, Count for this reason>
+  using ReasonCounterVector = std::vector<std::pair<string, int>>;
+  // <OP Name, Total Non-Converted for OP, <Reason, Count for this reason>>>
+  using NotConvertedOPTuple = std::tuple<string, int, ReasonCounterVector>;
 
-  // Copying map data into a vector for descending sorting.
-  // <OP Name, <Total Unconverted for OP, <Reason, Total for this reason>>>
-  std::vector<std::pair<std::string, std::pair<int, OPCounterVector>>>
-      unconverted_ops_vec;
+  std::vector<NotConvertedOPTuple> nonconverted_ops_vec;
 
   // Populate the vector from the map
-  for (auto& it1 : unconverted_ops_map) {
-    int total_unconverted_op{0};
-    OPCounterVector reason_occurances_vect;
+  for (auto& nonconverted_op_data : nonconverted_ops_map) {
+    int total_nonconverted_op{0};
+    ReasonCounterVector reason_occurances_vect;
 
-    for (auto& it2 : it1.second) {
-      total_unconverted_op += it2.second;
-      reason_occurances_vect.push_back(it2);
+    auto op_name = nonconverted_op_data.first;
+    auto op_data = nonconverted_op_data.second;
+
+    for (auto& notconversion_reason_data : op_data) {
+      auto reason_count = notconversion_reason_data.second;
+      total_nonconverted_op += reason_count;
+      reason_occurances_vect.push_back(notconversion_reason_data);
     }
 
     // Sort in descending number of occurances for the reasons why a given
     // TensorFlow OP was not converted.
     std::sort(reason_occurances_vect.begin(), reason_occurances_vect.end(),
-              [](const std::pair<std::string, int>& a,
-                 const std::pair<std::string, int>& b) -> bool {
+              [](const std::pair<string, int>& a,
+                 const std::pair<string, int>& b) -> bool {
                 return a.second > b.second;
               });
 
-    unconverted_ops_vec.push_back(std::make_pair(
-        it1.first,
-        std::make_pair(total_unconverted_op, reason_occurances_vect)));
+    nonconverted_ops_vec.push_back(std::make_tuple(
+        op_name, total_nonconverted_op, reason_occurances_vect));
   }
 
   // Sort the vector by descending OP names.
-  std::sort(
-      unconverted_ops_vec.begin(), unconverted_ops_vec.end(),
-      [](const std::pair<std::string, std::pair<int, OPCounterVector>>& a,
-         const std::pair<std::string, std::pair<int, OPCounterVector>>& b) {
-        return a.second.first > b.second.first;
-      });
-
-  for (auto& it1 : unconverted_ops_vec) {
-    auto& op_name = it1.first;
-    auto& op_total_unconverted = it1.second.first;
-    total_unconverted_ops += op_total_unconverted;
+  std::sort(nonconverted_ops_vec.begin(), nonconverted_ops_vec.end(),
+            [](const NotConvertedOPTuple& a, const NotConvertedOPTuple& b) {
+              return std::get<1>(a) > std::get<1>(b);
+            });
+
+  for (auto& notconverted_op_detail : nonconverted_ops_vec) {
+    auto& op_name = std::get<0>(notconverted_op_detail);
+    auto& op_total_nonconverted = std::get<1>(notconverted_op_detail);
+    total_nonconverted_ops += op_total_nonconverted;
 
     unsupported_op_report = StrCat(unsupported_op_report, "\n\t- ", op_name,
-                                   " -> ", op_total_unconverted, "x");
+                                   " -> ", op_total_nonconverted, "x");
 
     if (show_detailed_conversion_report) {
-      auto& op_unconversion_details = it1.second.second;
-      for (auto& it2 : op_unconversion_details) {
-        auto& reason = it2.first;
-        auto& reason_count = it2.second;
+      auto& nonconverted_ops_details = std::get<2>(notconverted_op_detail);
+
+      for (auto& nonconversion_details : nonconverted_ops_details) {
+        auto& reason = nonconversion_details.first;
+        auto& reason_count = nonconversion_details.second;
         if (reason_count == 0) {
           continue;
         }
@@ -757,8 +760,8 @@ string GenerateUnconversionReport(
 
   unsupported_op_report =
       StrCat(unsupported_op_report, "\n", string(80, '-'),
-             "\n\t- Total unconverted OPs: ", total_unconverted_ops,
-             "\n\t- Total unconverted OP Types: ", unconverted_ops_map.size(),
+             "\n\t- Total nonconverted OPs: ", total_nonconverted_ops,
+             "\n\t- Total nonconverted OP Types: ", nonconverted_ops_map.size(),
              "\nFor more information see https://docs.nvidia.com/deeplearning",
              "/frameworks/tf-trt-user-guide/index.html#supported-ops.", "\n",
              string(80, '#'), "\n");
@@ -815,7 +818,7 @@ Status SegmentGraph(const Graph* tf_graph,
   // segment. A node value of nullptr indicates that the node is not a candidate
   // for TRT.
 
-  std::map<string, std::map<string, int>> unconverted_ops_map = {};
+  std::map<string, std::map<string, int>> nonconverted_ops_map = {};
 
   // Parsing each node of the graph
   std::vector<UnionFind<SimpleNode*>> node_segments;
@@ -834,7 +837,7 @@ Status SegmentGraph(const Graph* tf_graph,
               << "(Op type: " << node_op_type << "), "
               << "(Op name: " << node->name() << "), "
               << "(Reason: " << reason << ")";
-      unconverted_ops_map[node_op_type][std::string(reason)]++;
+      nonconverted_ops_map[node_op_type][string(reason)]++;
       node = nullptr;
     };
     absl::optional<DeviceNameUtils::ParsedName> device_name =
@@ -878,7 +881,7 @@ Status SegmentGraph(const Graph* tf_graph,
                       options.use_implicit_batch);
   }
 
-  LOG(WARNING) << GenerateUnconversionReport(unconverted_ops_map);
+  LOG(WARNING) << GenerateNonConversionReport(nonconverted_ops_map);
 
   // The segmentation algorithm below visits nodes in reverse topological order
   // and attempts to merge nodes along output edges. That means that subgraphs
diff --git a/tensorflow/compiler/tf2xla/kernels/where_op.cc b/tensorflow/compiler/tf2xla/kernels/where_op.cc
index 0a67de8b09d99d..8f8c4e5c95e171 100644
--- a/tensorflow/compiler/tf2xla/kernels/where_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/where_op.cc
@@ -78,12 +78,6 @@ class WhereOp : public XlaOpKernel {
     xla::XlaOp length = xla::ReduceAll(
         compared_int, xla::Zero(ctx->builder(), xla::S32),
         xla::CreateScalarAddComputation(xla::S32, ctx->builder()));
-    StatusOr<xla::XlaOp> rebounded_result = xla::SetDimensionSizeWithRebound(
-        &ctx->value_inference(), result, length, 0);
-    if (rebounded_result.ok()) {
-      ctx->SetOutput(0, *rebounded_result);
-      return;
-    }
     // TODO(b/207187072): Remove special handling once dynamic reshape can also
     // be handled.
     xla::XlaOp bounded_result = xla::SetDimensionSize(result, length, 0);
@@ -91,7 +85,7 @@ class WhereOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Where").Device(DEVICE_TPU_XLA_JIT), WhereOp);
+REGISTER_XLA_OP(Name("Where"), WhereOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
index ea0e506625ce55..8badf38bb9931c 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_conv_op.cc
@@ -40,6 +40,7 @@ class XlaConvOp : public XlaOpKernel {
                 precision_config_.ParsePartialFromString(precision_config_attr),
                 errors::InvalidArgument("Error parsing precision config."));
     preferred_element_type_ = absl::nullopt;
+    batch_group_count_ = 1;
   }
 
   void Compile(XlaOpKernelContext* context) override {
@@ -79,12 +80,13 @@ class XlaConvOp : public XlaOpKernel {
     xla::XlaOp output = xla::ConvGeneralDilated(
         context->Input(0), context->Input(1), window_strides, padding,
         lhs_dilation, rhs_dilation, dnums_, feature_group_count,
-        /*batch_group_count=*/1, &precision_config_, preferred_element_type_);
+        batch_group_count_, &precision_config_, preferred_element_type_);
     context->SetOutput(0, output);
   }
 
  protected:
   absl::optional<xla::PrimitiveType> preferred_element_type_;
+  int64_t batch_group_count_;
 
  private:
   xla::ConvolutionDimensionNumbers dnums_;
@@ -111,6 +113,9 @@ class XlaConvV2Op : public XlaConvOp {
     OP_REQUIRES_OK(context, DataTypeToPrimitiveType(preferred_element_dtype,
                                                     &preferred_element_type));
     preferred_element_type_ = preferred_element_type;
+
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("batch_group_count", &batch_group_count_));
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc
index c0e70c706b48f4..c2af7888215494 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_reduce_op.cc
@@ -132,7 +132,7 @@ class XlaReduceOp : public XlaOpKernel {
 };
 
 REGISTER_XLA_OP(Name("XlaReduce"), XlaReduceOp);
-REGISTER_XLA_OP(Name("XlaVariadicReduce"), XlaReduceOp);
+REGISTER_XLA_OP(Name("XlaVariadicReduce"), MlirXlaOpKernel);
 REGISTER_XLA_OP(Name("XlaVariadicReduceV2"), MlirXlaOpKernel);
 
 }  // namespace
diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
index 9f611ec0a86f6d..6a89be5d99664c 100644
--- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc
+++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc
@@ -180,6 +180,7 @@ REGISTER_OP("XlaConvV2")
     .Attr("dimension_numbers: string")
     .Attr("precision_config: string")
     .Attr("preferred_element_type: numbertype")
+    .Attr("batch_group_count: int = 1")
     .Output("output: preferred_element_type")
     .SetShapeFn(UnchangedRank)
     .Doc(R"doc(
@@ -187,16 +188,17 @@ Wraps the XLA ConvGeneralDilated operator, documented at
  https://www.tensorflow.org/performance/xla/operation_semantics#conv_convolution
 .
 
-lhs: the input tensor
-rhs: the kernel tensor
-window_strides: the inter-window strides
-padding: the padding to apply at the start and end of each input dimensions
+lhs: input tensor
+rhs: kernel tensor
+window_strides: inter-window strides
+padding: padding to apply at the start and end of each input dimensions
 lhs_dilation: dilation to apply between input elements
 rhs_dilation: dilation to apply between kernel elements
 feature_group_count: number of feature groups for grouped convolution.
-dimension_numbers: a serialized xla::ConvolutionDimensionNumbers proto.
-precision_config: a serialized xla::PrecisionConfig proto.
-preferred_element_type: The type of the tensor.
+dimension_numbers: serialized xla::ConvolutionDimensionNumbers proto.
+precision_config: serialized xla::PrecisionConfig proto.
+preferred_element_type: type of the tensor.
+batch_group_count: number of batch groups or grouped filters.
 )doc");
 
 static Status XlaDotShapeFunction(shape_inference::InferenceContext* c) {
diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py
index 68a12765419c49..ccd48cb68b9868 100644
--- a/tensorflow/compiler/tf2xla/python/xla.py
+++ b/tensorflow/compiler/tf2xla/python/xla.py
@@ -250,7 +250,8 @@ def conv(lhs,
          precision_config=None,
          preferred_element_type=None,
          name=None,
-         use_v2=False):
+         use_v2=False,
+         batch_group_count=1):
   """Wraps the XLA ConvGeneralDilated operator.
 
   ConvGeneralDilated is the most general form of XLA convolution and is
@@ -270,6 +271,7 @@ def conv(lhs,
     preferred_element_type: the result `dtype`.
     name: an optional name for the operator.
     use_v2: an optional request to use the XlaConvV2 op even if not necessary.
+    batch_group_count: number of batch groups or grouped filters.
 
   Returns:
     A tensor representing the output of the convolution.
@@ -277,7 +279,9 @@ def conv(lhs,
   precision_config_proto = ""
   if precision_config:
     precision_config_proto = precision_config.SerializeToString()
-  needs_v2 = preferred_element_type or (lhs.dtype != rhs.dtype)
+  needs_v2 = (
+      preferred_element_type or (lhs.dtype != rhs.dtype) or
+      batch_group_count > 1)
   if preferred_element_type is None:
     preferred_element_type = np_utils.result_type(lhs.dtype, rhs.dtype)
   if needs_v2 or use_v2:
@@ -289,6 +293,7 @@ def conv(lhs,
         lhs_dilation=lhs_dilation,
         rhs_dilation=rhs_dilation,
         feature_group_count=feature_group_count,
+        batch_group_count=batch_group_count,
         dimension_numbers=dimension_numbers.SerializeToString(),
         precision_config=precision_config_proto,
         preferred_element_type=preferred_element_type,
diff --git a/tensorflow/compiler/xla/client/lib/approx_topk.cc b/tensorflow/compiler/xla/client/lib/approx_topk.cc
index 0ed4f00fec174a..40cd48a704c9ba 100644
--- a/tensorflow/compiler/xla/client/lib/approx_topk.cc
+++ b/tensorflow/compiler/xla/client/lib/approx_topk.cc
@@ -260,6 +260,12 @@ StatusOr<std::pair<int64_t, int64_t>> ApproxTopKReductionOutputSize(
     return InvalidArgument("recall_target should range in (0,1]");
   }
 
+  // Need to handle 1.0 explicitly, otherwise we would encounter division by
+  // log(1.0) = 0 issue.
+  if (recall_target == 1.0) {
+    return std::pair<int64_t, int64_t>(input_size, 0);
+  }
+
   if (input_size_override >= 0) {
     if (input_size > input_size_override) {
       return InvalidArgument(
@@ -286,8 +292,10 @@ StatusOr<std::pair<int64_t, int64_t>> ApproxTopKReductionOutputSize(
   //
   //   => M = (1 - K)/LOG(recall)
   uint64_t m = std::min<uint64_t>(
-      std::max(static_cast<uint64_t>((1.0 - top_k) / std::log(recall_target)),
-               tpu_tiling),
+      std::max(
+          static_cast<uint64_t>((1.0 - top_k) /
+                                std::log(static_cast<double>(recall_target))),
+          tpu_tiling),
       input_size);
   uint32_t log2_reduction = log2_floor(logical_input_size / m);
   if (log2_reduction == 0) {
diff --git a/tensorflow/compiler/xla/client/value_inference.cc b/tensorflow/compiler/xla/client/value_inference.cc
index 28d3d9b950ceb2..21b80d18e18248 100644
--- a/tensorflow/compiler/xla/client/value_inference.cc
+++ b/tensorflow/compiler/xla/client/value_inference.cc
@@ -103,8 +103,9 @@ Literal CreateGarbageLiteral(const Shape& reference_shape) {
 // HloProtoEvaluator evaluates an hlo proto and returns a literal. The user has
 // to provide operand as literals through the get_operand function.
 struct HloProtoEvaluator {
-  explicit HloProtoEvaluator(HloInstructionProto inst)
-      : inst(std::move(inst)),
+  explicit HloProtoEvaluator(HloEvaluator& evaluator, HloInstructionProto inst)
+      : evaluator(evaluator),
+        inst(std::move(inst)),
         module("EmptyModuleForEvaluation", HloModuleConfig()) {}
 
   // WithOpCode changes the called computation of the instruction being
@@ -181,7 +182,6 @@ struct HloProtoEvaluator {
     builder.AddInstruction(std::move(new_instruction));
     auto computation = builder.Build();
     module.AddEntryComputation(std::move(computation));
-    HloEvaluator evaluator;
     if (shape_index.empty()) {
       return evaluator.Evaluate(module.entry_computation()->root_instruction());
     } else {
@@ -192,6 +192,7 @@ struct HloProtoEvaluator {
     }
   }
 
+  HloEvaluator& evaluator;
   HloInstructionProto inst;
 
   HloModule module;
@@ -334,9 +335,11 @@ using HandleToInstruction =
 using HandleToComputation = std::function<const HloComputationProto*(int64_t)>;
 
 struct PostorderDFSVisitor {
-  PostorderDFSVisitor(HandleToInstruction handle_to_instruction,
+  PostorderDFSVisitor(HloEvaluator& evaluator,
+                      HandleToInstruction handle_to_instruction,
                       HandleToComputation handle_to_computation)
-      : handle_to_instruction(handle_to_instruction),
+      : evaluator(evaluator),
+        handle_to_instruction(handle_to_instruction),
         handle_to_computation(handle_to_computation) {}
 
   StatusOr<PostorderDFSNode> AnalyzeUpperBound(int64_t handle,
@@ -436,6 +439,7 @@ struct PostorderDFSVisitor {
     }
   };
 
+  HloEvaluator& evaluator;
   absl::flat_hash_map<CacheKey, Literal> evaluated;
   HandleToInstruction handle_to_instruction;
   HandleToComputation handle_to_computation;
@@ -599,12 +603,12 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstantValueFallback(
       const HloComputationProto* computation_proto =
           handle_to_computation(root->called_computation_ids(0));
       return result.AddVisit(
-          [root, computation_proto,
-           context](absl::Span<Literal> operands) -> StatusOr<Literal> {
+          [root, computation_proto, context,
+           this](absl::Span<Literal> operands) -> StatusOr<Literal> {
             TF_ASSIGN_OR_RETURN(
                 auto computation,
                 HloComputation::CreateFromProto(*computation_proto, {}));
-            return HloProtoEvaluator(*root)
+            return HloProtoEvaluator(evaluator, *root)
                 .WithOperands(operands)
                 .WithComputation(std::move(computation))
                 .WithSubshape(context.shape_index)
@@ -623,8 +627,10 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstantValueFallback(
                            tuple_operand_context)
             .AddVisit([](Literal operand) { return operand; });
       }
-      return result.AddVisit([root](absl::Span<Literal> operands) {
-        return HloProtoEvaluator(*root).WithOperands(operands).Evaluate();
+      return result.AddVisit([root, this](absl::Span<Literal> operands) {
+        return HloProtoEvaluator(evaluator, *root)
+            .WithOperands(operands)
+            .Evaluate();
       });
     }
   }
@@ -662,9 +668,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
                          PostorderDFSNodeType::kConstantLowerBound, context)
           .AddDependency(root->operand_ids(0),
                          PostorderDFSNodeType::kConstantUpperBound, context)
-          .AddVisit([](Literal lower_bound,
-                       Literal upper_bound) -> StatusOr<Literal> {
-            HloEvaluator evaluator;
+          .AddVisit([this](Literal lower_bound,
+                           Literal upper_bound) -> StatusOr<Literal> {
             TF_ASSIGN_OR_RETURN(auto lower_bound_abs,
                                 evaluator.EvaluateElementwiseUnaryOp(
                                     HloOpcode::kAbs, lower_bound));
@@ -717,8 +722,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
       return PostorderDFSNode()
           .AddDependency(root->operand_ids(0),
                          PostorderDFSNodeType::kConstantLowerBound, context)
-          .AddVisit([](Literal lower_bound) -> StatusOr<Literal> {
-            HloEvaluator evaluator;
+          .AddVisit([this](Literal lower_bound) -> StatusOr<Literal> {
             return evaluator.EvaluateElementwiseUnaryOp(HloOpcode::kNegate,
                                                         lower_bound);
           });
@@ -741,7 +745,6 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
               // at compile time, we set the bound defer the check to
               // runtime. In those cases we use the upper-bound of
               // first operand as a placeholder.
-              HloEvaluator evaluator;
               auto zero = LiteralUtil::Zero(lower_bound.shape().element_type());
               zero = zero.Broadcast(lower_bound.shape(), {}).ValueOrDie();
               TF_ASSIGN_OR_RETURN(
@@ -759,7 +762,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeUpperBound(
             std::vector<Literal> new_operands;
             new_operands.emplace_back(std::move(upper_bound));
             new_operands.emplace_back(std::move(lower_bound));
-            return HloProtoEvaluator(*root)
+            return HloProtoEvaluator(evaluator, *root)
                 .WithOperands(absl::MakeSpan(new_operands))
                 .Evaluate();
           });
@@ -826,9 +829,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeLowerBound(
                          PostorderDFSNodeType::kConstantLowerBound, context)
           .AddDependency(root->operand_ids(0),
                          PostorderDFSNodeType::kConstantUpperBound, context)
-          .AddVisit([](Literal lower_bound,
-                       Literal upper_bound) -> StatusOr<Literal> {
-            HloEvaluator evaluator;
+          .AddVisit([this](Literal lower_bound,
+                           Literal upper_bound) -> StatusOr<Literal> {
             TF_ASSIGN_OR_RETURN(auto lower_bound_abs,
                                 evaluator.EvaluateElementwiseUnaryOp(
                                     HloOpcode::kAbs, lower_bound));
@@ -844,8 +846,7 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeLowerBound(
       return PostorderDFSNode()
           .AddDependency(root->operand_ids(0),
                          PostorderDFSNodeType::kConstantUpperBound, context)
-          .AddVisit([](Literal upper_bound) -> StatusOr<Literal> {
-            HloEvaluator evaluator;
+          .AddVisit([this](Literal upper_bound) -> StatusOr<Literal> {
             return evaluator.EvaluateElementwiseUnaryOp(HloOpcode::kNegate,
                                                         upper_bound);
           });
@@ -858,9 +859,12 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeLowerBound(
                          PostorderDFSNodeType::kConstantLowerBound, context)
           .AddDependency(root->operand_ids(1),
                          PostorderDFSNodeType::kConstantUpperBound, context)
-          .AddVisit([root](absl::Span<Literal> operands) -> StatusOr<Literal> {
-            return HloProtoEvaluator(*root).WithOperands(operands).Evaluate();
-          });
+          .AddVisit(
+              [root, this](absl::Span<Literal> operands) -> StatusOr<Literal> {
+                return HloProtoEvaluator(evaluator, *root)
+                    .WithOperands(operands)
+                    .Evaluate();
+              });
     }
     default:
       return AnalyzeConstantValueFallback(
@@ -909,8 +913,10 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstant(
                              context);
       }
       return result.AddVisit(
-          [root](absl::Span<Literal> operands) -> StatusOr<Literal> {
-            return HloProtoEvaluator(*root).WithOperands(operands).Evaluate();
+          [root, this](absl::Span<Literal> operands) -> StatusOr<Literal> {
+            return HloProtoEvaluator(evaluator, *root)
+                .WithOperands(operands)
+                .Evaluate();
           });
     }
     case HloOpcode::kCustomCall: {
@@ -948,12 +954,12 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeConstant(
       const HloComputationProto* computation_proto =
           handle_to_computation(root->called_computation_ids(0));
       return result.AddVisit(
-          [root, context, computation_proto](
-              absl::Span<Literal> operands) -> StatusOr<Literal> {
+          [root, context, computation_proto,
+           this](absl::Span<Literal> operands) -> StatusOr<Literal> {
             TF_ASSIGN_OR_RETURN(
                 auto computation,
                 HloComputation::CreateFromProto(*computation_proto, {}));
-            return HloProtoEvaluator(*root)
+            return HloProtoEvaluator(evaluator, *root)
                 .WithOperands(operands)
                 .WithComputation(std::move(computation))
                 .WithSubshape(context.shape_index)
@@ -1115,8 +1121,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
     case HloOpcode::kShiftLeft:
     case HloOpcode::kShiftRightArithmetic:
     case HloOpcode::kShiftRightLogical: {
-      return result.AddVisit([root](absl::Span<Literal> operands) {
-        return HloProtoEvaluator(*root)
+      return result.AddVisit([root, this](absl::Span<Literal> operands) {
+        return HloProtoEvaluator(evaluator, *root)
             .WithOperands(operands)
             .WithPrimitiveType(PRED)
             .WithOpCode(HloOpcode::kOr)
@@ -1142,8 +1148,8 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
                            tuple_operand_context)
             .AddVisit([](Literal operand) { return operand; });
       }
-      return result.AddVisit([root](absl::Span<Literal> operands) {
-        return HloProtoEvaluator(*root)
+      return result.AddVisit([root, this](absl::Span<Literal> operands) {
+        return HloProtoEvaluator(evaluator, *root)
             .WithOperands(operands)
             .WithPrimitiveType(PRED)
             .Evaluate();
@@ -1266,56 +1272,57 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
     }
 
     case HloOpcode::kReduce: {
-      return result.AddVisit([root, context](absl::Span<Literal> operands) {
-        Shape root_shape = Shape(root->shape());
-        Shape scalar_shape = ShapeUtil::MakeScalarShape(xla::PRED);
-        std::unique_ptr<HloComputation> reduce_or;
-        if (root_shape.IsTuple()) {
-          // Variadic reduce.
-          HloComputation::Builder b("reduce_or");
-          // Assuming all operands interact with each other. This could be
-          // overly conservative.  If needed, a dataflow analysis could be
-          // performed in the future.
-          //
-          // The value starts with `false` (static) and will be `or`ed with all
-          // operands's dynamism.
-          auto accum = b.AddInstruction(HloInstruction::CreateConstant(
-              LiteralUtil::CreateR0<bool>(false)));
-
-          for (int i = 0; i < root_shape.tuple_shapes_size(); ++i) {
-            auto lhs = b.AddInstruction(
-                HloInstruction::CreateParameter(i, scalar_shape, "lhs"));
-            auto rhs = b.AddInstruction(HloInstruction::CreateParameter(
-                i + root_shape.tuple_shapes_size(), scalar_shape, "rhs"));
-            accum = b.AddInstruction(HloInstruction::CreateBinary(
-                scalar_shape, HloOpcode::kOr, accum, lhs));
-            accum = b.AddInstruction(HloInstruction::CreateBinary(
-                scalar_shape, HloOpcode::kOr, accum, rhs));
-          }
-          // `Broadcast` the result to all positions in the result.
-          std::vector<HloInstruction*> results(root_shape.tuple_shapes_size(),
-                                               accum);
-          b.AddInstruction(HloInstruction::CreateTuple(results));
-          reduce_or = b.Build();
-        } else {
-          HloComputation::Builder b("reduce_or");
-          auto lhs = b.AddInstruction(
-              HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
-          auto rhs = b.AddInstruction(
-              HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
-          b.AddInstruction(HloInstruction::CreateBinary(
-              scalar_shape, HloOpcode::kOr, lhs, rhs));
-          reduce_or = b.Build();
-        }
+      return result.AddVisit(
+          [root, context, this](absl::Span<Literal> operands) {
+            Shape root_shape = Shape(root->shape());
+            Shape scalar_shape = ShapeUtil::MakeScalarShape(xla::PRED);
+            std::unique_ptr<HloComputation> reduce_or;
+            if (root_shape.IsTuple()) {
+              // Variadic reduce.
+              HloComputation::Builder b("reduce_or");
+              // Assuming all operands interact with each other. This could be
+              // overly conservative.  If needed, a dataflow analysis could be
+              // performed in the future.
+              //
+              // The value starts with `false` (static) and will be `or`ed with
+              // all operands's dynamism.
+              auto accum = b.AddInstruction(HloInstruction::CreateConstant(
+                  LiteralUtil::CreateR0<bool>(false)));
+
+              for (int i = 0; i < root_shape.tuple_shapes_size(); ++i) {
+                auto lhs = b.AddInstruction(
+                    HloInstruction::CreateParameter(i, scalar_shape, "lhs"));
+                auto rhs = b.AddInstruction(HloInstruction::CreateParameter(
+                    i + root_shape.tuple_shapes_size(), scalar_shape, "rhs"));
+                accum = b.AddInstruction(HloInstruction::CreateBinary(
+                    scalar_shape, HloOpcode::kOr, accum, lhs));
+                accum = b.AddInstruction(HloInstruction::CreateBinary(
+                    scalar_shape, HloOpcode::kOr, accum, rhs));
+              }
+              // `Broadcast` the result to all positions in the result.
+              std::vector<HloInstruction*> results(
+                  root_shape.tuple_shapes_size(), accum);
+              b.AddInstruction(HloInstruction::CreateTuple(results));
+              reduce_or = b.Build();
+            } else {
+              HloComputation::Builder b("reduce_or");
+              auto lhs = b.AddInstruction(
+                  HloInstruction::CreateParameter(0, scalar_shape, "lhs"));
+              auto rhs = b.AddInstruction(
+                  HloInstruction::CreateParameter(1, scalar_shape, "rhs"));
+              b.AddInstruction(HloInstruction::CreateBinary(
+                  scalar_shape, HloOpcode::kOr, lhs, rhs));
+              reduce_or = b.Build();
+            }
 
-        return HloProtoEvaluator(*root)
-            .WithOperands(operands)
-            .WithPrimitiveType(PRED)
-            .WithComputation(std::move(reduce_or))
-            // Reduce could produce tuple shape, only fetch what we need.
-            .WithSubshape(context.shape_index)
-            .Evaluate();
-      });
+            return HloProtoEvaluator(evaluator, *root)
+                .WithOperands(operands)
+                .WithPrimitiveType(PRED)
+                .WithComputation(std::move(reduce_or))
+                // Reduce could produce tuple shape, only fetch what we need.
+                .WithSubshape(context.shape_index)
+                .Evaluate();
+          });
     }
     case HloOpcode::kConstant:
     case HloOpcode::kIota: {
@@ -1382,24 +1389,25 @@ StatusOr<PostorderDFSNode> PostorderDFSVisitor::AnalyzeIsDynamic(
                          PostorderDFSNodeType::kConstantValue, context)
           .AddDependency(root->operand_ids(1),
                          PostorderDFSNodeType::kValueIsDynamic, context)
-          .AddVisit([root](absl::Span<Literal> operands) -> StatusOr<Literal> {
-            OptionalLiteral optional_selector_literal(std::move(operands[1]),
-                                                      std::move(operands[2]));
-
-            if (!optional_selector_literal.AllValid()) {
-              // Conservatively assume results are dynamic.
-              return CreatePredLiteral(true, Shape(root->shape()));
-            }
-            std::vector<Literal> new_operands;
-            new_operands.emplace_back(std::move(operands[0]));
-            new_operands.emplace_back(
-                optional_selector_literal.GetValue()->Clone());
+          .AddVisit(
+              [root, this](absl::Span<Literal> operands) -> StatusOr<Literal> {
+                OptionalLiteral optional_selector_literal(
+                    std::move(operands[1]), std::move(operands[2]));
+
+                if (!optional_selector_literal.AllValid()) {
+                  // Conservatively assume results are dynamic.
+                  return CreatePredLiteral(true, Shape(root->shape()));
+                }
+                std::vector<Literal> new_operands;
+                new_operands.emplace_back(std::move(operands[0]));
+                new_operands.emplace_back(
+                    optional_selector_literal.GetValue()->Clone());
 
-            return HloProtoEvaluator(*root)
-                .WithOperands(absl::MakeSpan(new_operands))
-                .WithPrimitiveType(PRED)
-                .Evaluate();
-          });
+                return HloProtoEvaluator(evaluator, *root)
+                    .WithOperands(absl::MakeSpan(new_operands))
+                    .WithPrimitiveType(PRED)
+                    .Evaluate();
+              });
     }
     case HloOpcode::kCustomCall: {
       if (root->custom_call_target() == "SetBound") {
@@ -1474,7 +1482,7 @@ StatusOr<Literal> PostorderDFSVisitor::PostOrderDFSVisit(
     VisitState state;
     Visit visit;  // The handler to call once the dependencies are resolved into
                   // literal form.
-    int64_t id;     // Unique id in the work queue, starting from 0.
+    int64_t id;   // Unique id in the work queue, starting from 0.
     std::vector<CacheKey> dependencies;
 
     CacheKey GetCacheKey() { return CacheKey(handle, context, type); }
@@ -1568,6 +1576,7 @@ StatusOr<Literal> PostorderDFSVisitor::PostOrderDFSVisit(
 
 StatusOr<Literal> ValueInference::AnalyzeIsDynamic(XlaOp op) {
   PostorderDFSVisitor visitor(
+      evaluator_,
       [&](int64_t handle) {
         return builder_->LookUpInstructionByHandle(handle);
       },
@@ -1632,7 +1641,7 @@ StatusOr<Literal> ValueInference::SimplifyOp(int64_t handle) {
       }
       // We put handles into the tensor and evaluate the results into a literal.
       // The literal also contain handles for each element position.
-      return HloProtoEvaluator(*inst)
+      return HloProtoEvaluator(evaluator_, *inst)
           .WithOperands(absl::MakeSpan(operands))
           .WithPrimitiveType(S64)
           .Evaluate();
@@ -1737,6 +1746,7 @@ StatusOr<OptionalLiteral> ValueInference::AnalyzeConstant(
     XlaOp op, ValueInferenceMode mode) {
   TF_RETURN_IF_ERROR(builder_->LookUpInstructionByHandle(op.handle()).status());
   PostorderDFSVisitor visitor(
+      evaluator_,
       [&](int64_t handle) {
         return builder_->LookUpInstructionByHandle(handle);
       },
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index ad0003d7102b63..da8425fc8e9c2a 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -474,4 +474,52 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
   return hash_value;
 }
 
+/*static*/ int64_t LayoutUtil::LinearIndex(const Shape& shape,
+                                           absl::Span<const int64_t> indices) {
+  CHECK(shape.IsArray());
+  CHECK(shape.has_layout());
+  const int rank = shape.rank();
+  CHECK_EQ(rank, indices.size());
+
+  if (rank == 0) {
+    return 0;
+  }
+  if (rank == 1) {
+    return indices[0];
+  }
+
+  Tile tile = {};
+  if (!shape.layout().tiles().empty()) {
+    tile = shape.layout().tiles()[0];
+  }
+
+  int64_t linear_index = 0;
+  int64_t tile_multiplier = 1;
+  // Initialize to number of elements in a tile.
+  for (int64_t i : tile.dimensions()) {
+    tile_multiplier *= i;
+  }
+  int64_t within_tile_multiplier = 1;
+
+  // We only look at the top-level tile.
+  for (int64_t minor = 0; minor < rank; minor++) {
+    int64_t logical_dim = Minor(shape.layout(), minor);
+    int64_t shape_dim_size = shape.dimensions(logical_dim);
+    int64_t index = indices[logical_dim];
+
+    if (minor < tile.dimensions().size()) {
+      int64_t tile_dim_size =
+          tile.dimensions()[tile.dimensions().size() - 1 - minor];
+      linear_index += tile_multiplier * (index / tile_dim_size) +
+                      within_tile_multiplier * (index % tile_dim_size);
+      tile_multiplier *= CeilOfRatio(shape_dim_size, tile_dim_size);
+      within_tile_multiplier *= tile_dim_size;
+    } else {
+      linear_index += index * tile_multiplier;
+      tile_multiplier *= shape_dim_size;
+    }
+  }
+  return linear_index;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h
index e7c3094e4fefe2..8d7a686e7f9da6 100644
--- a/tensorflow/compiler/xla/layout_util.h
+++ b/tensorflow/compiler/xla/layout_util.h
@@ -188,6 +188,14 @@ class LayoutUtil {
   // Compute a hash for `layout`.
   static size_t Hash(const Layout& layout);
 
+  // Returns the linearized index of the cell at the given indices. The unit
+  // of the offset is in elements of the shape.
+  //
+  // NOTE: this method only uses the top-level tile and disregards the sub-tile
+  // in the layout. This method is also performance critical.
+  static int64_t LinearIndex(const Shape& shape,
+                             absl::Span<const int64_t> indices);
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(LayoutUtil);
 };
diff --git a/tensorflow/compiler/xla/pjrt/BUILD b/tensorflow/compiler/xla/pjrt/BUILD
index 3ef4bcf124d22a..0fec20649caf94 100644
--- a/tensorflow/compiler/xla/pjrt/BUILD
+++ b/tensorflow/compiler/xla/pjrt/BUILD
@@ -384,6 +384,7 @@ cc_library(
     name = "mlir_to_hlo",
     srcs = ["mlir_to_hlo.cc"],
     hdrs = ["mlir_to_hlo.h"],
+    visibility = [":friends"],
     deps = [
         "//tensorflow/compiler/mlir/hlo",
         "//tensorflow/compiler/mlir/hlo:all_passes",
diff --git a/tensorflow/compiler/xla/pjrt/mlir_to_hlo.cc b/tensorflow/compiler/xla/pjrt/mlir_to_hlo.cc
index ec676de11cdcf1..f5da49bf3be51f 100644
--- a/tensorflow/compiler/xla/pjrt/mlir_to_hlo.cc
+++ b/tensorflow/compiler/xla/pjrt/mlir_to_hlo.cc
@@ -15,11 +15,16 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/pjrt/mlir_to_hlo.h"
 
+#include <utility>
+
 #include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
+#include "mlir/Parser.h"  // from @llvm-project
 #include "mlir/Pass/Pass.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
 #include "mlir/Transforms/Passes.h"  // from @llvm-project
+#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h"
+#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
 #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h"
 #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h"
 #include "tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h"
@@ -59,4 +64,35 @@ Status MlirToXlaComputation(mlir::ModuleOp module,
   return Status::OK();
 }
 
+StatusOr<mlir::OwningModuleRef> ParseMlirModuleString(
+    absl::string_view mlir_module_str, mlir::MLIRContext& context) {
+  mlir::OwningModuleRef module;
+  context.loadDialect<mlir::StandardOpsDialect>();
+  context.loadDialect<mlir::mhlo::MhloDialect>();
+  context.loadDialect<mlir::chlo::HloClientDialect>();
+  mlir::StatusScopedDiagnosticHandler diagnostic_handler(&context);
+  module = mlir::parseSourceString(
+      llvm::StringRef(mlir_module_str.data(), mlir_module_str.size()),
+      &context);
+  if (!module) {
+    return diagnostic_handler.ConsumeStatus();
+  }
+  if (failed(module->verify())) {
+    VLOG(1) << "MLIR verification failed.";
+    module->dump();
+    return diagnostic_handler.ConsumeStatus();
+  }
+  return std::move(module);
+}
+
+Status ParseMlirModuleStringAndConvertToXlaComputation(
+    absl::string_view mlir_module_str, XlaComputation& xla_computation,
+    bool use_tuple_args, bool return_tuple) {
+  mlir::MLIRContext context;
+  TF_ASSIGN_OR_RETURN(mlir::OwningModuleRef module,
+                      xla::ParseMlirModuleString(mlir_module_str, context));
+  return xla::MlirToXlaComputation(*module, xla_computation, use_tuple_args,
+                                   return_tuple);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/pjrt/mlir_to_hlo.h b/tensorflow/compiler/xla/pjrt/mlir_to_hlo.h
index f3fe37a4673739..a742ed4716efb2 100644
--- a/tensorflow/compiler/xla/pjrt/mlir_to_hlo.h
+++ b/tensorflow/compiler/xla/pjrt/mlir_to_hlo.h
@@ -23,11 +23,20 @@ limitations under the License.
 
 namespace xla {
 
+// Converts an MHLO/CHLO module string to an mlir::Module.
+StatusOr<mlir::OwningModuleRef> ParseMlirModuleString(
+    absl::string_view mlir_module_str, mlir::MLIRContext& context);
+
 // Converts an CHLO/MHLO module to XLA HLO.
 Status MlirToXlaComputation(mlir::ModuleOp module,
                             XlaComputation& xla_computation,
                             bool use_tuple_args, bool return_tuple);
 
+// Converts an MHLO/CHLO module string to an XLA computation.
+Status ParseMlirModuleStringAndConvertToXlaComputation(
+    absl::string_view mlir_module_str, XlaComputation& xla_computation,
+    bool use_tuple_args, bool return_tuple);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_PJRT_MLIR_TO_HLO_H_
diff --git a/tensorflow/compiler/xla/pjrt/tpu_client.cc b/tensorflow/compiler/xla/pjrt/tpu_client.cc
index f2307d24ef4570..55c14aa28fa500 100644
--- a/tensorflow/compiler/xla/pjrt/tpu_client.cc
+++ b/tensorflow/compiler/xla/pjrt/tpu_client.cc
@@ -99,7 +99,21 @@ PjRtTpuClient::PjRtTpuClient(
         return absl::StrCat(
             "libtpu version ", absl::StrJoin(version.version, "."), "\n",
             absl::string_view(version.metadata, version.metadata_size));
-      }()) {}
+      }()) {
+  // We always initialize the tpu client even if libtpu isn't linked in or
+  // initialized.
+  if (tf_tpu::ExecutorApiFn()->TpuAsyncCollectiveOffloadHelper_InitFn !=
+      nullptr) {
+    tf_tpu::ExecutorApiFn()->TpuAsyncCollectiveOffloadHelper_InitFn();
+  }
+}
+
+PjRtTpuClient::~PjRtTpuClient() {
+  if (tf_tpu::ExecutorApiFn()->TpuAsyncCollectiveOffloadHelper_ShutdownFn !=
+      nullptr) {
+    tf_tpu::ExecutorApiFn()->TpuAsyncCollectiveOffloadHelper_ShutdownFn();
+  }
+}
 
 StatusOr<DeviceAssignment> PjRtTpuClient::GetDefaultDeviceAssignment(
     int num_replicas, int num_partitions) const {
diff --git a/tensorflow/compiler/xla/pjrt/tpu_client.h b/tensorflow/compiler/xla/pjrt/tpu_client.h
index f56521d8bef7f9..d1df79c0ed7b92 100644
--- a/tensorflow/compiler/xla/pjrt/tpu_client.h
+++ b/tensorflow/compiler/xla/pjrt/tpu_client.h
@@ -57,6 +57,7 @@ class PjRtTpuClient : public PjRtStreamExecutorClient {
   PjRtTpuClient(LocalClient* client,
                 std::vector<std::unique_ptr<PjRtStreamExecutorDevice>> devices,
                 int process_index);
+  ~PjRtTpuClient() override;
 
   absl::string_view platform_version() const override {
     return platform_version_;
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index 0c53882888f8a4..4734eca21eee39 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -237,7 +237,9 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto_cc",
         "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/pjrt:mlir_to_hlo",
         "//tensorflow/compiler/xla/pjrt:pjrt_client",
+        "//tensorflow/compiler/xla/pjrt:pjrt_stream_executor_client",
         "//tensorflow/compiler/xla/pjrt:transpose",
         "//tensorflow/compiler/xla/service:custom_call_target_registry",
         "//tensorflow/core/platform:fingerprint",
@@ -490,10 +492,10 @@ cc_library(
         "//tensorflow/compiler/mlir/xla:hlo_to_mlir_hlo",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/pjrt:mlir_to_hlo",
         "@llvm-project//llvm:Support",
-        "@llvm-project//mlir:CAPIIR",
         "@llvm-project//mlir:IR",
-        "@llvm-project//mlir:MLIRBindingsPythonHeadersAndDeps",
+        "@llvm-project//mlir:Parser",
         "@llvm-project//mlir:StandardOps",
         "@pybind11",
     ],
diff --git a/tensorflow/compiler/xla/python/jax_jit.cc b/tensorflow/compiler/xla/python/jax_jit.cc
index afd02e9b61c439..b5fc1e9a868b86 100644
--- a/tensorflow/compiler/xla/python/jax_jit.cc
+++ b/tensorflow/compiler/xla/python/jax_jit.cc
@@ -1087,6 +1087,9 @@ PyObject* JaxCompiledFunction_tp_call(PyObject* self, PyObject* args,
   } catch (std::invalid_argument& e) {
     PyErr_SetString(PyExc_ValueError, e.what());
     return nullptr;
+  } catch (std::runtime_error& e) {
+    PyErr_SetString(PyExc_ValueError, e.what());
+    return nullptr;
   }
 }
 
diff --git a/tensorflow/compiler/xla/python/mlir.cc b/tensorflow/compiler/xla/python/mlir.cc
index e7d66c8ef395f8..50edab378b1662 100644
--- a/tensorflow/compiler/xla/python/mlir.cc
+++ b/tensorflow/compiler/xla/python/mlir.cc
@@ -13,26 +13,32 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
+
 #include "llvm/Support/raw_ostream.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
 #include "mlir/IR/BuiltinOps.h"  // from @llvm-project
 #include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/Parser.h"  // from @llvm-project
 #include "pybind11/pybind11.h"
+#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h"
 #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
 #include "tensorflow/compiler/mlir/xla/hlo_to_mlir_hlo.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/pjrt/mlir_to_hlo.h"
 #include "tensorflow/compiler/xla/python/types.h"
 #include "tensorflow/compiler/xla/status.h"
 
 namespace py = pybind11;
 
 namespace xla {
+namespace {
 
 // Converts an XlaComputation to an MHLO mlir::Module string. Exists for
 // backwards compatibility.
 // TODO(phawkins): port remaining users of XlaComputations to use mlir::Modules
 // instead and delete this function.
-StatusOr<std::string> XlaComputationToMlirModule(
+StatusOr<std::string> PyXlaComputationToMlirModule(
     const XlaComputation& computation) {
   mlir::MLIRContext context;
   mlir::OwningModuleRef module =
@@ -47,11 +53,43 @@ StatusOr<std::string> XlaComputationToMlirModule(
   return s;
 }
 
+StatusOr<XlaComputation> PyMlirModuleToXlaComputation(std::string mlir_module,
+                                                      bool use_tuple_args,
+                                                      bool return_tuple) {
+  mlir::MLIRContext context;
+  mlir::OwningModuleRef module;
+  context.loadDialect<mlir::StandardOpsDialect>();
+  context.loadDialect<mlir::mhlo::MhloDialect>();
+  context.loadDialect<mlir::chlo::HloClientDialect>();
+  mlir::StatusScopedDiagnosticHandler diagnostic_handler(&context);
+  module = mlir::parseSourceString(
+      llvm::StringRef(mlir_module.data(), mlir_module.size()), &context);
+  if (!module) {
+    return diagnostic_handler.ConsumeStatus();
+  }
+  if (failed(module->verify())) {
+    VLOG(1) << "MLIR verification failed.";
+    module->dump();
+    return diagnostic_handler.ConsumeStatus();
+  }
+
+  XlaComputation computation;
+  TF_RETURN_IF_ERROR(
+      MlirToXlaComputation(*module, computation, use_tuple_args, return_tuple));
+  return computation;
+}
+
+}  // namespace
+
 void BuildMlirSubmodule(py::module& m) {
   py::module mlir_module = m.def_submodule("mlir", "MLIR/XLA integration");
 
   mlir_module.def("xla_computation_to_mlir_module",
-                  &XlaComputationToMlirModule);
+                  &PyXlaComputationToMlirModule);
+  mlir_module.def("mlir_module_to_xla_computation",
+                  &PyMlirModuleToXlaComputation, py::arg("mlir_module"),
+                  py::arg("use_tuple_args") = false,
+                  py::arg("return_tuple") = false);
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/python/py_client.cc b/tensorflow/compiler/xla/python/py_client.cc
index 29c797c629e551..7ca89c3d6567f2 100644
--- a/tensorflow/compiler/xla/python/py_client.cc
+++ b/tensorflow/compiler/xla/python/py_client.cc
@@ -21,12 +21,9 @@ limitations under the License.
 
 #include "absl/base/casts.h"
 #include "absl/container/flat_hash_map.h"
-#include "mlir/IR/BuiltinOps.h"  // from @llvm-project
-#include "mlir/Parser.h"  // from @llvm-project
-#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/chlo_ops.h"
-#include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h"
-#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h"
+#include "tensorflow/compiler/xla/pjrt/mlir_to_hlo.h"
 #include "tensorflow/compiler/xla/pjrt/pjrt_client.h"
+#include "tensorflow/compiler/xla/pjrt/pjrt_stream_executor_client.h"
 #include "tensorflow/compiler/xla/pjrt/transpose.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/python/py_buffer.h"
@@ -121,7 +118,57 @@ std::vector<std::shared_ptr<PyExecutable>> PyClient::LiveExecutables() {
 
 Status PyClient::Defragment() {
   CHECK(PyGILState_Check());
-  return pjrt_client_->Defragment();
+  switch (pjrt_client_->runtime_type()) {
+    case PjRtRuntimeType::kTfrt:
+      return pjrt_client_->Defragment();
+    case PjRtRuntimeType::kStreamExecutor:
+      struct TmpBuffer {
+        PyBuffer* py_buffer;
+        // TODO(skyewm): maybe use py_buffer's HostValue
+        std::shared_ptr<Literal> host_copy;
+      };
+
+      // Synchronously copy all buffers to host
+      std::vector<TmpBuffer> tmp_buffers;
+      for (PyBuffer* device_buffers : buffers_) {
+        for (PyBuffer* buffer = device_buffers; buffer;
+             buffer = buffer->next_) {
+          if (!buffer->is_deleted()) {
+            TF_ASSIGN_OR_RETURN(std::shared_ptr<Literal> literal,
+                                buffer->buffer_->ToLiteral());
+            tmp_buffers.push_back({buffer, literal});
+          }
+        }
+      }
+
+      // All buffers successfully copied to host, delete on-device copies.
+      //
+      // Use blocking delete operation to ensure all memory is actually cleared
+      // before we start rewriting buffers.
+      //
+      // Die instead of returning a bad status because program presumably can't
+      // continue if we fail to reconstitute device buffers.
+      for (TmpBuffer& tmp_buffer : tmp_buffers) {
+        TF_CHECK_OK(tensorflow::down_cast<PjRtStreamExecutorBuffer*>(
+                        tmp_buffer.py_buffer->buffer_.get())
+                        ->Release(/*wait_for_operations_to_complete=*/true)
+                        .status());
+      }
+
+      // Copy host copies back to device and update PyBuffers in-place.
+      for (TmpBuffer& tmp_buffer : tmp_buffers) {
+        std::unique_ptr<PjRtBuffer> new_copy =
+            pjrt_client_
+                ->BufferFromHostLiteral(*tmp_buffer.host_copy,
+                                        tmp_buffer.py_buffer->buffer_->device())
+                .ValueOrDie();
+        TF_CHECK_OK(new_copy->BlockHostUntilReady());
+        tmp_buffer.py_buffer->buffer_.reset(new_copy.release());
+      }
+
+      // TODO(skyewm): delete executables?
+  }
+  return Status::OK();
 }
 
 StatusOr<std::vector<std::vector<ClientAndPtr<PjRtDevice>>>>
@@ -217,22 +264,8 @@ StatusOr<std::shared_ptr<PyExecutable>> PyClient::CompileMlir(
   {
     py::gil_scoped_release gil_release;
     mlir::MLIRContext context;
-    mlir::OwningModuleRef module;
-    context.loadDialect<mlir::StandardOpsDialect>();
-    context.loadDialect<mlir::mhlo::MhloDialect>();
-    context.loadDialect<mlir::chlo::HloClientDialect>();
-    mlir::StatusScopedDiagnosticHandler diagnostic_handler(&context);
-    module = mlir::parseSourceString(
-        llvm::StringRef(mlir_module.data(), mlir_module.size()), &context);
-    if (!module) {
-      return diagnostic_handler.ConsumeStatus();
-    }
-    if (failed(module->verify())) {
-      VLOG(1) << "MLIR verification failed.";
-      module->dump();
-      return diagnostic_handler.ConsumeStatus();
-    }
-
+    TF_ASSIGN_OR_RETURN(mlir::OwningModuleRef module,
+                        ParseMlirModuleString(mlir_module, context));
     TF_ASSIGN_OR_RETURN(
         executable, pjrt_client_->Compile(module.get(), std::move(options)));
     TF_ASSIGN_OR_RETURN(fingerprint,
diff --git a/tensorflow/compiler/xla/python/pytree.cc b/tensorflow/compiler/xla/python/pytree.cc
index dd7ac3998ed3dc..5a46b26e2101cc 100644
--- a/tensorflow/compiler/xla/python/pytree.cc
+++ b/tensorflow/compiler/xla/python/pytree.cc
@@ -159,7 +159,7 @@ void PyTreeDef::FlattenIntoImpl(
         py::list keys =
             py::reinterpret_steal<py::list>(PyDict_Keys(dict.ptr()));
         if (PyList_Sort(keys.ptr())) {
-          throw std::runtime_error("Dictionary key sort failed.");
+          throw py::error_already_set();
         }
         for (py::handle key : keys) {
           recurse(dict[key]);
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 3a4ead9bb57c22..141305cfbb42a0 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -44,7 +44,7 @@
 
 # Just an internal arbitrary increasing number to help with backward-compatible
 # changes.
-_version = 44
+_version = 46
 
 xla_platform_names = {
     'cpu': 'Host',
@@ -378,7 +378,7 @@ def window_padding_type_to_pad_values(padding_type, lhs_dims, rhs_dims,
 Buffer = _xla.Buffer
 DeviceArrayBase = _xla.DeviceArrayBase
 Executable = _xla.Executable
-OpSharding = _xla.OpSharding  # type: ignore
+OpSharding = _xla.OpSharding
 
 
 def register_custom_call_target(name, fn, platform='cpu'):
diff --git a/tensorflow/compiler/xla/python/xla_compiler.cc b/tensorflow/compiler/xla/python/xla_compiler.cc
index d3d3d4ab538d5a..d2ee69a0523843 100644
--- a/tensorflow/compiler/xla/python/xla_compiler.cc
+++ b/tensorflow/compiler/xla/python/xla_compiler.cc
@@ -703,7 +703,10 @@ void BuildXlaCompilerSubmodule(py::module& m) {
       .def_property("replicate_on_last_tile_dim",
                     &xla::OpSharding::replicate_on_last_tile_dim,
                     &xla::OpSharding::set_replicate_on_last_tile_dim)
-      .def("__repr__", &xla::OpSharding::DebugString);
+      .def("__repr__", &xla::OpSharding::DebugString)
+      .def("SerializeToString", [](const OpSharding& sharding) {
+        return py::bytes(sharding.SerializeAsString());
+      });
   DefRepeatedProperty(op_sharding, "tile_assignment_dimensions",
                       &xla::OpSharding::mutable_tile_assignment_dimensions);
   DefRepeatedProperty(op_sharding, "tile_assignment_devices",
diff --git a/tensorflow/compiler/xla/python/xla_extension/__init__.pyi b/tensorflow/compiler/xla/python/xla_extension/__init__.pyi
index 2d51dde006d233..5ddad2c1ad9356 100644
--- a/tensorflow/compiler/xla/python/xla_extension/__init__.pyi
+++ b/tensorflow/compiler/xla/python/xla_extension/__init__.pyi
@@ -241,7 +241,11 @@ class OpSharding_Type(enum.IntEnum):
 class OpSharding:
   Type: typing.Type[OpSharding_Type]
   type: OpSharding_Type
-  replicate_on_last_dim: bool
+  replicate_on_last_tile_dim: bool
+  tile_assignment_dimensions: Sequence[int]
+  tile_assignment_devices: Sequence[int]
+  tuple_shardings: Sequence[OpSharding]
+  def SerializeToString(self) -> bytes: ...
 
 class ChannelHandle_ChannelType(enum.IntEnum):
   CHANNEL_TYPE_INVALID: int
diff --git a/tensorflow/compiler/xla/python/xla_extension/mlir.pyi b/tensorflow/compiler/xla/python/xla_extension/mlir.pyi
index 788104c44ed75f..b987aa597068d9 100644
--- a/tensorflow/compiler/xla/python/xla_extension/mlir.pyi
+++ b/tensorflow/compiler/xla/python/xla_extension/mlir.pyi
@@ -16,3 +16,5 @@
 from . import XlaComputation
 
 def xla_computation_to_mlir_module(computation: XlaComputation) -> str: ...
+def mlir_module_to_xla_computation(mlir_module: str, use_tuple_args: bool = ...,
+                                   return_tuple: bool = ...) -> str: ...
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 6b922f9023ed2d..df57a7f71e659e 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -311,6 +311,8 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core/platform:regexp",
         "@com_google_absl//absl/strings",
+        "@llvm-project//mlir:IR",
+        "@llvm-project//mlir:Transforms",
     ],
 )
 
@@ -4140,7 +4142,10 @@ cc_library(
 cc_library(
     name = "copy_insertion",
     srcs = ["copy_insertion.cc"],
-    hdrs = ["copy_insertion.h"],
+    hdrs = [
+        "compile_time_cap.h",
+        "copy_insertion.h",
+    ],
     deps = [
         ":dump",
         ":hlo",
@@ -4155,7 +4160,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service/graphcycles",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -5437,7 +5441,10 @@ tf_cc_test(
 cc_library(
     name = "while_loop_invariant_code_motion",
     srcs = ["while_loop_invariant_code_motion.cc"],
-    hdrs = ["while_loop_invariant_code_motion.h"],
+    hdrs = [
+        "compile_time_cap.h",
+        "while_loop_invariant_code_motion.h",
+    ],
     deps = [
         ":hlo",
         ":hlo_dce",
@@ -5452,6 +5459,7 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index b8be038c0a529c..cc3733382add18 100755
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -5883,6 +5883,10 @@ StatusOr<bool> AlgebraicSimplifierVisitor::SimplifyConvToDot(
     return false;
   }
 
+  if (convolution->feature_group_count() != 1 ||
+      convolution->batch_group_count() != 1) {
+    return false;
+  }
   auto add_bitcast = [&](Shape shape, HloInstruction* operand) {
     std::vector<int64_t> dims(operand->shape().dimensions_size());
     std::iota(dims.begin(), dims.end(), 0);
diff --git a/tensorflow/compiler/xla/service/compile_time_cap.h b/tensorflow/compiler/xla/service/compile_time_cap.h
new file mode 100644
index 00000000000000..9a1411b4c25dbc
--- /dev/null
+++ b/tensorflow/compiler/xla/service/compile_time_cap.h
@@ -0,0 +1,63 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_TIME_CAP_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_TIME_CAP_H_
+#include <algorithm>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+// Provide a common way to bound compiler analyses that potentially have
+// overhead that is non-linear to the number of instructions in a module.
+class BoundNonLinearCompilerAnalysis {
+ public:
+  // Sampling_rate specifies the proportion of all instructions expected to be
+  // analyzed. e.g., if sampling_rate_=2, then every other instructions are
+  // expected to be analyzed. If sample_rate <= 0, the analysis will be always
+  // allowed to complete. Each analysis is allowed at least a constant number of
+  // abstract cost units, before it is considered for early termination.
+  explicit BoundNonLinearCompilerAnalysis(HloModule* m,
+                                          absl::string_view pass_name,
+                                          absl::optional<int64_t> sampling_rate)
+      : analysis_allowance_(
+            (!sampling_rate.has_value() || sampling_rate.value() <= 0 ||
+             m->config().GetAnalysisAllowance(pass_name) < 0)
+                ? -1
+                : std::max(m->config().GetAnalysisAllowance(pass_name),
+                           m->instruction_count() / sampling_rate.value())) {}
+  // Return whether the cost is deducted successfully. If not, the analysis
+  // should be terminated as its overhead is too high.
+  bool DeductCost(int64_t cost_now) {
+    if (analysis_allowance_ > 0 && cost_now > 0) {
+      analysis_allowance_ -= cost_now;
+      if (analysis_allowance_ < 0) {
+        analysis_allowance_ = 0;
+      }
+    }
+    return analysis_allowance_ != 0;
+  }
+
+  bool ContinueAnalysis() const { return analysis_allowance_ != 0; }
+  int64_t analysis_allowance() const { return analysis_allowance_; }
+
+ private:
+  int64_t analysis_allowance_;
+};
+
+};  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILE_TIME_CAP_H_
diff --git a/tensorflow/compiler/xla/service/convolution_group_converter.cc b/tensorflow/compiler/xla/service/convolution_group_converter.cc
index 5fd9e8c745d617..1541760b87b884 100644
--- a/tensorflow/compiler/xla/service/convolution_group_converter.cc
+++ b/tensorflow/compiler/xla/service/convolution_group_converter.cc
@@ -56,6 +56,7 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault {
 
   // Runs the visitor on a computation.
   static bool Run(HloComputation* computation,
+                  std::function<bool(HloInstruction*)> should_expand,
                   std::function<bool(HloInstruction*)> is_cost_viable,
                   bool convert_batch_groups_only, bool filter_expansion);
 
@@ -67,11 +68,13 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault {
  private:
   explicit ConvolutionVisitor(
       HloComputation* computation,
+      std::function<bool(HloInstruction*)> should_expand,
       std::function<bool(HloInstruction*)> is_cost_viable,
       bool convert_batch_groups_only, bool filter_expansion)
       : computation_(computation),
         filter_expansion_(filter_expansion),
         convert_batch_groups_only_(convert_batch_groups_only),
+        should_expand_(should_expand),
         is_cost_viable_(is_cost_viable) {}
 
   // Current HloComputation instance the ConvolutionVisitor is traversing.
@@ -86,15 +89,16 @@ class ConvolutionVisitor : public DfsHloVisitorWithDefault {
   // Decides whether to convert batch groups or feature groups.
   bool convert_batch_groups_only_;
 
-  // std::function<std::vector<LloValue*>(int64, int64)> chunk_fetcher
+  std::function<bool(HloInstruction*)> should_expand_;
   std::function<bool(HloInstruction*)> is_cost_viable_;
 };
 
 bool ConvolutionVisitor::Run(
     HloComputation* computation,
+    std::function<bool(HloInstruction*)> should_expand,
     std::function<bool(HloInstruction*)> is_cost_viable,
     bool convert_batch_groups_only, bool filter_expansion) {
-  ConvolutionVisitor visitor(computation, is_cost_viable,
+  ConvolutionVisitor visitor(computation, should_expand, is_cost_viable,
                              convert_batch_groups_only, filter_expansion);
   TF_CHECK_OK(computation->Accept(&visitor));
   return visitor.changed_;
@@ -204,7 +208,8 @@ Status ConvolutionVisitor::HandleBatchGroupCount(HloInstruction* convolution) {
   auto filter = convolution->mutable_operand(1);
   int64_t batch_group_count = convolution->batch_group_count();
 
-  if (batch_group_count == 1) {
+  if (batch_group_count == 1 ||
+      (should_expand_ && !should_expand_(convolution))) {
     return Status::OK();
   }
 
@@ -424,7 +429,7 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
   };
 
   int64_t group_count = convolution->feature_group_count();
-  if (group_count == 1) {
+  if (group_count == 1 || (should_expand_ && !should_expand_(convolution))) {
     return Status::OK();
   }
 
@@ -675,7 +680,7 @@ StatusOr<bool> ConvolutionGroupConverter::Run(HloModule* module) {
       2, "ConvolutionGroupConverter::Run(), before:\n" + module->ToString());
   bool changed = false;
   for (auto* comp : module->MakeNonfusionComputations()) {
-    if (ConvolutionVisitor::Run(comp, is_cost_viable_,
+    if (ConvolutionVisitor::Run(comp, should_expand_, is_cost_viable_,
                                 convert_batch_groups_only_,
                                 filter_expansion_)) {
       changed = true;
diff --git a/tensorflow/compiler/xla/service/convolution_group_converter.h b/tensorflow/compiler/xla/service/convolution_group_converter.h
index a8a91ed1018518..9459f8bea844e3 100644
--- a/tensorflow/compiler/xla/service/convolution_group_converter.h
+++ b/tensorflow/compiler/xla/service/convolution_group_converter.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_GROUP_CONVERTER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_CONVOLUTION_GROUP_CONVERTER_H_
 
+#include <functional>
+
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
@@ -27,10 +29,12 @@ namespace xla {
 // convolutions with feature_group_count = 1.
 class ConvolutionGroupConverter : public HloModulePass {
  public:
-  ConvolutionGroupConverter(std::function<bool(HloInstruction*)> is_cost_viable,
+  ConvolutionGroupConverter(std::function<bool(HloInstruction*)> should_expand,
+                            std::function<bool(HloInstruction*)> is_cost_viable,
                             bool convert_batch_groups_only,
                             bool filter_expansion = true)
-      : is_cost_viable_(is_cost_viable),
+      : should_expand_(should_expand),
+        is_cost_viable_(is_cost_viable),
         convert_batch_groups_only_(convert_batch_groups_only),
         filter_expansion_(filter_expansion) {}
 
@@ -42,6 +46,10 @@ class ConvolutionGroupConverter : public HloModulePass {
   // computation was changed.
   StatusOr<bool> Run(HloModule* module) override;
 
+  // Predicate that determines whether this pass should rewrite a given
+  // convolution.
+  std::function<bool(HloInstruction*)> should_expand_;
+
   // Lambda containing cost model that decides whether to expand
   // batch_group_count.
   std::function<bool(HloInstruction*)> is_cost_viable_;
diff --git a/tensorflow/compiler/xla/service/convolution_group_converter_test.cc b/tensorflow/compiler/xla/service/convolution_group_converter_test.cc
index 143e071dc3c154..78d396d3a4656e 100644
--- a/tensorflow/compiler/xla/service/convolution_group_converter_test.cc
+++ b/tensorflow/compiler/xla/service/convolution_group_converter_test.cc
@@ -48,9 +48,10 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,2], filter: f32[1,1,2]) -> f32[1,2
   auto computation = module->entry_computation();
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  auto should_expand = [](HloInstruction* conv) { return true; };
   auto cost_model = [](HloInstruction* conv) { return true; };
-  ConvolutionGroupConverter converter(cost_model, /*convert_batch_groups_only=*/
-                                      false);
+  ConvolutionGroupConverter converter(should_expand, cost_model,
+                                      /*convert_batch_groups_only=*/false);
   ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
   // Make sure the convolution is converted to one with feature_group_count = 1.
@@ -80,8 +81,10 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,4], filter: f32[1,2,2]) -> f32[1,2
   auto computation = module->entry_computation();
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  auto should_expand = [](HloInstruction* conv) { return true; };
   auto cost_model = [](HloInstruction* conv) { return true; };
-  ConvolutionGroupConverter converter(cost_model, /*convert_batch_groups_only=*/
+  ConvolutionGroupConverter converter(should_expand,
+                                      cost_model, /*convert_batch_groups_only=*/
                                       false);
   ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
@@ -107,8 +110,10 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[16,19,19,512]{3,2,1,0}, filter: f32[16
   auto computation = module->entry_computation();
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  auto should_expand = [](HloInstruction* conv) { return true; };
   auto cost_model = [](HloInstruction* conv) { return false; };
-  ConvolutionGroupConverter converter(cost_model, /*convert_batch_groups_only=*/
+  ConvolutionGroupConverter converter(should_expand,
+                                      cost_model, /*convert_batch_groups_only=*/
                                       true);
   ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
@@ -134,8 +139,10 @@ TEST_F(ConvolutionGroupConverterTest,
   auto computation = module->entry_computation();
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
+  auto should_expand = [](HloInstruction* conv) { return true; };
   auto cost_model = [](HloInstruction* conv) { return false; };
-  ConvolutionGroupConverter converter(cost_model, /*convert_batch_groups_only=*/
+  ConvolutionGroupConverter converter(should_expand,
+                                      cost_model, /*convert_batch_groups_only=*/
                                       true);
   // Make sure that batch group count is rewritten even if
   // batch_group_count == output_feature but not input_batch
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 909335e2373d56..55c40d0a1a0f40 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 
+#include <algorithm>
 #include <optional>
 #include <sstream>
 
@@ -24,6 +25,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/any.h"
+#include "tensorflow/compiler/xla/service/compile_time_cap.h"
 #include "tensorflow/compiler/xla/service/dump.h"
 #include "tensorflow/compiler/xla/service/hlo_alias_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -1983,11 +1985,9 @@ Status CopyInsertion::RemoveUnnecessaryCopies(HloOrdering* ordering,
   int64_t num_existing_copies = GetNumExistingCopies(module);
   bool changed = true;
   int64_t num_iterations = -1;
-  constexpr int64_t region_analysis_allowance_cap = 30000;
   VLOG(6) << "Copy Insertion analyzing module with instructino count = "
           << module->instruction_count() << "\n";
-  int64_t region_analysis_allowance =
-      std::max(region_analysis_allowance_cap, module->instruction_count() / 10);
+  BoundNonLinearCompilerAnalysis allowance(module, name(), 10);
   while (changed) {
     CHECK_LE(++num_iterations, num_existing_copies);
     changed = false;
@@ -2000,8 +2000,11 @@ Status CopyInsertion::RemoveUnnecessaryCopies(HloOrdering* ordering,
         // The region_analysis_cost_now is always set to
         // use_region_based_live_range_analysis_ if it is < 0, in which case the
         // analysis is always performed.
-        int64_t region_analysis_cost_now = std::min(
-            region_analysis_allowance, use_region_based_live_range_analysis_);
+        int64_t region_analysis_cost_now =
+            (use_region_based_live_range_analysis_ == 0)
+                ? 0
+                : std::min(allowance.analysis_allowance(),
+                           use_region_based_live_range_analysis_);
         if (instruction->opcode() == HloOpcode::kCopy) {
           if (copy_remover.TryElideCopy(instruction,
                                         &region_analysis_cost_now)) {
@@ -2011,14 +2014,12 @@ Status CopyInsertion::RemoveUnnecessaryCopies(HloOrdering* ordering,
                 instruction->mutable_operand(0)));
             VLOG(6) << "succeeded in eliminating copy.\n";
           }
-          if (region_analysis_allowance > 0 && region_analysis_cost_now > 0) {
+          if (allowance.ContinueAnalysis() && region_analysis_cost_now > 0) {
             VLOG(6) << "Copy Insertion analyzing module cost: "
                     << region_analysis_cost_now << "\n";
             VLOG(6) << "instruction:" << instruction->ToString() << "\n";
-            region_analysis_allowance -= region_analysis_cost_now;
-            if (region_analysis_allowance < 0) {
-              region_analysis_allowance = 0;
-            }
+            allowance.DeductCost(region_analysis_cost_now);
+            VLOG(6) << "allowance:" << allowance.analysis_allowance() << "\n";
           }
         }
       }
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 48e372ce6413cb..e6e817728fa0ad 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -66,16 +66,30 @@ class FilteredPassManager : public llvm::legacy::PassManager {
   explicit FilteredPassManager(bool disable_expensive_passes)
       : disable_expensive_passes_(disable_expensive_passes) {}
   void add(llvm::Pass* p) override {
-    bool pass_disabled =
-        disable_expensive_passes_ && p->getPassName().contains("Unroll loops");
-    if (!pass_disabled) {
-      llvm::legacy::PassManager::add(p);
-    } else {
+    // Disable all the loop unroll passes in the pipeline if
+    // `disable_expensive_passes_` is true (TODO: Maybe we should use
+    // `builder.DisableUnrollLoops` for this legacy feature?). Disable only the
+    // early loop full unroll pass, otherwise. The early loop full unroll pass
+    // applies excesive unrolling in statically bounded low trip-count loops,
+    // which are very common in XLA. It also creates a strong dependency on the
+    // SLP vectorizer to produce all the vector code, since the loops are fully
+    // unrolled. By disabling it, the Loop Vectorizer would have an opportunity
+    // to vectorize the code. A later loop unroll pass will still unroll the
+    // loops before SLP for those cases missed by the Loop Vectorizer.
+    constexpr unsigned loop_full_unroll_pos = 0;
+    if (p->getPassName().contains("Unroll loops") &&
+        (disable_expensive_passes_ ||
+         num_unroll_passes_ == loop_full_unroll_pos)) {
+      ++num_unroll_passes_;
       delete p;
+      return;
     }
+
+    llvm::legacy::PassManager::add(p);
   }
 
  private:
+  unsigned num_unroll_passes_ = 0;
   bool disable_expensive_passes_;
 };
 }  // anonymous namespace
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 5e9c150c3e3b2f..19c9f243c65d3e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -399,10 +399,19 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
     return false;
   };
   pipeline.AddPass<ConvolutionGroupConverter>(
-      cost_model,
+      /*should_expand=*/[](HloInstruction* conv) { return true; }, cost_model,
       /*convert_batch_groups_only=*/true);
+  auto feature_group_should_expand = [](HloInstruction* conv) {
+    switch (conv->shape().element_type()) {
+      case F16:
+      case F32:
+        return false;
+      default:
+        return true;
+    }
+  };
   pipeline.AddPass<ConvolutionGroupConverter>(
-      cost_model,
+      feature_group_should_expand, cost_model,
       /*convert_batch_groups_only=*/false);
   pipeline.AddPass<BatchNormExpander>(
       /*rewrite_training_op=*/true,
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index 6918392561d965..e0181e9ddd4617 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -654,7 +654,7 @@ void DotOpEmitter::EmitNaiveLlvmIrGemm() {
   llvm::Value* lhs_element = lhs_array_.EmitReadArrayElement(lhs_index, b_);
   llvm::Value* rhs_element = rhs_array_.EmitReadArrayElement(rhs_index, b_);
 
-  llvm::Value* accum = b_->CreateLoad(accum_address);
+  llvm::Value* accum = b_->CreateLoad(accum_type, accum_address);
   llvm::Value* updated_accum;
   if (ShapeUtil::ElementIsComplex(lhs_shape)) {
     auto real = [&](llvm::Value* x) { return b_->CreateExtractValue(x, {0}); };
@@ -686,7 +686,7 @@ void DotOpEmitter::EmitNaiveLlvmIrGemm() {
   // - Store into output array.
   SetToFirstInsertPoint(reduction_loop->GetExitBasicBlock(), b_);
 
-  llvm::Value* result = b_->CreateLoad(accum_address);
+  llvm::Value* result = b_->CreateLoad(accum_type, accum_address);
 
   // Create index into target address. The target index is the concatenation of
   // the rhs and lhs indexes with the reduction dimensions removed. The terms
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
index 8907146f6f9e46..21331ffb10f31f 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc
@@ -84,7 +84,7 @@ bool PotentiallyImplementedAsEigenConvolution(
       convolution.convolution_dimension_numbers();
   // Only 1D through 3D convolutions are supported at the moment.
   const int64_t num_spatial_dims = dnums.output_spatial_dimensions_size();
-  if (num_spatial_dims > 3) {
+  if (num_spatial_dims < 1 || num_spatial_dims > 3) {
     return false;
   }
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 3256f478aa02c5..1a5467210f816e 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -967,12 +967,13 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
       bool multi_threaded =
           hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen();
       bool use_mkl_dnn =
-          hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn();
+          hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn() &&
+          convolution->feature_group_count() == 1;
 
       auto valid_num_dims = [](absl::Span<const int64_t> xs) {
         return xs.size() >= 2 && xs.size() <= 3;
       };
-      TF_RET_CHECK(valid_num_dims(input_dims));
+      TF_RET_CHECK(valid_num_dims(input_dims)) << input_dims.size();
       TF_RET_CHECK(valid_num_dims(kernel_dims));
       TF_RET_CHECK(valid_num_dims(output_dims));
       TF_RET_CHECK(valid_num_dims(strides));
@@ -1041,13 +1042,13 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) {
       for (int64_t d : window_dilation) {
         args.push_back(b_.getInt64(d));
       }
+      args.push_back(b_.getInt64(convolution->feature_group_count()));
       EmitCallToFunc(fn_name, args, b_.getVoidTy(), /*does_not_throw=*/true,
                      /*only_accesses_arg_memory=*/true);
 
       return Status::OK();
     }
   }
-
   // This is a completely un-optimized version of convolution just to
   // have an early version that works. E.g. the input index and
   // padding calculation is not hoisted out of the inner loop.
@@ -2208,7 +2209,7 @@ Status IrEmitter::HandleSliceToDynamic(HloInstruction* hlo) {
   for (int64_t i = 1; i < hlo->operand_count(); ++i) {
     const int64_t dim_index = i - 1;
     llvm::Value* source_buffer = GetEmittedValueFor(hlo->operand(i));
-    llvm::LoadInst* dyn_dim_size = b_.CreateLoad(source_buffer, "dyn_dim_size");
+    llvm::LoadInst* dyn_dim_size = Load(source_buffer, "dyn_dim_size");
 
     llvm::Value* metadata = b_.CreateConstInBoundsGEP1_32(
         b_.getInt8Ty(), raw_buffer, raw_data_size + dim_index * sizeof(int32));
@@ -2275,6 +2276,7 @@ Status IrEmitter::HandlePadToStatic(HloInstruction* hlo) {
     llvm::Value* metadata = b_.CreateConstInBoundsGEP1_32(
         b_.getInt8Ty(), raw_buffer, raw_data_size + dim_index * sizeof(int32));
     llvm::Value* dyn_dim_size = b_.CreateLoad(
+        b_.getInt32Ty(),
         b_.CreateBitCast(metadata, b_.getInt32Ty()->getPointerTo()),
         "dyn_dim_size");
     b_.CreateStore(dyn_dim_size,
@@ -2925,7 +2927,8 @@ void IrEmitter::ProfilingState::UpdateProfileCounter(llvm::IRBuilder<>* b,
                                                      llvm::Value* cycle_start) {
   auto* cycle_diff = b->CreateSub(cycle_end, cycle_start);
   llvm::LoadInst* old_cycle_count =
-      b->CreateLoad(prof_counter, "old_cycle_count");
+      b->CreateLoad(prof_counter->getType()->getPointerElementType(),
+                    prof_counter, "old_cycle_count");
   auto* new_cycle_count =
       b->CreateAdd(cycle_diff, old_cycle_count, "new_cycle_count");
   b->CreateStore(new_cycle_count, prof_counter);
diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.cc b/tensorflow/compiler/xla/service/cpu/ir_function.cc
index 384948e0c6ee39..cb083d59e24841 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_function.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_function.cc
@@ -192,9 +192,11 @@ void IrFunction::Initialize(const string& function_name,
 llvm::Value* IrFunction::GetDynamicLoopBound(const int64_t offset) {
   CHECK_GT(num_dynamic_loop_bounds_, 0);
   CHECK_LT(offset, num_dynamic_loop_bounds_ * 2);
-  string name = absl::StrCat("dynamic_loop_bound_", offset);
-  return b_->CreateLoad(b_->CreateGEP(CHECK_NOTNULL(dynamic_loop_bounds_arg_),
-                                      b_->getInt64(offset), name));
+  llvm::Type* type =
+      dynamic_loop_bounds_arg_->getType()->getPointerElementType();
+  auto gep = b_->CreateGEP(type, CHECK_NOTNULL(dynamic_loop_bounds_arg_),
+                           b_->getInt64(offset));
+  return b_->CreateLoad(type, gep, "dynamic_loop_bound_" + llvm::Twine(offset));
 }
 
 llvm::Value* EncodeArrayFunctionArguments(
@@ -214,7 +216,7 @@ llvm::Value* EncodeArrayFunctionArguments(
           arguments[i], b->getInt8PtrTy(),
           absl::StrCat(name, "_parameter_", i, "_address_as_i8ptr"));
       llvm::Value* slot_in_param_addresses =
-          b->CreateInBoundsGEP(arguments_buffer, {b->getInt64(i)});
+          b->CreateInBoundsGEP(int8ptr_ty, arguments_buffer, b->getInt64(i));
       b->CreateStore(parameter_as_i8ptr, slot_in_param_addresses);
     }
   }
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv2d.cc b/tensorflow/compiler/xla/service/cpu/runtime_conv2d.cc
index 87ccb6480a3ba0..1f72bdcf88856c 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_conv2d.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_conv2d.cc
@@ -31,7 +31,8 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF32(
     int64_t output_cols, int64_t row_stride, int64_t col_stride,
     int64_t padding_top, int64_t padding_bottom, int64_t padding_left,
     int64_t padding_right, int64_t lhs_row_dilation, int64_t lhs_col_dilation,
-    int64_t rhs_row_dilation, int64_t rhs_col_dilation) {
+    int64_t rhs_row_dilation, int64_t rhs_col_dilation,
+    int64_t feature_group_count) {
   const xla::ExecutableRunOptions* run_options =
       static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
   XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr);
@@ -40,7 +41,8 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF32(
       input_rows, input_cols, input_channels, kernel_rows, kernel_cols,
       kernel_channels, kernel_filters, output_rows, output_cols, row_stride,
       col_stride, padding_top, padding_bottom, padding_left, padding_right,
-      lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation);
+      lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
+      feature_group_count);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF16(
@@ -52,7 +54,7 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF16(
     int64_t col_stride, int64_t padding_top, int64_t padding_bottom,
     int64_t padding_left, int64_t padding_right, int64_t lhs_row_dilation,
     int64_t lhs_col_dilation, int64_t rhs_row_dilation,
-    int64_t rhs_col_dilation) {
+    int64_t rhs_col_dilation, int64_t feature_group_count) {
   const xla::ExecutableRunOptions* run_options =
       static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
   XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr);
@@ -61,5 +63,6 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF16(
       input_rows, input_cols, input_channels, kernel_rows, kernel_cols,
       kernel_channels, kernel_filters, output_rows, output_cols, row_stride,
       col_stride, padding_top, padding_bottom, padding_left, padding_right,
-      lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation);
+      lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
+      feature_group_count);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv2d.h b/tensorflow/compiler/xla/service/cpu/runtime_conv2d.h
index e32e945cfdeffa..dbca5292cfdc47 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_conv2d.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_conv2d.h
@@ -30,7 +30,7 @@ extern void __xla_cpu_runtime_EigenConv2DF32(
     int64_t col_stride, int64_t padding_top, int64_t padding_bottom,
     int64_t padding_left, int64_t padding_right, int64_t lhs_row_dilation,
     int64_t lhs_col_dilation, int64_t rhs_row_dilation,
-    int64_t rhs_col_dilation);
+    int64_t rhs_col_dilation, int64_t feature_group_count);
 
 extern void __xla_cpu_runtime_EigenConv2DF16(
     const void* /* xla::ExecutableRunOptions* */ run_options_ptr,
@@ -41,7 +41,8 @@ extern void __xla_cpu_runtime_EigenConv2DF16(
     int64_t row_stride, int64_t col_stride, int64_t padding_top,
     int64_t padding_bottom, int64_t padding_left, int64_t padding_right,
     int64_t lhs_row_dilation, int64_t lhs_col_dilation,
-    int64_t rhs_row_dilation, int64_t rhs_col_dilation);
+    int64_t rhs_row_dilation, int64_t rhs_col_dilation,
+    int64_t feature_group_count);
 
 }  // extern "C"
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv3d.cc b/tensorflow/compiler/xla/service/cpu/runtime_conv3d.cc
index 1f394d090ff080..9b9cba40f49328 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_conv3d.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_conv3d.cc
@@ -33,7 +33,8 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF32(
     int64_t padding_x_after, int64_t padding_y_before, int64_t padding_y_after,
     int64_t padding_z_before, int64_t padding_z_after, int64_t lhs_x_dilation,
     int64_t lhs_y_dilation, int64_t lhs_z_dilation, int64_t rhs_x_dilation,
-    int64_t rhs_y_dilation, int64_t rhs_z_dilation) {
+    int64_t rhs_y_dilation, int64_t rhs_z_dilation,
+    int64_t feature_group_count) {
   const xla::ExecutableRunOptions* run_options =
       static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
   XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr);
@@ -44,7 +45,7 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF32(
       y_stride, z_stride, padding_x_before, padding_x_after, padding_y_before,
       padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
       lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
-      rhs_z_dilation);
+      rhs_z_dilation, feature_group_count);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF16(
@@ -57,7 +58,8 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF16(
     int64_t padding_x_after, int64_t padding_y_before, int64_t padding_y_after,
     int64_t padding_z_before, int64_t padding_z_after, int64_t lhs_x_dilation,
     int64_t lhs_y_dilation, int64_t lhs_z_dilation, int64_t rhs_x_dilation,
-    int64_t rhs_y_dilation, int64_t rhs_z_dilation) {
+    int64_t rhs_y_dilation, int64_t rhs_z_dilation,
+    int64_t feature_group_count) {
   const xla::ExecutableRunOptions* run_options =
       static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
   XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr);
@@ -68,5 +70,5 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF16(
       y_stride, z_stride, padding_x_before, padding_x_after, padding_y_before,
       padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
       lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
-      rhs_z_dilation);
+      rhs_z_dilation, feature_group_count);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv3d.h b/tensorflow/compiler/xla/service/cpu/runtime_conv3d.h
index d90d3357a22d34..dfedaf419d0197 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_conv3d.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_conv3d.h
@@ -32,7 +32,7 @@ extern void __xla_cpu_runtime_EigenConv3DF16(
     int64_t padding_y_before, int64_t padding_y_after, int64_t padding_z_before,
     int64_t padding_z_after, int64_t lhs_x_dilation, int64_t lhs_y_dilation,
     int64_t lhs_z_dilation, int64_t rhs_x_dilation, int64_t rhs_y_dilation,
-    int64_t rhs_z_dilation);
+    int64_t rhs_z_dilation, int64_t feature_group_count);
 
 extern void __xla_cpu_runtime_EigenConv3DF32(
     const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
@@ -44,7 +44,8 @@ extern void __xla_cpu_runtime_EigenConv3DF32(
     int64_t padding_x_before, int64_t padding_x_after, int64_t padding_y_before,
     int64_t padding_y_after, int64_t padding_z_before, int64_t padding_z_after,
     int64_t lhs_x_dilation, int64_t lhs_y_dilation, int64_t lhs_z_dilation,
-    int64_t rhs_x_dilation, int64_t rhs_y_dilation, int64_t rhs_z_dilation);
+    int64_t rhs_x_dilation, int64_t rhs_y_dilation, int64_t rhs_z_dilation,
+    int64_t feature_group_count);
 
 }  // extern "C"
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_conv_impl.h
index f33e6d34e0285f..75efdf3e1777ad 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_conv_impl.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_conv_impl.h
@@ -23,25 +23,22 @@ limitations under the License.
 #include "tensorflow/core/kernels/eigen_contraction_kernel.h"
 #endif
 
-// 'tensorflow' namespace is used so that int64_t and other types don't require
-// qualification.
+// 'tensorflow' namespace is used so that types don't require qualification.
 namespace tensorflow {
 namespace xla {
 
 template <typename EigenDevice, typename ScalarType>
-void EigenConv2DImpl(const EigenDevice& device, ScalarType* out,
-                     ScalarType* lhs, ScalarType* rhs, Eigen::Index input_batch,
-                     Eigen::Index input_x, Eigen::Index input_y,
-                     Eigen::Index input_channels, Eigen::Index kernel_x,
-                     Eigen::Index kernel_y, Eigen::Index kernel_channels,
-                     Eigen::Index kernel_filters, Eigen::Index output_x,
-                     Eigen::Index output_y, Eigen::Index x_stride,
-                     Eigen::Index y_stride, Eigen::Index padding_x_before,
-                     Eigen::Index padding_x_after,
-                     Eigen::Index padding_y_before,
-                     Eigen::Index padding_y_after, Eigen::Index lhs_x_dilation,
-                     Eigen::Index lhs_y_dilation, Eigen::Index rhs_x_dilation,
-                     Eigen::Index rhs_y_dilation) {
+void EigenConv2DImpl(
+    const EigenDevice& device, ScalarType* out, ScalarType* lhs,
+    ScalarType* rhs, Eigen::Index input_batch, Eigen::Index input_x,
+    Eigen::Index input_y, Eigen::Index input_channels, Eigen::Index kernel_x,
+    Eigen::Index kernel_y, Eigen::Index kernel_channels,
+    Eigen::Index kernel_filters, Eigen::Index output_x, Eigen::Index output_y,
+    Eigen::Index x_stride, Eigen::Index y_stride, Eigen::Index padding_x_before,
+    Eigen::Index padding_x_after, Eigen::Index padding_y_before,
+    Eigen::Index padding_y_after, Eigen::Index lhs_x_dilation,
+    Eigen::Index lhs_y_dilation, Eigen::Index rhs_x_dilation,
+    Eigen::Index rhs_y_dilation, Eigen::Index feature_group_count) {
   const Eigen::TensorMap<Eigen::Tensor<const ScalarType, 4, Eigen::RowMajor>,
                          Eigen::Aligned>
       input(lhs, input_batch, input_x, input_y, input_channels);
@@ -54,39 +51,57 @@ void EigenConv2DImpl(const EigenDevice& device, ScalarType* out,
                    Eigen::Aligned>
       output(out, input_batch, output_x, output_y, kernel_filters);
 
-  Eigen::array<Eigen::IndexPair<int64_t>, 1> contract_dims;
-  contract_dims[0] = Eigen::IndexPair<int64_t>(1, 0);
+  Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> contract_dims;
+  contract_dims[0] = Eigen::IndexPair<Eigen::Index>(1, 0);
+
+  Eigen::DSizes<Eigen::Index, 5> input_reshaped_dims;
+  input_reshaped_dims[0] = input_batch;
+  input_reshaped_dims[1] = input_x;
+  input_reshaped_dims[2] = input_y;
+  input_reshaped_dims[3] = feature_group_count;
+  input_reshaped_dims[4] = input_channels / feature_group_count;
+
+  Eigen::DSizes<Eigen::Index, 5> output_reshaped_dims;
+  output_reshaped_dims[0] = input_batch;
+  output_reshaped_dims[1] = output_x;
+  output_reshaped_dims[2] = output_y;
+  output_reshaped_dims[3] = feature_group_count;
+  output_reshaped_dims[4] = kernel_filters / feature_group_count;
 
   // Molds the output of the patch extraction code into a 2d tensor:
   // - the first dimension (dims[0]): the patch values to be multiplied with the
   //   kernels
   // - the second dimension (dims[1]): everything else
-  Eigen::DSizes<int64_t, 2> pre_contract_dims;
+  Eigen::DSizes<Eigen::Index, 2> pre_contract_dims;
   pre_contract_dims[0] = output_y * output_x * input_batch;
   pre_contract_dims[1] = kernel_channels * kernel_y * kernel_x;
 
   // Molds the output of the contraction into the shape expected by the user:
-  Eigen::DSizes<int64_t, 4> post_contract_dims;
+  Eigen::DSizes<Eigen::Index, 4> post_contract_dims;
   post_contract_dims[0] = input_batch;
   post_contract_dims[1] = output_x;
   post_contract_dims[2] = output_y;
-  post_contract_dims[3] = kernel_filters;
+  post_contract_dims[3] = kernel_filters / feature_group_count;
 
-  Eigen::DSizes<int64_t, 2> kernel_dims;
+  Eigen::DSizes<Eigen::Index, 3> kernel_dims;
   kernel_dims[0] = kernel_channels * kernel_y * kernel_x;
-  kernel_dims[1] = kernel_filters;
-
-  // The row and column dimensions must be flipped when passed to Eigen.
-  output.device(device) =
-      input
-          .extract_image_patches(kernel_y, kernel_x, y_stride, x_stride,
-                                 rhs_y_dilation, rhs_x_dilation, lhs_y_dilation,
-                                 lhs_x_dilation, padding_y_before,
-                                 padding_y_after, padding_x_before,
-                                 padding_x_after, static_cast<ScalarType>(0.0f))
-          .reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims)
-          .reshape(post_contract_dims);
+  kernel_dims[1] = feature_group_count;
+  kernel_dims[2] = kernel_filters / feature_group_count;
+
+  for (Eigen::Index i = 0; i < feature_group_count; ++i) {
+    // The row and column dimensions must be flipped when passed to Eigen.
+    output.reshape(output_reshaped_dims).chip(i, 3).device(device) =
+        input.reshape(input_reshaped_dims)
+            .chip(i, 3)
+            .extract_image_patches(
+                kernel_y, kernel_x, y_stride, x_stride, rhs_y_dilation,
+                rhs_x_dilation, lhs_y_dilation, lhs_x_dilation,
+                padding_y_before, padding_y_after, padding_x_before,
+                padding_x_after, static_cast<ScalarType>(0.0f))
+            .reshape(pre_contract_dims)
+            .contract(kernel.reshape(kernel_dims).chip(i, 1), contract_dims)
+            .reshape(post_contract_dims);
+  }
 }
 
 template <typename EigenDevice, typename ScalarType>
@@ -103,7 +118,8 @@ void EigenConv3DImpl(
     Eigen::Index padding_z_before, Eigen::Index padding_z_after,
     Eigen::Index lhs_x_dilation, Eigen::Index lhs_y_dilation,
     Eigen::Index lhs_z_dilation, Eigen::Index rhs_x_dilation,
-    Eigen::Index rhs_y_dilation, Eigen::Index rhs_z_dilation) {
+    Eigen::Index rhs_y_dilation, Eigen::Index rhs_z_dilation,
+    Eigen::Index feature_group_count) {
   using ConstTType =
       Eigen::TensorMap<Eigen::Tensor<const ScalarType, 5, Eigen::RowMajor>,
                        Eigen::Aligned>;
@@ -117,42 +133,63 @@ void EigenConv3DImpl(
                    Eigen::Aligned>
       output(out, input_batch, output_x, output_y, output_z, kernel_filters);
 
-  Eigen::array<Eigen::IndexPair<int64_t>, 1> contract_dims;
-  contract_dims[0] = Eigen::IndexPair<int64_t>(1, 0);
+  Eigen::DSizes<Eigen::Index, 6> input_reshaped_dims;
+  input_reshaped_dims[0] = input_batch;
+  input_reshaped_dims[1] = input_x;
+  input_reshaped_dims[2] = input_y;
+  input_reshaped_dims[3] = input_z;
+  input_reshaped_dims[4] = feature_group_count;
+  input_reshaped_dims[5] = input_channels / feature_group_count;
+
+  Eigen::DSizes<Eigen::Index, 6> output_reshaped_dims;
+  output_reshaped_dims[0] = input_batch;
+  output_reshaped_dims[1] = output_x;
+  output_reshaped_dims[2] = output_y;
+  output_reshaped_dims[3] = output_z;
+  output_reshaped_dims[4] = feature_group_count;
+  output_reshaped_dims[5] = kernel_filters / feature_group_count;
+
+  Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> contract_dims;
+  contract_dims[0] = Eigen::IndexPair<Eigen::Index>(1, 0);
 
   // Molds the output of the patch extraction code into a 2d tensor:
   // - the first dimension (dims[0]): the patch values to be multiplied with the
   //   kernels
   // - the second dimension (dims[1]): everything else
-  Eigen::DSizes<int64_t, 2> pre_contract_dims;
+  Eigen::DSizes<Eigen::Index, 2> pre_contract_dims;
   pre_contract_dims[0] = output_x * output_y * output_z * input_batch;
   pre_contract_dims[1] = kernel_channels * kernel_x * kernel_y * kernel_z;
 
   // Molds the output of the contraction into the shape expected by the user:
-  Eigen::DSizes<int64_t, 5> post_contract_dims;
+  Eigen::DSizes<Eigen::Index, 5> post_contract_dims;
   post_contract_dims[0] = input_batch;
   post_contract_dims[1] = output_x;
   post_contract_dims[2] = output_y;
   post_contract_dims[3] = output_z;
-  post_contract_dims[4] = kernel_filters;
+  post_contract_dims[4] = kernel_filters / feature_group_count;
 
-  Eigen::DSizes<int64_t, 2> kernel_dims;
+  Eigen::DSizes<Eigen::Index, 3> kernel_dims;
   kernel_dims[0] = kernel_channels * kernel_x * kernel_y * kernel_z;
-  kernel_dims[1] = kernel_filters;
-
-  // The dimension order must be flipped when passed to Eigen.
-  auto patches = Eigen::TensorVolumePatchOp<Eigen::Dynamic, Eigen::Dynamic,
-                                            Eigen::Dynamic, const ConstTType>(
-      input, kernel_z, kernel_y, kernel_x, z_stride, y_stride, x_stride,
-      rhs_z_dilation, rhs_y_dilation, rhs_x_dilation, lhs_z_dilation,
-      lhs_y_dilation, lhs_x_dilation, padding_z_before, padding_z_after,
-      padding_y_before, padding_y_after, padding_x_before, padding_x_after,
-      static_cast<ScalarType>(0.0f));
-
-  output.device(device) =
-      patches.reshape(pre_contract_dims)
-          .contract(kernel.reshape(kernel_dims), contract_dims)
-          .reshape(post_contract_dims);
+  kernel_dims[1] = feature_group_count;
+  kernel_dims[2] = kernel_filters / feature_group_count;
+
+  for (Eigen::Index i = 0; i < feature_group_count; ++i) {
+    // The dimension order must be flipped when passed to Eigen.
+    auto input_chip = input.reshape(input_reshaped_dims).chip(i, 4);
+    auto patches =
+        Eigen::TensorVolumePatchOp<Eigen::Dynamic, Eigen::Dynamic,
+                                   Eigen::Dynamic, decltype(input_chip)>(
+            input_chip, kernel_z, kernel_y, kernel_x, z_stride, y_stride,
+            x_stride, rhs_z_dilation, rhs_y_dilation, rhs_x_dilation,
+            lhs_z_dilation, lhs_y_dilation, lhs_x_dilation, padding_z_before,
+            padding_z_after, padding_y_before, padding_y_after,
+            padding_x_before, padding_x_after, static_cast<ScalarType>(0.0f));
+
+    output.reshape(output_reshaped_dims).chip(i, 4).device(device) =
+        patches.reshape(pre_contract_dims)
+            .contract(kernel.reshape(kernel_dims).chip(i, 1), contract_dims)
+            .reshape(post_contract_dims);
+  }
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.cc
index a2e8c7006db6c4..95e908c9b614d3 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.cc
@@ -29,13 +29,14 @@ __xla_cpu_runtime_EigenSingleThreadedConv2DF16(
     int64_t col_stride, int64_t padding_top, int64_t padding_bottom,
     int64_t padding_left, int64_t padding_right, int64_t lhs_row_dilation,
     int64_t lhs_col_dilation, int64_t rhs_row_dilation,
-    int64_t rhs_col_dilation) {
+    int64_t rhs_col_dilation, int64_t feature_group_count) {
   tensorflow::xla::EigenConv2DImpl(
       Eigen::DefaultDevice(), out, lhs, rhs, input_batch, input_rows,
       input_cols, input_channels, kernel_rows, kernel_cols, kernel_channels,
       kernel_filters, output_rows, output_cols, row_stride, col_stride,
       padding_top, padding_bottom, padding_left, padding_right,
-      lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation);
+      lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
+      feature_group_count);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void
@@ -47,11 +48,13 @@ __xla_cpu_runtime_EigenSingleThreadedConv2DF32(
     int64_t output_cols, int64_t row_stride, int64_t col_stride,
     int64_t padding_top, int64_t padding_bottom, int64_t padding_left,
     int64_t padding_right, int64_t lhs_row_dilation, int64_t lhs_col_dilation,
-    int64_t rhs_row_dilation, int64_t rhs_col_dilation) {
+    int64_t rhs_row_dilation, int64_t rhs_col_dilation,
+    int64_t feature_group_count) {
   tensorflow::xla::EigenConv2DImpl(
       Eigen::DefaultDevice(), out, lhs, rhs, input_batch, input_rows,
       input_cols, input_channels, kernel_rows, kernel_cols, kernel_channels,
       kernel_filters, output_rows, output_cols, row_stride, col_stride,
       padding_top, padding_bottom, padding_left, padding_right,
-      lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation);
+      lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
+      feature_group_count);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h
index e2517a32fa1b55..88349890fa49b5 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h
@@ -30,7 +30,8 @@ extern void __xla_cpu_runtime_EigenSingleThreadedConv2DF16(
     int64_t row_stride, int64_t col_stride, int64_t padding_top,
     int64_t padding_bottom, int64_t padding_left, int64_t padding_right,
     int64_t lhs_row_dilation, int64_t lhs_col_dilation,
-    int64_t rhs_row_dilation, int64_t rhs_col_dilation);
+    int64_t rhs_row_dilation, int64_t rhs_col_dilation,
+    int64_t feature_group_count);
 
 extern void __xla_cpu_runtime_EigenSingleThreadedConv2DF32(
     const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
@@ -41,7 +42,7 @@ extern void __xla_cpu_runtime_EigenSingleThreadedConv2DF32(
     int64_t col_stride, int64_t padding_top, int64_t padding_bottom,
     int64_t padding_left, int64_t padding_right, int64_t lhs_row_dilation,
     int64_t lhs_col_dilation, int64_t rhs_row_dilation,
-    int64_t rhs_col_dilation);
+    int64_t rhs_col_dilation, int64_t feature_group_count);
 
 }  // extern "C"
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv3d.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv3d.cc
index 31eb5f6ef9c0ce..02337bcb55889d 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv3d.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv3d.cc
@@ -30,7 +30,8 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF32(
     int64_t padding_x_after, int64_t padding_y_before, int64_t padding_y_after,
     int64_t padding_z_before, int64_t padding_z_after, int64_t lhs_x_dilation,
     int64_t lhs_y_dilation, int64_t lhs_z_dilation, int64_t rhs_x_dilation,
-    int64_t rhs_y_dilation, int64_t rhs_z_dilation) {
+    int64_t rhs_y_dilation, int64_t rhs_z_dilation,
+    int64_t feature_group_count) {
   tensorflow::xla::EigenConv3DImpl(
       Eigen::DefaultDevice(), out, lhs, rhs, input_batch, input_x, input_y,
       input_z, input_channels, kernel_x, kernel_y, kernel_z, kernel_channels,
@@ -38,7 +39,7 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF32(
       z_stride, padding_x_before, padding_x_after, padding_y_before,
       padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
       lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
-      rhs_z_dilation);
+      rhs_z_dilation, feature_group_count);
 }
 
 TF_ATTRIBUTE_NO_SANITIZE_MEMORY void
@@ -52,7 +53,8 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF16(
     int64_t padding_x_after, int64_t padding_y_before, int64_t padding_y_after,
     int64_t padding_z_before, int64_t padding_z_after, int64_t lhs_x_dilation,
     int64_t lhs_y_dilation, int64_t lhs_z_dilation, int64_t rhs_x_dilation,
-    int64_t rhs_y_dilation, int64_t rhs_z_dilation) {
+    int64_t rhs_y_dilation, int64_t rhs_z_dilation,
+    int64_t feature_group_count) {
   tensorflow::xla::EigenConv3DImpl(
       Eigen::DefaultDevice(), out, lhs, rhs, input_batch, input_x, input_y,
       input_z, input_channels, kernel_x, kernel_y, kernel_z, kernel_channels,
@@ -60,5 +62,5 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF16(
       z_stride, padding_x_before, padding_x_after, padding_y_before,
       padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
       lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
-      rhs_z_dilation);
+      rhs_z_dilation, feature_group_count);
 }
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv3d.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv3d.h
index 0125b43200aa8f..c4958f81845983 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv3d.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv3d.h
@@ -32,7 +32,7 @@ extern void __xla_cpu_runtime_EigenSingleThreadedConv3DF16(
     int64_t padding_y_before, int64_t padding_y_after, int64_t padding_z_before,
     int64_t padding_z_after, int64_t lhs_x_dilation, int64_t lhs_y_dilation,
     int64_t lhs_z_dilation, int64_t rhs_x_dilation, int64_t rhs_y_dilation,
-    int64_t rhs_z_dilation);
+    int64_t rhs_z_dilation, int64_t feature_group_count);
 
 extern void __xla_cpu_runtime_EigenSingleThreadedConv3DF32(
     const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out,
@@ -44,7 +44,8 @@ extern void __xla_cpu_runtime_EigenSingleThreadedConv3DF32(
     int64_t padding_x_before, int64_t padding_x_after, int64_t padding_y_before,
     int64_t padding_y_after, int64_t padding_z_before, int64_t padding_z_after,
     int64_t lhs_x_dilation, int64_t lhs_y_dilation, int64_t lhs_z_dilation,
-    int64_t rhs_x_dilation, int64_t rhs_y_dilation, int64_t rhs_z_dilation);
+    int64_t rhs_x_dilation, int64_t rhs_y_dilation, int64_t rhs_z_dilation,
+    int64_t feature_group_count);
 
 }  // extern "C"
 
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
index 6da0da558b5528..9ad6ceea3da8fc 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
@@ -211,7 +211,8 @@ llvm::Value* VectorSupportLibrary::ComputeOffsetPointer(
     base_pointer =
         b()->CreateBitCast(base_pointer, scalar_pointer_type(), name());
   }
-  return b()->CreateInBoundsGEP(base_pointer, {offset_elements}, name());
+  return b()->CreateInBoundsGEP(scalar_type(), base_pointer, offset_elements,
+                                name());
 }
 
 llvm::Value* VectorSupportLibrary::LoadVector(llvm::Value* pointer) {
@@ -219,8 +220,8 @@ llvm::Value* VectorSupportLibrary::LoadVector(llvm::Value* pointer) {
     pointer = b()->CreateBitCast(pointer, vector_pointer_type(), name());
   }
   return b()->CreateAlignedLoad(
-      pointer, llvm::Align(ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_)),
-      name());
+      vector_type(), pointer,
+      llvm::Align(ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_)), name());
 }
 
 llvm::Value* VectorSupportLibrary::LoadScalar(llvm::Value* pointer) {
@@ -228,8 +229,8 @@ llvm::Value* VectorSupportLibrary::LoadScalar(llvm::Value* pointer) {
     pointer = b()->CreateBitCast(pointer, scalar_pointer_type(), name());
   }
   return b()->CreateAlignedLoad(
-      pointer, llvm::Align(ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_)),
-      name());
+      scalar_type(), pointer,
+      llvm::Align(ShapeUtil::ByteSizeOfPrimitiveType(primitive_type_)), name());
 }
 
 void VectorSupportLibrary::StoreVector(llvm::Value* value,
@@ -258,8 +259,8 @@ llvm::Value* VectorSupportLibrary::LoadBroadcast(llvm::Value* pointer) {
   if (pointer->getType() != scalar_pointer_type()) {
     pointer = b()->CreateBitCast(pointer, scalar_pointer_type(), name());
   }
-  return b()->CreateVectorSplat(vector_size(), b()->CreateLoad(pointer),
-                                name());
+  return b()->CreateVectorSplat(
+      vector_size(), b()->CreateLoad(scalar_type(), pointer), name());
 }
 
 llvm::Value* VectorSupportLibrary::AddReduce(llvm::Value* vector) {
@@ -420,7 +421,9 @@ LlvmVariable::LlvmVariable(llvm::Type* type, llvm::IRBuilder<>* b) : b_(b) {
   alloca_ = llvm_ir::EmitAllocaAtFunctionEntry(type, "", b_);
 }
 
-llvm::Value* LlvmVariable::Get() const { return b_->CreateLoad(alloca_); }
+llvm::Value* LlvmVariable::Get() const {
+  return b_->CreateLoad(alloca_->getType()->getPointerElementType(), alloca_);
+}
 
 void LlvmVariable::Set(llvm::Value* new_value) {
   b_->CreateStore(new_value, alloca_);
diff --git a/tensorflow/compiler/xla/service/dump.cc b/tensorflow/compiler/xla/service/dump.cc
index 4b9c46068eace7..3a4f7a3ced6df5 100644
--- a/tensorflow/compiler/xla/service/dump.cc
+++ b/tensorflow/compiler/xla/service/dump.cc
@@ -15,8 +15,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/dump.h"
 
+#include <memory>
+
 #include "absl/strings/ascii.h"
 #include "absl/strings/str_cat.h"
+#include "mlir/Transforms/LocationSnapshot.h"  // from @llvm-project
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
@@ -169,8 +172,9 @@ struct CanonicalDebugOptions {
   bool dump_hlo_metadata;
 };
 
-Status WriteStringToFile(tensorflow::Env* env, const string& fname,
-                         const tensorflow::StringPiece& data, bool compressed) {
+static Status WriteStringToFile(tensorflow::Env* env, const string& fname,
+                                const tensorflow::StringPiece& data,
+                                bool compressed) {
   if (!compressed) {
     return tensorflow::WriteStringToFile(env, fname, data);
   }
@@ -185,9 +189,8 @@ Status WriteStringToFile(tensorflow::Env* env, const string& fname,
   return gz_file.Close();
 }
 
-absl::optional<std::string> DumpToFileInDirImpl(
-    string_view filename, string_view contents,
-    const CanonicalDebugOptions& opts, bool compress = false) {
+static absl::optional<std::string> GetDumpFilePath(
+    string_view filename, const CanonicalDebugOptions& opts) {
   if (opts.dumping_to_stdout()) {
     LOG(ERROR) << "Refusing to write " << filename
                << " to stdout.  Pass --xla_dump_to=<path> to write to a file.";
@@ -244,18 +247,27 @@ absl::optional<std::string> DumpToFileInDirImpl(
     }
   }
 
-  string file_path =
-      tensorflow::io::JoinPath(dir, SanitizeFileName(string(filename)));
-  auto status = WriteStringToFile(env, file_path, contents, compress);
+  return tensorflow::io::JoinPath(dir, SanitizeFileName(string(filename)));
+}
+
+static absl::optional<std::string> DumpToFileInDirImpl(
+    string_view filename, string_view contents,
+    const CanonicalDebugOptions& opts, bool compress = false) {
+  auto file_path = GetDumpFilePath(filename, opts);
+  if (!file_path) return absl::nullopt;
+
+  auto status = WriteStringToFile(tensorflow::Env::Default(), *file_path,
+                                  contents, compress);
   if (!status.ok()) {
-    LOG(ERROR) << "Could not write XLA debug data to " << file_path << ": "
+    LOG(ERROR) << "Could not write XLA debug data to " << *file_path << ": "
                << status;
+    return absl::nullopt;
   }
 
   return file_path;
 }
 
-absl::optional<std::string> DumpToFileInDirOrStdoutImpl(
+static absl::optional<std::string> DumpToFileInDirOrStdoutImpl(
     string_view filename, string_view contents,
     const CanonicalDebugOptions& opts) {
   // Dump to stdout if that's called for.
@@ -270,12 +282,10 @@ absl::optional<std::string> DumpToFileInDirOrStdoutImpl(
 }
 
 // Returns full file paths of all dumps of the module.
-std::vector<std::string> DumpHloModuleImpl(const HloModule& module,
-                                           const BufferAssignment* buffer_assn,
-                                           const HloExecutionProfile* profile,
-                                           string_view prefix,
-                                           string_view suffix,
-                                           const CanonicalDebugOptions& opts) {
+static std::vector<std::string> DumpHloModuleImpl(
+    const HloModule& module, const BufferAssignment* buffer_assn,
+    const HloExecutionProfile* profile, string_view prefix, string_view suffix,
+    const CanonicalDebugOptions& opts) {
   string filename = FilenameFor(module, prefix, suffix);
 
   std::vector<absl::optional<std::string>> file_paths;
@@ -368,9 +378,9 @@ std::vector<std::string> DumpHloModuleImpl(const HloModule& module,
   return dumped_file_paths;
 }
 
-void DumpHloModuleMetadata(const HloModuleMetadataProto& metadata,
-                           const CanonicalDebugOptions& opts,
-                           absl::flat_hash_set<int64_t>* dumped_module_ids) {
+static void DumpHloModuleMetadata(
+    const HloModuleMetadataProto& metadata, const CanonicalDebugOptions& opts,
+    absl::flat_hash_set<int64_t>* dumped_module_ids) {
   // Return if metadata for this module has already been dumped.
   if (!dumped_module_ids->insert(metadata.canonical_module_id()).second) {
     return;
@@ -477,6 +487,21 @@ void DumpToFileInDirOrStdout(const DebugOptions& debug_options, int unique_id,
       CanonicalDebugOptions(debug_options));
 }
 
+void DumpToFileInDirOrStdout(const HloModule& module, string_view file_prefix,
+                             mlir::Operation* op) {
+  CanonicalDebugOptions opts(module.config().debug_options());
+  if (opts.dumping_to_stdout()) return op->dump();
+
+  auto file_path =
+      GetDumpFilePath(FilenameFor(module, file_prefix, "mlir"), opts);
+  if (!file_path) return;
+
+  // TODO(csigg): Change tag to file_prefix once BEF handles fused locs.
+  llvm::StringRef tag = "";
+  if (failed(mlir::generateLocationsFromIR(*file_path, tag, op, llvm::None)))
+    LOG(ERROR) << "Failed to dump op to " << *file_path;
+}
+
 void DumpExecutionOptions(const ExecutionOptions& execution_options,
                           const DebugOptions& debug_options) {
   CanonicalDebugOptions opts(debug_options);
diff --git a/tensorflow/compiler/xla/service/dump.h b/tensorflow/compiler/xla/service/dump.h
index 60037a30227465..2a2c33ee56172e 100644
--- a/tensorflow/compiler/xla/service/dump.h
+++ b/tensorflow/compiler/xla/service/dump.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_DUMP_H_
 
 #include "absl/strings/string_view.h"
+#include "mlir/IR/Operation.h"  // from @llvm-project
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
@@ -66,6 +67,14 @@ void DumpToFileInDirOrStdout(const DebugOptions& debug_options, int unique_id,
                              absl::string_view file_suffix,
                              absl::string_view contents);
 
+// Writes the given op to a file in the xla_dump_to directory specified by
+// module's DebugOptions. Sets the op's source locations to that file.
+//
+// If module doesn't have an xla_dump_to directory, does nothing.
+void DumpToFileInDirOrStdout(const HloModule& module,
+                             absl::string_view file_prefix,
+                             mlir::Operation* op);
+
 // Dumps the given execution options if dumping is enabled. Exactly
 // where and in what formats it's dumped is determined by the debug options.
 void DumpExecutionOptions(const ExecutionOptions& execution_options,
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index d67a95304d7374..e1f23f0f6e9257 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -2057,43 +2057,44 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalConcatenate(
 
   std::function<llvm::BasicBlock*(
       absl::Span<const std::pair<int64_t, const HloInstruction*>> operands)>
-      emit_tree = [&](absl::Span<
-                      const std::pair<int64_t, const HloInstruction*>>
-                          operands) {
-        llvm::IRBuilder<>::InsertPointGuard guard(*b_);
-        size_t mid = operands.size() / 2;
-        const std::pair<int64_t, const HloInstruction*>& pivot = operands[mid];
-        llvm::BasicBlock* block = llvm_ir::CreateBasicBlock(
-            exit_block, absl::StrCat("concatenate.pivot.", pivot.first, "."),
-            b_);
-        b_->SetInsertPoint(block);
-
-        // If there's only one element we're done. The range is contiguous so we
-        // can just jump to the block for it.
-        if (operands.size() == 1) {
-          const std::pair<int64_t, const HloInstruction*>& operand =
-              operands.back();
-          int64_t operand_id = to_unique_operand_id[operand.second];
-
-          source_index_phis[operand_id]->addIncoming(
-              source_index.GetConstantWithIndexType(operand.first),
-              b_->GetInsertBlock());
-          b_->CreateBr(emit_operand_blocks[operand_id]);
-          return block;
-        }
-
-        // Take the middle element and recurse.
-        llvm::Constant* pivot_const = llvm::ConstantInt::get(
-            source_index[concat_dim]->getType(), pivot.first);
-        llvm::Value* comp =
-            b_->CreateICmpULT(source_index[concat_dim], pivot_const);
-
-        llvm::BasicBlock* left_block = emit_tree(operands.subspan(0, mid));
-        llvm::BasicBlock* right_block = emit_tree(operands.subspan(mid));
-
-        b_->CreateCondBr(comp, left_block, right_block);
-        return block;
-      };
+      emit_tree =
+          [&](absl::Span<const std::pair<int64_t, const HloInstruction*>>
+                  operands) {
+            llvm::IRBuilder<>::InsertPointGuard guard(*b_);
+            size_t mid = operands.size() / 2;
+            const std::pair<int64_t, const HloInstruction*>& pivot =
+                operands[mid];
+            llvm::BasicBlock* block = llvm_ir::CreateBasicBlock(
+                exit_block,
+                absl::StrCat("concatenate.pivot.", pivot.first, "."), b_);
+            b_->SetInsertPoint(block);
+
+            // If there's only one element we're done. The range is contiguous
+            // so we can just jump to the block for it.
+            if (operands.size() == 1) {
+              const std::pair<int64_t, const HloInstruction*>& operand =
+                  operands.back();
+              int64_t operand_id = to_unique_operand_id[operand.second];
+
+              source_index_phis[operand_id]->addIncoming(
+                  source_index.GetConstantWithIndexType(operand.first),
+                  b_->GetInsertBlock());
+              b_->CreateBr(emit_operand_blocks[operand_id]);
+              return block;
+            }
+
+            // Take the middle element and recurse.
+            llvm::Constant* pivot_const = llvm::ConstantInt::get(
+                source_index[concat_dim]->getType(), pivot.first);
+            llvm::Value* comp =
+                b_->CreateICmpULT(source_index[concat_dim], pivot_const);
+
+            llvm::BasicBlock* left_block = emit_tree(operands.subspan(0, mid));
+            llvm::BasicBlock* right_block = emit_tree(operands.subspan(mid));
+
+            b_->CreateCondBr(comp, left_block, right_block);
+            return block;
+          };
 
   Br(emit_tree(cases));
 
@@ -3068,6 +3069,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitConvolution(
     const HloInstruction* convolution,
     const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator,
     const llvm_ir::IrArray::Index& index) {
+  TF_RET_CHECK(convolution->batch_group_count() == 1);
   const HloInstruction* lhs = convolution->operand(0);
   const auto& input_generator = operand_to_generator.at(lhs);
   const HloInstruction* rhs = convolution->operand(1);
@@ -3107,14 +3109,21 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitConvolution(
                 absl::StrCat("k", i))
             ->GetIndVarValue();
   }
+  const int64_t input_group_size =
+      rhs->shape().dimensions(dnums.kernel_input_feature_dimension());
+  const int64_t feature_group_count = convolution->feature_group_count();
+  const int64_t output_group_size =
+      rhs->shape().dimensions(dnums.kernel_output_feature_dimension()) /
+      feature_group_count;
   llvm::Value* input_feature =
-      loops
-          .AddLoop(0, lhs->shape().dimensions(dnums.input_feature_dimension()),
-                   "iz")
-          ->GetIndVarValue();
+      loops.AddLoop(0, input_group_size, "iz")->GetIndVarValue();
 
   SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), b_);
 
+  llvm::Value* group_id = SDiv(output_feature, b_->getInt64(output_group_size));
+  llvm::Value* lhs_input_feature =
+      NSWAdd(input_feature, NSWMul(group_id, b_->getInt64(input_group_size)));
+
   // Calculate the spatial index in the input array, taking striding, dilation
   // and padding into account. An index in the padding will be out of the bounds
   // of the array.
@@ -3180,7 +3189,7 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitConvolution(
   for (int i = 0; i < num_spatial_dims; ++i) {
     input_multi_index[dnums.input_spatial_dimensions(i)] = input_spatial[i];
   }
-  input_multi_index[dnums.input_feature_dimension()] = input_feature;
+  input_multi_index[dnums.input_feature_dimension()] = lhs_input_feature;
   input_multi_index[dnums.input_batch_dimension()] = batch;
 
   std::vector<llvm::Value*> kernel_multi_index(num_dims);
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 2ea7db42d0237d..aebde2127eb4dd 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -859,6 +859,7 @@ cc_library(
             "@tf_runtime//:hostcontext",
             "@tf_runtime//:mlirtobef_translate",
             "@tf_runtime//:support",
+            "@tf_runtime//backends/gpu:gpu_executor",
             "@tf_runtime//backends/gpu:gpu_kernels_alwayslink",
             "@tf_runtime//backends/gpu:gpu_opdefs",
             "@tf_runtime//backends/gpu:gpu_passes",
@@ -866,6 +867,7 @@ cc_library(
         ],
         ":is_bef_executable_enabled": [
             ":xlir_kernels",
+            "@llvm-project//llvm:Support",
             "//tensorflow/core/tfrt/runtime:work_queue_interface",
             "//tensorflow/stream_executor/cuda:cuda_driver",
             "//tensorflow/stream_executor/gpu:gpu_executor_header",
@@ -876,6 +878,7 @@ cc_library(
             "@tf_runtime//:hostcontext",
             "@tf_runtime//:support",
             "@tf_runtime//:tensor_alwayslink",
+            "@tf_runtime//backends/gpu:gpu_executor",
             "@tf_runtime//backends/gpu:gpu_kernels_alwayslink",
             "@tf_runtime//backends/gpu:gpu_passes",
             "@tf_runtime//backends/gpu:gpu_types",
@@ -2585,18 +2588,16 @@ test_suite(
         # Disabled: Repeated Kernel BefThunk invocations prolong test duration.
         # E.g., it takes ~1300s for 256x128 matrix.
         # "//tensorflow/compiler/xla/client/lib:svd_test_gpu",
+        "//tensorflow/compiler/xla/service/gpu:cudnn_fused_conv_rewriter_test",
         "//tensorflow/compiler/xla/service/gpu:custom_call_test",
         "//tensorflow/compiler/xla/service/gpu/tests:gemm_broadcast_folding_rewrite_test",
         "//tensorflow/compiler/xla/service/gpu/tests:gpu_copy_test",
         "//tensorflow/compiler/xla/service/gpu/tests:kernel_launch_test",
         "//tensorflow/compiler/xla/service/gpu/tests:mlir_gemm_test",
-        # TODO(b/187959015): Enable when complex number types are supported.
-        #"//tensorflow/compiler/xla/tests:cholesky_test_gpu",
-        # TODO(b/187959015): Enable when complex number types are supported.
-        #"//tensorflow/compiler/xla/tests:dot_operation_test_gpu",
+        "//tensorflow/compiler/xla/tests:cholesky_test_gpu",
+        "//tensorflow/compiler/xla/tests:dot_operation_test_gpu",
         "//tensorflow/compiler/xla/tests:multioutput_fusion_test_gpu",
-        # TODO(b/187959015): Enable when complex number types are supported.
-        # "//tensorflow/compiler/xla/tests:triangular_solve_test_gpu",
+        "//tensorflow/compiler/xla/tests:triangular_solve_test_gpu",
         "//tensorflow/compiler/xla/tests:while_test_gpu",
     ],
 )
@@ -2604,6 +2605,7 @@ test_suite(
 test_suite(
     name = "bef_executable_tests",
     tests = [
+        "//tensorflow/compiler/xla/service/gpu:cudnn_fused_conv_rewriter_test",
         "//tensorflow/compiler/xla/service/gpu:custom_call_test",
         "//tensorflow/compiler/xla/service/gpu/tests:add_preds.hlo.test",
         "//tensorflow/compiler/xla/service/gpu/tests:all_reduce.hlo.test",
@@ -2650,6 +2652,7 @@ test_suite(
         "//tensorflow/compiler/xla/service/gpu/tests:sorting.hlo.test",
         "//tensorflow/compiler/xla/service/gpu/tests:sorting_test",
         "//tensorflow/compiler/xla/service/gpu/tests:tree_reduction_rewriter_test",
+        "//tensorflow/compiler/xla/tests:cholesky_test_gpu",
         "//tensorflow/compiler/xla/tests:multioutput_fusion_test_gpu",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/gpu/bef_thunk.cc b/tensorflow/compiler/xla/service/gpu/bef_thunk.cc
index c849db650235d3..0f21e1b53ee853 100644
--- a/tensorflow/compiler/xla/service/gpu/bef_thunk.cc
+++ b/tensorflow/compiler/xla/service/gpu/bef_thunk.cc
@@ -21,8 +21,10 @@ limitations under the License.
 
 #if BEF_THUNKS
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/SourceMgr.h"
 #include "mlir/Dialect/GPU/Passes.h"  // from @llvm-project
 #include "mlir/IR/BlockAndValueMapping.h"  // from @llvm-project
+#include "mlir/IR/Diagnostics.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/lhlo/IR/lhlo_ops.h"
 #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/lhlo_gpu/IR/lhlo_gpu_ops.h"
@@ -41,6 +43,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/device_memory.h"
 #include "tensorflow/stream_executor/gpu/gpu_executor.h"
 #include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tfrt/gpu/gpu_executor.h"  // from @tf_runtime
 #include "tfrt/gpu/gpu_types.h"  // from @tf_runtime
 #include "tfrt/gpu/kernels/gpu_ops.h"  // from @tf_runtime
 #include "tfrt/gpu/passes/passes.h"  // from @tf_runtime
@@ -69,6 +72,7 @@ bool IsBefThunkEnabled() { return true; }
 namespace {
 
 struct CoreRuntimeAndWorkQueue {
+  mlir::MLIRContext* mlir_ctx;
   tfrt::CoreRuntime* core_runtime;
   tensorflow::tfrt_stub::WorkQueueInterface* work_queue;
 };
@@ -254,44 +258,34 @@ static StatusOr<Thunk::Kind> GetThunkKind(mlir::Operation* op) {
 }
 
 static StatusOr<CoreRuntimeAndWorkQueue> GetCoreRuntimeAndWorkQueue() {
-  // TODO(hanbinyoon): Make these configurable.
-  int tfrt_num_threads = tensorflow::port::MaxParallelism();
-  int tfrt_num_blocking_threads = 16;
-
-  static StatusOr<CoreRuntimeAndWorkQueue>* runtime_and_queue_or =
-      [&](int num_threads, int num_blocking_threads) {
-        // Create work queue.
-        auto work_queue = tensorflow::tfrt_stub::WrapDefaultWorkQueue(
-            tfrt::CreateMultiThreadedWorkQueue(num_threads,
-                                               num_blocking_threads));
-        if (work_queue == nullptr) {
-          auto status =
-              tensorflow::errors::Internal("Failed to create TFRT work queue.");
-          return new StatusOr<CoreRuntimeAndWorkQueue>(status);
-        }
-        auto* work_queue_ptr = work_queue.get();
-
-        // Create core runtime.
-        auto expected_core_runtime = tfrt::CoreRuntime::Create(
-            [](const tfrt::DecodedDiagnostic& diag) {
-              LOG(ERROR) << tfrt::StrCat(diag);
-            },
-            tfrt::CreateMallocAllocator(), std::move(work_queue),
-            kDefaultHostDeviceName);
-        if (!expected_core_runtime) {
-          auto error = expected_core_runtime.takeError();
-          auto status =
-              tensorflow::errors::Internal(llvm::toString(std::move(error)));
-          return new StatusOr<CoreRuntimeAndWorkQueue>(status);
-        }
-
-        auto runtime_and_queue = CoreRuntimeAndWorkQueue{
-            expected_core_runtime->release(), work_queue_ptr};
-        return new StatusOr<CoreRuntimeAndWorkQueue>(runtime_and_queue);
-      }(tfrt_num_threads, tfrt_num_blocking_threads);
-
-  TF_RETURN_IF_ERROR(runtime_and_queue_or->status());
-  return runtime_and_queue_or->ValueOrDie();
+  static auto runtime_and_queue_or =
+      [&]() -> StatusOr<CoreRuntimeAndWorkQueue> {
+    // TODO(hanbinyoon): Make these configurable.
+    int num_threads = tensorflow::port::MaxParallelism();
+    int num_blocking_threads = 16;
+
+    // Create work queue.
+    auto work_queue = tensorflow::tfrt_stub::WrapDefaultWorkQueue(
+        tfrt::CreateMultiThreadedWorkQueue(num_threads, num_blocking_threads));
+    if (work_queue == nullptr) {
+      return tensorflow::errors::Internal("Failed to create TFRT work queue.");
+    }
+    auto* work_queue_ptr = work_queue.get();
+    auto* mlir_ctx = new mlir::MLIRContext;
+
+    // Create core runtime.
+    auto expected_core_runtime = tfrt::CoreRuntime::Create(
+        tfrt::gpu::GetDiagHandler(mlir_ctx), tfrt::CreateMallocAllocator(),
+        std::move(work_queue), kDefaultHostDeviceName);
+    if (!expected_core_runtime) {
+      auto error = expected_core_runtime.takeError();
+      return tensorflow::errors::Internal(llvm::toString(std::move(error)));
+    }
+
+    return CoreRuntimeAndWorkQueue{mlir_ctx, expected_core_runtime->release(),
+                                   work_queue_ptr};
+  }();
+  return runtime_and_queue_or;
 }
 
 // Creates a TFRT module that loads the GPU module and launches the target
@@ -673,6 +667,14 @@ Status BefThunk::ExecuteOnStream(const ExecuteParams& params) {
   if (function->num_results() != 1)
     return tensorflow::errors::Internal("Unexpected result count.");
 
+  TF_ASSIGN_OR_RETURN(auto runtime_and_queue, GetCoreRuntimeAndWorkQueue());
+  // Capture errors and augment with source.
+  std::string diag_str;
+  llvm::raw_string_ostream diag_os(diag_str);
+  llvm::SourceMgr src_mgr;
+  mlir::SourceMgrDiagnosticHandler handler(src_mgr, runtime_and_queue.mlir_ctx,
+                                           diag_os);
+
   // Execute the function.
   function->Execute(*exec_ctx, args, {result});
 
@@ -688,7 +690,8 @@ Status BefThunk::ExecuteOnStream(const ExecuteParams& params) {
   }
 #endif  // XLA_ENABLE_XCCL
 
-  // Report error if any.
+  // Report error if any, from handler and result.
+  if (diag_os.tell()) return tensorflow::errors::Internal(diag_os.str());
   if (auto* error = result->GetErrorIfPresent())
     return tensorflow::errors::Internal(tfrt::StrCat(*error));
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 70244e7b2137e5..db0df893212833 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "mlir/InitAllDialects.h"  // from @llvm-project
 #include "mlir/Pass/PassManager.h"  // from @llvm-project
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"  // from @llvm-project
+#include "mlir/Transforms/LocationSnapshot.h"  // from @llvm-project
 #include "mlir/Transforms/Passes.h"  // from @llvm-project
 #include "tensorflow/compiler/mlir/utils/name_utils.h"
 #include "tensorflow/compiler/mlir/xla/hlo_utils.h"
@@ -760,10 +761,7 @@ static StatusOr<OwnedBefBuffer> LowerToBef(mlir::ModuleOp mlir_module,
   TF_RETURN_IF_ERROR(tensorflow::ConvertLmhloToTfrtGpuWithBinary(mlir_module));
 
   if (DumpingEnabledForHloModule(*hlo_module)) {
-    std::string tfrt_mlir;
-    llvm::raw_string_ostream tfrt_mlir_ostream(tfrt_mlir);
-    mlir_module.print(tfrt_mlir_ostream);
-    DumpToFileInDirOrStdout(*hlo_module, "", "tfrt_mlir", tfrt_mlir);
+    DumpToFileInDirOrStdout(*hlo_module, "tfrt_gpu", mlir_module);
   }
 
   // TFRT Dialect -> BEF
@@ -774,7 +772,7 @@ static StatusOr<OwnedBefBuffer> LowerToBef(mlir::ModuleOp mlir_module,
   }
 
   if (DumpingEnabledForHloModule(*hlo_module)) {
-    DumpToFileInDirOrStdout(*hlo_module, "", "tfrt_bef", bef);
+    DumpToFileInDirOrStdout(*hlo_module, "", "bef", bef);
   }
 
   auto ptr = static_cast<uint8_t*>(
@@ -857,8 +855,9 @@ static Status CompileModuleToLlvmIrImpl(
 
   results->module_name = mlir::GetNameFromLoc(mlir_module->getLoc());
 
-  llvm_ir::DumpIrIfEnabled(mlir_module.get(), hlo_module->unique_id(),
-                           hlo_module->config().debug_options());
+  if (DumpingEnabledForHloModule(*hlo_module)) {
+    DumpToFileInDirOrStdout(*hlo_module, "lmhlo", mlir_module.get());
+  }
 
   auto entry_function = mlir::cast<mlir::FuncOp>(
       mlir_module->lookupSymbol(hlo_module->entry_computation()->name()));
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 9d389d33ba3553..d349022f84681a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -47,10 +47,13 @@ limitations under the License.
 #include "tensorflow/stream_executor/platform.h"
 
 #if BEF_EXECUTABLE
+#include "llvm/Support/SourceMgr.h"
+#include "mlir/IR/Diagnostics.h"  // from @llvm-project
 #include "tensorflow/core/tfrt/runtime/work_queue_interface.h"
 #include "tensorflow/stream_executor/cuda/cuda_driver.h"
 #include "tensorflow/stream_executor/gpu/gpu_executor.h"
 #include "tensorflow/stream_executor/gpu/gpu_stream.h"
+#include "tfrt/gpu/gpu_executor.h"  // from @tf_runtime
 #include "tfrt/gpu/gpu_types.h"  // from @tf_runtime
 #include "tfrt/bef/bef_buffer.h"  // from @tf_runtime
 #include "tfrt/bef_executor/bef_file.h"  // from @tf_runtime
@@ -465,6 +468,7 @@ static const char kDefaultHostDeviceName[] =
     "/job:localhost/replica:0/task:0/device:CPU:0";
 
 struct CoreRuntimeAndWorkQueue {
+  mlir::MLIRContext* mlir_ctx;
   tfrt::CoreRuntime* core_runtime;
   tensorflow::tfrt_stub::WorkQueueInterface* work_queue;
 };
@@ -472,44 +476,34 @@ struct CoreRuntimeAndWorkQueue {
 // TODO(hanbinyoon): Deduplicate with that in bef_thunk.cc when
 // tensorflow/core/tfrt/runtime is generally available in OSS.
 StatusOr<CoreRuntimeAndWorkQueue> GetCoreRuntimeAndWorkQueue() {
-  // TODO(hanbinyoon): Make these configurable.
-  int tfrt_num_threads = tensorflow::port::MaxParallelism();
-  int tfrt_num_blocking_threads = 16;
-
-  static StatusOr<CoreRuntimeAndWorkQueue>* runtime_and_queue_or =
-      [&](int num_threads, int num_blocking_threads) {
-        // Create work queue.
-        auto work_queue = tensorflow::tfrt_stub::WrapDefaultWorkQueue(
-            tfrt::CreateMultiThreadedWorkQueue(num_threads,
-                                               num_blocking_threads));
-        if (work_queue == nullptr) {
-          auto status =
-              tensorflow::errors::Internal("Failed to create TFRT work queue.");
-          return new StatusOr<CoreRuntimeAndWorkQueue>(status);
-        }
-        auto* work_queue_ptr = work_queue.get();
-
-        // Create core runtime.
-        auto expected_core_runtime = tfrt::CoreRuntime::Create(
-            [](const tfrt::DecodedDiagnostic& diag) {
-              LOG(ERROR) << tfrt::StrCat(diag);
-            },
-            tfrt::CreateMallocAllocator(), std::move(work_queue),
-            kDefaultHostDeviceName);
-        if (!expected_core_runtime) {
-          auto error = expected_core_runtime.takeError();
-          auto status =
-              tensorflow::errors::Internal(llvm::toString(std::move(error)));
-          return new StatusOr<CoreRuntimeAndWorkQueue>(status);
-        }
-
-        auto runtime_and_queue = CoreRuntimeAndWorkQueue{
-            expected_core_runtime->release(), work_queue_ptr};
-        return new StatusOr<CoreRuntimeAndWorkQueue>(runtime_and_queue);
-      }(tfrt_num_threads, tfrt_num_blocking_threads);
+  static auto runtime_and_queue_or =
+      [&]() -> StatusOr<CoreRuntimeAndWorkQueue> {
+    // TODO(hanbinyoon): Make these configurable.
+    int num_threads = tensorflow::port::MaxParallelism();
+    int num_blocking_threads = 16;
+
+    // Create work queue.
+    auto work_queue = tensorflow::tfrt_stub::WrapDefaultWorkQueue(
+        tfrt::CreateMultiThreadedWorkQueue(num_threads, num_blocking_threads));
+    if (work_queue == nullptr) {
+      return tensorflow::errors::Internal("Failed to create TFRT work queue.");
+    }
+    auto* work_queue_ptr = work_queue.get();
+    auto* mlir_ctx = new mlir::MLIRContext;
+
+    // Create core runtime.
+    auto expected_core_runtime = tfrt::CoreRuntime::Create(
+        tfrt::gpu::GetDiagHandler(mlir_ctx), tfrt::CreateMallocAllocator(),
+        std::move(work_queue), kDefaultHostDeviceName);
+    if (!expected_core_runtime) {
+      auto error = expected_core_runtime.takeError();
+      return tensorflow::errors::Internal(llvm::toString(std::move(error)));
+    }
 
-  TF_RETURN_IF_ERROR(runtime_and_queue_or->status());
-  return runtime_and_queue_or->ValueOrDie();
+    return CoreRuntimeAndWorkQueue{mlir_ctx, expected_core_runtime->release(),
+                                   work_queue_ptr};
+  }();
+  return runtime_and_queue_or;
 }
 
 // TODO(hanbinyoon): Deduplicate with that in bef_thunk.cc when
@@ -610,13 +604,21 @@ static Status ExecuteBef(const std::string& module_name,
   if (function->num_results() != 1)
     return InternalError("Unexpected result count.");
 
+  // Capture errors and augment with source.
+  std::string diag_str;
+  llvm::raw_string_ostream diag_os(diag_str);
+  llvm::SourceMgr src_mgr;
+  mlir::SourceMgrDiagnosticHandler handler(src_mgr, runtime_and_queue.mlir_ctx,
+                                           diag_os);
+
   // Execute the function.
   function->Execute(exec_ctx, args, {result});
 
   // Wait for async execution to complete.
   tfrt::Await(exec_ctx, llvm::makeArrayRef(result));
 
-  // Report error if any.
+  // Report error if any, from handler and result.
+  if (diag_os.tell()) return tensorflow::errors::Internal(diag_os.str());
   if (auto* error = result->GetErrorIfPresent())
     return tensorflow::errors::Internal(tfrt::StrCat(*error));
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 6f8cc92c611c26..2c1db783bc86ce 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -418,8 +418,9 @@ llvm::Value* EmitPrintf(absl::string_view fmt,
                                      /*isSigned=*/true);
     }
     builder->CreateStore(
-        value, builder->CreateGEP(arguments_ptr, {builder->getInt64(0),
-                                                  builder->getInt32(i)}));
+        value,
+        builder->CreateGEP(arguments_type, arguments_ptr,
+                           {builder->getInt64(0), builder->getInt32(i)}));
   }
   llvm::Type* ptr_ty = builder->getInt8Ty()->getPointerTo();
   return builder->CreateCall(
@@ -659,7 +660,7 @@ StatusOr<BufferAllocation::Slice> GetAllocationSlice(
         &allocations[GetAllocationIndex(
             view.source().cast<mlir::BlockArgument>(), constant_name)],
         mlir::cast<mlir::arith::ConstantOp>(view.byte_shift().getDefiningOp())
-            .value()
+            .getValue()
             .cast<mlir::IntegerAttr>()
             .getValue()
             .getSExtValue(),
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 3dec8d10271623..5ee876d4870afe 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -677,7 +677,8 @@ llvm::Value* IrEmitterUnnested::CreateLoad(llvm::Value* address,
   int data_bytes = data_type->getPrimitiveSizeInBits() /
                    primitive_util::BitWidth(PrimitiveType::U8);
   if (alignment_bytes == 0) {
-    return b_.CreateLoad(b_.CreateBitCast(address, data_type->getPointerTo()));
+    return b_.CreateLoad(data_type,
+                         b_.CreateBitCast(address, data_type->getPointerTo()));
   }
 
   int alignment_bitwidth =
@@ -896,7 +897,7 @@ Status IrEmitterUnnested::EmitSliceToDynamic(mlir::Operation* op) {
   for (int64_t i = 1; i < slice_to_dynamic.args().size(); ++i) {
     // const int64_t dim_index = i - 1;
     llvm::Value* source_buffer = ir_arrays[i].GetBasePointer();
-    llvm::LoadInst* dyn_dim_size = b_.CreateLoad(source_buffer, "dyn_dim_size");
+    llvm::LoadInst* dyn_dim_size = Load(source_buffer, "dyn_dim_size");
     dynamic_dims.push_back(dyn_dim_size);
   }
 
@@ -3829,9 +3830,9 @@ void IrEmitterUnnested::EmitTileElementForFusion(
         // spaces are inferred (which is pretty late in the pipeline), so
         // even if we had address-space-based AA in LLVM, it wouldn't help
         // us much here.
-        return b_.CreateLoad(thread_id_info.GEPIntoSharedMemory(
-                                 &b_, param_tile_buffer, {x_loc, y_loc}),
-                             "tiled_buffer");
+        return Load(thread_id_info.GEPIntoSharedMemory(&b_, param_tile_buffer,
+                                                       {x_loc, y_loc}),
+                    "tiled_buffer");
       };
     } else {
       auto array = operand_arrays[i];
@@ -3903,9 +3904,8 @@ ReductionCodegenState IrEmitterUnnested::GenerateReductionCodegenState(
                                        .ValueOrDie();
 
       for (int i = 0; i < num_partial_results; ++i) {
-        b_.CreateStore(
-            init_ir_value,
-            b_.CreateInBoundsGEP(partial_result_address, {b_.getInt32(i)}));
+        b_.CreateStore(init_ir_value,
+                       InBoundsGEP(partial_result_address, {b_.getInt32(i)}));
       }
 
       const TilingScheme& tiling_scheme =
@@ -3983,7 +3983,8 @@ void IrEmitterUnnested::EmitFullWarpShuffleDownLoopForReduce(
 
       llvm::Value* partial_result_address = partial_result_addresses[oidx];
       llvm::Value* partial_result =
-          b_.CreateLoad(convert_pointer_for_shuffle(partial_result_address),
+          b_.CreateLoad(shuffled_value_type,
+                        convert_pointer_for_shuffle(partial_result_address),
                         "partial_reduction_result");
       b_.CreateStore(
           EmitFullWarpShuffleDown(partial_result, b_.getInt32(distance), &b_),
@@ -4138,9 +4139,9 @@ void IrEmitterUnnested::EmitReductionOutputForRowReduction(
   for (int output_idx = 0; output_idx < num_outputs; output_idx++) {
     const ReductionCodegenState::ReductionCalculationState& state =
         reduction_codegen_state.GetCalculationStateFor(reduction, output_idx);
-    current_outputs.push_back(
-        b_.CreateInBoundsGEP(state.partial_result_address,
-                             {constant(partial_result_idx)}, "current_output"));
+    current_outputs.push_back(InBoundsGEP(state.partial_result_address,
+                                          {constant(partial_result_idx)},
+                                          "current_output"));
   }
 
   EmitFullWarpShuffleDownLoopForReduce(reducer, current_outputs,
@@ -4156,7 +4157,7 @@ void IrEmitterUnnested::EmitReductionOutputForRowReduction(
           reduction_codegen_state.GetCalculationStateFor(reduction, oidx);
       llvm::Value* shmem_output_addr = thread_id_info.GEPIntoSharedMemory(
           &b_, state.shared_cache, {constant(partial_result_idx), warp_id});
-      b_.CreateStore(b_.CreateLoad(current_outputs[oidx]), shmem_output_addr);
+      Store(Load(current_outputs[oidx]), shmem_output_addr);
     }
   });
 
@@ -4201,8 +4202,7 @@ void IrEmitterUnnested::EmitReductionOutputForRowReduction(
             tiling_kernel_info, output_arrays, reduction, oidx);
 
         if (reduction_codegen_state.IsRaceFree()) {
-          b_.CreateStore(b_.CreateLoad(selected_values[oidx], "output"),
-                         output_address);
+          Store(Load(selected_values[oidx], "output"), output_address);
         } else {
           CHECK_EQ(num_outputs, 1);
           TF_CHECK_OK(EmitAtomicOperationForNestedComputation(
@@ -4242,10 +4242,10 @@ void IrEmitterUnnested::EmitReductionOutputForColumnReduction(
          thread_id_info.thread_id_y},
         "shmem_output_address");
     llvm::Value* current_output =
-        b_.CreateInBoundsGEP(state.partial_result_address,
-                             {constant(partial_result_idx)}, "current_output");
+        InBoundsGEP(state.partial_result_address,
+                    {constant(partial_result_idx)}, "current_output");
 
-    llvm::Value* current_output_value = b_.CreateLoad(current_output);
+    llvm::Value* current_output_value = Load(current_output);
     b_.CreateStore(current_output_value, shmem_output_addr);
   }
 
@@ -4285,9 +4285,8 @@ void IrEmitterUnnested::EmitReductionOutputForColumnReduction(
                  partial_result_idx, index_ty, reduction_codegen_state,
                  tiling_kernel_info, output_arrays, reduction, oidx);
              if (reduction_codegen_state.IsRaceFree()) {
-               b_.CreateStore(
-                   b_.CreateLoad(shmem_transposed_addrs[oidx], "output_value"),
-                   output_address);
+               Store(Load(shmem_transposed_addrs[oidx], "output_value"),
+                     output_address);
              } else {
                CHECK_EQ(num_outputs, 1);
                TF_CHECK_OK(EmitAtomicOperationForNestedComputation(
@@ -4956,7 +4955,8 @@ llvm::Value* IrEmitterUnnested::ThreadIdInfo::GEPIntoSharedMemory(
   idxs_scaled.push_back(scaling);
   idxs_scaled.insert(idxs_scaled.end(), idx_major_to_minor.begin(),
                      idx_major_to_minor.end());
-  llvm::Value* gep = b->CreateInBoundsGEP(shared, idxs_scaled, name);
+  llvm::Value* gep = b->CreateInBoundsGEP(
+      shared->getType()->getPointerElementType(), shared, idxs_scaled, name);
 
   // __shared__ memory uses a different address space, so we cast it to
   // global address space before writing or reading.
@@ -5107,8 +5107,8 @@ void IrEmitterUnnested::GenerateElementForReducer(
     llvm::Value* const input_ir_value = *state.input_gen(
         num_partial_results > 1 ? index_without_linear : input_index);
     b_.CreateStore(input_ir_value, input_address);
-    llvm::Value* partial_result_address = b_.CreateInBoundsGEP(
-        partial_reduction_result_address, {partial_result_index});
+    llvm::Value* partial_result_address =
+        InBoundsGEP(partial_reduction_result_address, {partial_result_index});
     reduction_accumulators.push_back(partial_result_address);
     reduction_input_value.push_back(input_address);
   }
diff --git a/tensorflow/compiler/xla/service/gpu/nccl_utils.cc b/tensorflow/compiler/xla/service/gpu/nccl_utils.cc
index 14aede10e571bf..776e02ff3d9608 100644
--- a/tensorflow/compiler/xla/service/gpu/nccl_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/nccl_utils.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/errors.h"
 
 namespace xla {
@@ -272,6 +273,31 @@ class NcclCliqueRendezvous
   absl::BlockingCounter* counter_;
 };
 
+// Periodically checks all NCCL communicators for asynchronous errors.
+// If an asynchronous error is observed, the communicator is aborted and an
+// error message logged.
+void CheckNcclAsyncErrors() {
+  while (true) {
+    absl::SleepFor(absl::Seconds(30));
+
+    NcclCliqueCache().ForEach([](const auto&, const NcclClique& clique) {
+      for (const auto& it : clique.GetComms()) {
+        ncclComm_t comm = it.second.get();
+        Status status = [comm] {
+          ncclResult_t async_err;
+          XLA_CUDA_RETURN_IF_ERROR(ncclCommGetAsyncError(comm, &async_err));
+          if (async_err != ncclSuccess) {
+            LOG(ERROR) << "Async NCCL error. Aborting communicator: " << comm;
+            XLA_CUDA_RETURN_IF_ERROR(ncclCommAbort(comm));
+          }
+          return XLA_CUDA_STATUS(async_err);
+        }();
+        if (!status.ok()) LOG(ERROR) << status.ToString();
+      }
+    });
+  }
+}
+
 }  // namespace
 
 StatusOr<std::vector<LocalParticipant>> GetLocalParticipants(
@@ -360,6 +386,13 @@ StatusOr<LockedNcclClique> AcquireNcclClique(
     const NcclCliqueParticipantData& participant,
     const std::vector<LocalParticipant>& local_participants,
     const NcclUniqueIdCallback* callback) {
+  // Launch a thread to check for async NCCL errors.
+  static auto check_async_error_thread =
+      tensorflow::Env::Default()->StartThread(tensorflow::ThreadOptions(),
+                                              "nccl_async_error_thread",
+                                              CheckNcclAsyncErrors);
+  (void)check_async_error_thread;  // Silence unused variable warning.
+
   VLOG(2) << "Rendezvous key: " << participant.rendezvous_key.ToString()
           << ", local participants: "
           << LocalParticipantsToString(local_participants);
diff --git a/tensorflow/compiler/xla/service/gpu/nccl_utils.h b/tensorflow/compiler/xla/service/gpu/nccl_utils.h
index abe1471b2e0a0e..9c96d42ac0d7a0 100644
--- a/tensorflow/compiler/xla/service/gpu/nccl_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/nccl_utils.h
@@ -99,14 +99,16 @@ using NcclComm = std::unique_ptr<ncclComm, void (*)(ncclComm_t)>;
 // GPUs, you'll need a different clique.
 class NcclClique {
  public:
-  explicit NcclClique(
-      absl::flat_hash_map<int, NcclComm> comms_by_device_ordinal);
+  using CommMap = absl::flat_hash_map<int, NcclComm>;
+
+  explicit NcclClique(CommMap comms_by_device_ordinal);
 
   ncclComm_t GetCommForDeviceOrdinal(int device_ordinal) const;
+  const CommMap& GetComms() const { return comms_by_device_ordinal_; }
   absl::Mutex* mu() { return &mu_; }
 
  private:
-  absl::flat_hash_map<int, NcclComm> comms_by_device_ordinal_;
+  CommMap comms_by_device_ordinal_;
   absl::Mutex mu_;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
index b49cc60ee2a36b..006330fec34b28 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc
@@ -496,6 +496,8 @@ StatusOr<se::dnn::DataType> GetDNNDataTypeFromPrimitiveType(
       return se::dnn::ToDataType<int8>::value;
     case S32:
       return se::dnn::ToDataType<int32>::value;
+    case BF16:
+      return se::dnn::ToDataType<Eigen::bfloat16>::value;
     default:
       break;
   }
diff --git a/tensorflow/compiler/xla/service/gpu/xlir_ops.td b/tensorflow/compiler/xla/service/gpu/xlir_ops.td
index ddd85107b3901c..1f8ededa5c3c76 100644
--- a/tensorflow/compiler/xla/service/gpu/xlir_ops.td
+++ b/tensorflow/compiler/xla/service/gpu/xlir_ops.td
@@ -38,6 +38,8 @@ def XLIR_Dialect : Dialect {
   }];
 
   let cppNamespace = "xla::gpu";
+
+  let emitAccessorPrefix = kEmitAccessorPrefix_Raw;
 }
 
 // Base class for XLIR dialect ops.
diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h
index f977471142c490..f6d8090b6197f0 100644
--- a/tensorflow/compiler/xla/service/hlo_module_config.h
+++ b/tensorflow/compiler/xla/service/hlo_module_config.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/container/flat_hash_map.h"
+#include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
@@ -287,6 +288,18 @@ class HloModuleConfig {
     return &memory_space_assignment_config_;
   }
 
+  int64_t GetAnalysisAllowance(absl::string_view pass_name) const {
+    auto it = analysis_allowance_map_.find(pass_name);
+    if (it == analysis_allowance_map_.end()) {
+      return -1;
+    }
+    return (*it).second;
+  }
+
+  void SetAnalysisAllowance(absl::string_view pass_name, int64_t allowance) {
+    analysis_allowance_map_[pass_name] = allowance;
+  }
+
  private:
   // If you add new members, be sure to update compilation_cache_key.
 
@@ -378,6 +391,10 @@ class HloModuleConfig {
   // sharding of operations when multiple computation would be chained and
   // merged together.
   bool allow_spmd_sharding_propagation_to_output_ = false;
+
+  // Each Hlo analysis is allowed at least a constant number of
+  // abstract cost units, before it is considered for early termination.
+  absl::flat_hash_map<absl::string_view, int64_t> analysis_allowance_map_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
index fda8479c993bc6..fe316ba77d41e5 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc
@@ -482,11 +482,10 @@ llvm::Value* IrArray::EmitArrayElementAddress(const IrArray::Index& index,
 
   if (use_linear_index && index.LinearValidOnShape(shape_)) {
     llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+    llvm::Type* type = PrimitiveTypeToIrType(shape_.element_type(), module);
     return b->CreateInBoundsGEP(
-        b->CreateBitCast(base_ptr_,
-                         PrimitiveTypeToIrType(shape_.element_type(), module)
-                             ->getPointerTo()),
-        {index.linear()}, llvm_ir::AsStringRef(name));
+        type, b->CreateBitCast(base_ptr_, type->getPointerTo()), index.linear(),
+        llvm_ir::AsStringRef(name));
   }
 
   std::vector<llvm::Value*> actual_index;
@@ -511,7 +510,8 @@ llvm::Value* IrArray::EmitArrayElementAddress(const IrArray::Index& index,
     int64_t dimension = LayoutUtil::Major(shape_.layout(), i);
     gep_indices.push_back(actual_index[dimension]);
   }
-  return b->CreateInBoundsGEP(base_ptr_, gep_indices,
+  return b->CreateInBoundsGEP(base_ptr_->getType()->getPointerElementType(),
+                              base_ptr_, gep_indices,
                               llvm_ir::AsStringRef(name));
 }
 
@@ -533,7 +533,9 @@ llvm::Value* IrArray::EmitReadArrayElement(const Index& index,
                                            bool use_linear_index) const {
   llvm::Value* element_address =
       EmitArrayElementAddress(index, b, name, use_linear_index);
-  llvm::LoadInst* load = b->CreateLoad(element_address, name.data());
+  llvm::LoadInst* load =
+      b->CreateLoad(element_address->getType()->getPointerElementType(),
+                    element_address, llvm_ir::AsStringRef(name));
   AnnotateLoadStoreInstructionWithMetadata(load);
   return load;
 }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
index d5c433e661f978..4c3faab7fb908a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_builder_mixin.h
@@ -40,8 +40,18 @@ class IrBuilderMixin {
   }
 
   template <class... Args>
-  llvm::LoadInst* AlignedLoad(Args&&... args) {
-    return mixin_builder()->CreateAlignedLoad(std::forward<Args>(args)...);
+  llvm::LoadInst* AlignedLoad(llvm::Type* type, Args&&... args) {
+    return mixin_builder()->CreateAlignedLoad(type,
+                                              std::forward<Args>(args)...);
+  }
+
+  template <class... Args>
+  llvm::LoadInst* AlignedLoad(llvm::Value* value, Args&&... args) {
+    // LLVM has deprecated CreateAlignedLoad without a type argument. Provide it
+    // for convenience.
+    return mixin_builder()->CreateAlignedLoad(
+        value->getType()->getPointerElementType(), value,
+        std::forward<Args>(args)...);
   }
 
   template <class... Args>
@@ -117,7 +127,8 @@ class IrBuilderMixin {
 
   llvm::Value* GEP(llvm::Value* ptr, llvm::ArrayRef<llvm::Value*> idx_list,
                    const llvm::Twine& name = "") {
-    return mixin_builder()->CreateGEP(ptr, idx_list, name);
+    return mixin_builder()->CreateGEP(ptr->getType()->getPointerElementType(),
+                                      ptr, idx_list, name);
   }
 
   template <class... Args>
@@ -143,7 +154,8 @@ class IrBuilderMixin {
   llvm::Value* InBoundsGEP(llvm::Value* ptr,
                            llvm::ArrayRef<llvm::Value*> idx_list,
                            const llvm::Twine& name = "") {
-    return mixin_builder()->CreateInBoundsGEP(ptr, idx_list, name);
+    return mixin_builder()->CreateInBoundsGEP(
+        ptr->getType()->getPointerElementType(), ptr, idx_list, name);
   }
 
   llvm::Value* ExtractValue(llvm::Value* agg, llvm::ArrayRef<unsigned> idxs,
@@ -163,8 +175,17 @@ class IrBuilderMixin {
   }
 
   template <class... Args>
-  llvm::LoadInst* Load(Args&&... args) {
-    return mixin_builder()->CreateLoad(std::forward<Args>(args)...);
+  llvm::LoadInst* Load(llvm::Type* type, Args&&... args) {
+    return mixin_builder()->CreateLoad(type, std::forward<Args>(args)...);
+  }
+
+  template <class... Args>
+  llvm::LoadInst* Load(llvm::Value* value, Args&&... args) {
+    // LLVM has deprecated CreateLoad without a type argument. Provide it for
+    // convenience.
+    return mixin_builder()->CreateLoad(
+        value->getType()->getPointerElementType(), value,
+        std::forward<Args>(args)...);
   }
 
   template <class... Args>
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index 88cb83d91c9c71..d03527777cf6d9 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -110,7 +110,8 @@ void ForLoop::Emit(llvm::IRBuilder<>* b) {
   // Emit the loop conditional branch. Load and compare indvar with ending
   // index and jump to loop exit if equal. Jump to body otherwise.
   b->SetInsertPoint(header_bb_);
-  indvar_ = b->CreateLoad(indvar_address, GetQualifiedName("indvar"));
+  indvar_ = b->CreateLoad(start_index_->getType(), indvar_address,
+                          GetQualifiedName("indvar"));
   llvm::Value* exit_cond = b->CreateICmpUGE(indvar_, end_index_);
   b->CreateCondBr(/*Cond=*/exit_cond,
                   /*True=*/exit_bb_, /*False=*/body_bb_);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index a1c046a3889f6c..71efee14b050db 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -609,23 +609,6 @@ void DumpIrIfEnabled(const HloModule& hlo_module,
   }
 }
 
-void DumpIrIfEnabled(mlir::ModuleOp mlir_module, int unique_id,
-                     const DebugOptions& debug_options) {
-  absl::optional<absl::string_view> module_name;
-  if (llvm::Optional<llvm::StringRef> mlir_module_name =
-          mlir_module.getName()) {
-    module_name = AsStringView(*mlir_module_name);
-  }
-  if (!DumpingEnabledForHloModule(module_name.value_or("<unnamed>"),
-                                  debug_options)) {
-    return;
-  }
-
-  DumpToFileInDirOrStdout(debug_options, unique_id, module_name.value_or(""),
-                          /*file_prefix=*/"",
-                          /*file_suffix=*/"lmhlo", DumpToString(mlir_module));
-}
-
 llvm::Function* CreateCpuFunction(llvm::FunctionType* function_type,
                                   llvm::GlobalValue::LinkageTypes linkage,
                                   const HloModuleConfig& module_config,
@@ -708,8 +691,8 @@ llvm::Value* RngGetAndUpdateState(uint64 delta, llvm::Module* module,
                                   llvm::IRBuilder<>* builder) {
   llvm::GlobalVariable* state_ptr =
       GetOrCreateVariableForRngState(module, builder);
-  llvm::LoadInst* state_value_old =
-      builder->CreateLoad(state_ptr, "load_state");
+  llvm::LoadInst* state_value_old = builder->CreateLoad(
+      state_ptr->getType()->getPointerElementType(), state_ptr, "load_state");
   llvm::Value* state_value_new = builder->CreateAdd(
       state_value_old,
       llvm::ConstantInt::get(state_value_old->getType(), delta));
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
index f649afcdf8c336..1d9c805bd29ff8 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.h
@@ -272,9 +272,6 @@ void DumpIrIfEnabled(const HloModule& hlo_module,
                      const llvm::Module& llvm_module, bool optimized,
                      absl::string_view filename_suffix = "");
 
-void DumpIrIfEnabled(mlir::ModuleOp mlir_module, int unique_id,
-                     const DebugOptions& debug_options);
-
 llvm::Function* CreateCpuFunction(llvm::FunctionType* function_type,
                                   llvm::GlobalValue::LinkageTypes linkage,
                                   const HloModuleConfig& module_config,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
index 5ac86413e386d0..538993e48f01a4 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -118,12 +118,12 @@ Status EmitCompareLoopBody(
       values_to_compare.push_back(element_address(i, current_keys_index));
     }
     llvm::Module* module = b->GetInsertBlock()->getParent()->getParent();
+    llvm::Type* pred_type = llvm_ir::PrimitiveTypeToIrType(PRED, module);
     llvm::Value* compare_return_buffer = llvm_ir::EmitAllocaAtFunctionEntry(
-        llvm_ir::PrimitiveTypeToIrType(PRED, module), "compare_return_buffer",
-        b);
+        pred_type, "compare_return_buffer", b);
     TF_RETURN_IF_ERROR(
         emit_compare_callback(values_to_compare, compare_return_buffer));
-    llvm::Value* result = b->CreateLoad(compare_return_buffer);
+    llvm::Value* result = b->CreateLoad(pred_type, compare_return_buffer);
 
     // Check if the 'compare' function returns true.
     llvm::Value* is_smaller_than =
@@ -132,8 +132,12 @@ Status EmitCompareLoopBody(
     ksl.If("is_smaller_than", is_smaller_than, [&]() {
       for (int64_t i = 0; i < num_values; ++i) {
         // Swap the values.
-        auto value1 = b->CreateLoad(values_to_compare[i * 2]);
-        auto value2 = b->CreateLoad(values_to_compare[i * 2 + 1]);
+        auto value1 = b->CreateLoad(
+            values_to_compare[i * 2]->getType()->getPointerElementType(),
+            values_to_compare[i * 2]);
+        auto value2 = b->CreateLoad(
+            values_to_compare[i * 2 + 1]->getType()->getPointerElementType(),
+            values_to_compare[i * 2 + 1]);
         write_element(i, current_keys_index, value1);
         write_element(i, compare_keys_index, value2);
       }
@@ -195,10 +199,12 @@ Status EmitTiledCompareLoop(
       IrArray::Index keys_index(keys_multi_index, params[i].GetShape(),
                                 tiled_keys_index.GetType());
       auto value = params[i].EmitReadArrayElement(keys_index, b);
-      b->CreateStore(value,
-                     b->CreateGEP(param_shmem_buffers[i],
-                                  {tiled_keys_index.GetConstantWithIndexType(0),
-                                   cache_index}));
+      b->CreateStore(
+          value,
+          b->CreateGEP(
+              param_shmem_buffers[i]->getType()->getPointerElementType(),
+              param_shmem_buffers[i],
+              {tiled_keys_index.GetConstantWithIndexType(0), cache_index}));
     });
   }
   // Wait until all reads have happened.
@@ -206,9 +212,10 @@ Status EmitTiledCompareLoop(
 
   // Now emit the bodies of the comparison loops.
   auto element_address = [&](int64_t operand, llvm::Value* index) {
-    auto shared_memory_address =
-        b->CreateGEP(param_shmem_buffers[operand],
-                     {tiled_keys_index.GetConstantWithIndexType(0), index});
+    auto shared_memory_address = b->CreateGEP(
+        param_shmem_buffers[operand]->getType()->getPointerElementType(),
+        param_shmem_buffers[operand],
+        {tiled_keys_index.GetConstantWithIndexType(0), index});
     auto ptr_type = shared_memory_address->getType();
     // We need a generic pointer with address space 0 instead of a pointer to
     // shared memory (address space 3) so that we can pass it to the comparison
@@ -222,8 +229,10 @@ Status EmitTiledCompareLoop(
                            llvm::Value* value) {
     b->CreateStore(
         value,
-        b->CreateGEP(param_shmem_buffers[operand],
-                     {tiled_keys_index.GetConstantWithIndexType(0), index}));
+        b->CreateGEP(
+            param_shmem_buffers[operand]->getType()->getPointerElementType(),
+            param_shmem_buffers[operand],
+            {tiled_keys_index.GetConstantWithIndexType(0), index}));
   };
   for (int64_t xor_mask : xor_masks) {
     // The index of the element pair to be compared within the tile stored in
@@ -273,9 +282,11 @@ Status EmitTiledCompareLoop(
       keys_multi_index[dimension_to_sort] = index;
       IrArray::Index keys_index(keys_multi_index, params[i].GetShape(),
                                 tiled_keys_index.GetType());
-      auto value = b->CreateLoad(b->CreateGEP(
+      auto gep = b->CreateGEP(
+          param_shmem_buffers[i]->getType()->getPointerElementType(),
           param_shmem_buffers[i],
-          {tiled_keys_index.GetConstantWithIndexType(0), cache_index}));
+          {tiled_keys_index.GetConstantWithIndexType(0), cache_index});
+      auto value = b->CreateLoad(gep->getType()->getPointerElementType(), gep);
       params[i].EmitWriteArrayElement(keys_index, value, b);
     });
   }
diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
index 11364d8ff85ea8..a4c297883579c7 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc
@@ -41,12 +41,11 @@ void EmitTupleSelect(const IrArray& select, const IrArray& pred,
   llvm::Module* module = getModuleFromBuilder(b);
   CHECK(ShapeUtil::IsScalar(pred.GetShape()));
 
+  llvm::Type* pred_type = PrimitiveTypeToIrType(PRED, module);
   llvm::LoadInst* pred_value =
-      b->CreateLoad(pred.GetBasePointer(), "load_predicate_value");
+      b->CreateLoad(pred_type, pred.GetBasePointer(), "load_predicate_value");
   llvm::Value* pred_cond = b->CreateICmpNE(
-      pred_value,
-      llvm::ConstantInt::get(PrimitiveTypeToIrType(PRED, module), 0),
-      "boolean_predicate");
+      pred_value, llvm::ConstantInt::get(pred_type, 0), "boolean_predicate");
 
   VLOG(2) << "HandleSelect for tuple:";
   VLOG(2) << "  pred_value: " << DumpToString(*pred_value);
@@ -67,8 +66,9 @@ void EmitTuple(const IrArray& tuple, absl::Span<llvm::Value* const> operands,
     auto* cast =
         b->CreatePointerCast(operands[i], PrimitiveTypeToIrType(TUPLE, module));
     auto* store = b->CreateStore(
-        cast, b->CreateInBoundsGEP(tuple.GetBasePointer(),
-                                   {b->getInt64(0), b->getInt64(i)}));
+        cast, b->CreateInBoundsGEP(
+                  tuple.GetBasePointer()->getType()->getPointerElementType(),
+                  tuple.GetBasePointer(), {b->getInt64(0), b->getInt64(i)}));
     tuple.AnnotateLoadStoreInstructionWithMetadata(store);
   }
 }
@@ -114,8 +114,10 @@ llvm::Value* EmitGetTupleElement(const Shape& target_shape, int64_t index,
                                  llvm::IRBuilder<>* b) {
   llvm::Module* module = getModuleFromBuilder(b);
   llvm::Value* element_ptr =
-      b->CreateInBoundsGEP(operand, {b->getInt64(0), b->getInt64(index)});
-  llvm::LoadInst* src_buffer = b->CreateLoad(element_ptr);
+      b->CreateInBoundsGEP(operand->getType()->getPointerElementType(), operand,
+                           {b->getInt64(0), b->getInt64(index)});
+  llvm::LoadInst* src_buffer = b->CreateLoad(
+      element_ptr->getType()->getPointerElementType(), element_ptr);
 
   // Mark the loaded pointer as dereferenceable if we know its shape.
   if (!target_shape.IsOpaque()) {
diff --git a/tensorflow/compiler/xla/service/space_to_batch_converter.cc b/tensorflow/compiler/xla/service/space_to_batch_converter.cc
index 4aa90a80420d3a..a0e2b6ddc227a6 100644
--- a/tensorflow/compiler/xla/service/space_to_batch_converter.cc
+++ b/tensorflow/compiler/xla/service/space_to_batch_converter.cc
@@ -133,9 +133,25 @@ class ConvolutionVisitor {
       std::vector<int64_t>* spatial_dimensions_to_split,
       bool is_backprop = false, bool is_rhs = false);
 
+  // Performs the actual dimension splitting.
+  StatusOr<HloInstruction*> PerformSplitSpace(
+      HloInstruction* activations,
+      absl::Span<const int64_t> spatial_dimensions_to_split,
+      int64_t activations_batch_dim, int64_t spatial_split_size,
+      int64_t num_splits);
+
+  // Helper function that puts individually split dimensions together, and
+  // merges the batch(es).
+  // The input activations dimensions are ... B, B0, S0, B1, S1, ... Bn, Sn, ...
+  // The output dimensions will be ..., B, S0, S1,.. Sn, ...
+  StatusOr<HloInstruction*> TransposeAndMergeBatch(
+      HloInstruction* activations,
+      absl::Span<const int64_t> final_split_spatial_dim_positioning,
+      int64_t activations_batch_dim, int64_t old_batch_size);
+
   // Helper function for the SplitSpace function above. Handles padding and
   // reshaping to generate space-to-batched shape.
-  StatusOr<HloInstruction*> SplitSpaceHelper(
+  StatusOr<HloInstruction*> PadAndSplitSpace(
       HloInstruction* activations,
       absl::Span<const int64_t> spatial_dimensions_to_split,
       int64_t activations_batch_dim, int64_t high_padding, int64_t low_padding,
@@ -192,26 +208,26 @@ class ConvolutionVisitor {
       std::vector<int64_t>* spatial_dimensions_to_split,
       bool is_backprop = false, bool is_rhs = false);
 
-  // Increases the spatial dimension size in an already space-to-batched shape
+  // Decreases the spatial dimension size in an already space-to-batched shape
   // so that the new size is new_spatial_dim_size.
-  StatusOr<HloInstruction*> IncreaseSpatialSizeOnSpaceToBatchedShape(
+  StatusOr<HloInstruction*> ChangeSpatialSizeOnSpaceToBatchedShape(
       HloInstruction* activations, int64_t batch_dimension,
-      int64_t old_batch_size, int64_t spatial_dimension,
-      int64_t new_spatial_dim_size);
+      int64_t old_batch_size,
+      absl::Span<const int64_t> spatial_dimensions_to_split,
+      int64_t new_spatial_dim_size, bool increase_spatial_size = false);
 
-  // Decreases the spatial dimension size in an already space-to-batched shape
-  // so that the new size is new_spatial_dim_size.
-  StatusOr<HloInstruction*> DecreaseSpatialSizeOnSpaceToBatchedShape(
+  // Turns B, S0, S1, ..., Sn into B, B0, S0, B1, S1,... Bn, Sn.
+  StatusOr<HloInstruction*> SplitAndTransposeMergedBatch(
       HloInstruction* activations, int64_t batch_dimension,
-      int64_t old_batch_size, int64_t spatial_dimension,
-      int64_t new_spatial_dim_size);
+      int64_t old_batch_size, absl::Span<const int64_t> spatial_dimensions);
 
   // Function that converts spaced-to-batch shape back to the original.
   StatusOr<HloInstruction*> BatchToSpace(HloInstruction* old_instr);
 
   // Duplicates elements at boundaries.
   StatusOr<HloInstruction*> HaloDuplicateWithSlice(
-      HloInstruction* activations, int64_t spatial_dimension_to_split,
+      HloInstruction* activations,
+      absl::Span<const int64_t> spatial_dimensions_to_split,
       int64_t activations_batch_dim, int64_t low_padding, int64_t halo_size,
       HloInstruction* pad_val = nullptr);
 
@@ -345,9 +361,12 @@ ConvolutionVisitor::GetSpatialDimsToSplit(HloInstruction* old_operand) {
   auto permute_dims = instr_to_dim_permute_map_[new_operand];
   std::vector<int64_t> old_dims(ctrl_.count_of_dimensions_to_convert),
       new_dims(ctrl_.count_of_dimensions_to_convert);
-  for (int i = 0; i < ctrl_.count_of_dimensions_to_convert; ++i) {
-    old_dims[i] = dim_map_val[DimMapper(SpaceToBatchDimMap::kSpace0) + i];
-    new_dims[i] = DimLookUp(permute_dims, old_dims[i]);
+
+  old_dims[0] = dim_map_val[DimMapper(SpaceToBatchDimMap::kSpace0)];
+  new_dims[0] = DimLookUp(permute_dims, old_dims[0]);
+  for (int i = 1; i < ctrl_.count_of_dimensions_to_convert; ++i) {
+    old_dims[i] = old_dims[0] + i;
+    new_dims[i] = new_dims[0] + i;
   }
   return std::make_pair(old_dims, new_dims);
 }
@@ -491,34 +510,27 @@ bool ConvolutionVisitor::IsThisBackPropFilterConv(HloInstruction* convolution) {
 }
 
 StatusOr<HloInstruction*> ConvolutionVisitor::HaloDuplicateWithSlice(
-    HloInstruction* activations, int64_t spatial_dimension_to_split,
+    HloInstruction* activations,
+    absl::Span<const int64_t> spatial_dimensions_to_split,
     int64_t activations_batch_dim, int64_t low_padding, int64_t halo_size,
     HloInstruction* pad_val) {
+  const int64_t spatial_dim_count = spatial_dimensions_to_split.size();
+  const int64_t additional_batch_size = tensorflow::MathUtil::IPow<int64_t>(
+      ctrl_.number_of_splits, spatial_dim_count);
   const int64_t original_batch_size =
       activations->shape().dimensions(activations_batch_dim) /
-      ctrl_.number_of_splits;
+      additional_batch_size;
 
-  if (original_batch_size > 1) {
-    std::vector<int64_t> new_dimensions(
-        activations->shape().dimensions().begin(),
-        activations->shape().dimensions().end());
-    new_dimensions[activations_batch_dim] = ctrl_.number_of_splits;
-    new_dimensions.insert(new_dimensions.begin() + activations_batch_dim,
-                          original_batch_size);
-
-    // Reshape the output of the new conv into the old convolutions shape.
-    TF_ASSIGN_OR_RETURN(activations,
-                        MakeReshapeHlo(new_dimensions, activations));
+  const int64_t spatial_split_size =
+      activations->shape().dimensions(spatial_dimensions_to_split[0]);
+  const int64_t batch_size = ctrl_.number_of_splits;
 
-    spatial_dimension_to_split++;
-    activations_batch_dim++;
-  }
+  TF_ASSIGN_OR_RETURN(
+      activations, SplitAndTransposeMergedBatch(
+                       activations, activations_batch_dim, original_batch_size,
+                       spatial_dimensions_to_split));
 
   const int64_t rank = activations->shape().rank();
-  const int64_t spatial_split_size =
-      activations->shape().dimensions(spatial_dimension_to_split);
-  const int64_t batch_size =
-      activations->shape().dimensions(activations_batch_dim);
 
   VLOG(1) << "In HaloDuplicateWithSlice with activations "
           << activations->ToString() << " batch_size " << batch_size
@@ -527,103 +539,101 @@ StatusOr<HloInstruction*> ConvolutionVisitor::HaloDuplicateWithSlice(
 
   CHECK_LE(std::abs(halo_size - low_padding), spatial_split_size);
 
-  HloInstruction* first_slice = nullptr;
+  for (int64_t i = 0; i < spatial_dimensions_to_split.size(); ++i) {
+    int64_t spatial_dimension_to_split = activations_batch_dim + 2 * (i + 1);
+    int64_t remapped_batch_dimension = spatial_dimension_to_split - 1;
+    HloInstruction* first_slice = nullptr;
 
-  std::vector<int64_t> strides(rank, 1);
-  HloInstruction* padding =
-      pad_val == nullptr
-          ? computation_->AddInstruction(HloInstruction::CreateConstant(
-                LiteralUtil::Zero(activations->shape().element_type())))
-          : pad_val;
+    std::vector<int64_t> strides(rank, 1);
+    HloInstruction* padding =
+        pad_val == nullptr
+            ? computation_->AddInstruction(HloInstruction::CreateConstant(
+                  LiteralUtil::Zero(activations->shape().element_type())))
+            : pad_val;
 
-  if (low_padding > 0) {
-    std::vector<int64_t> start_indices(rank, 0),
-        end_indices(activations->shape().dimensions().begin(),
-                    activations->shape().dimensions().end());
-    start_indices[spatial_dimension_to_split] =
-        spatial_split_size - low_padding;
-    end_indices[activations_batch_dim] = batch_size - 1;
-    end_indices[spatial_dimension_to_split] = spatial_split_size;
-
-    TF_ASSIGN_OR_RETURN(first_slice, MakeSliceHlo(activations, start_indices,
-                                                  end_indices, strides));
-    VLOG(1) << "first slice " << first_slice->ToString();
-    PaddingConfig padding_config =
-        MakeNoPaddingConfig(first_slice->shape().dimensions_size());
-    padding_config.mutable_dimensions(activations_batch_dim)
-        ->set_edge_padding_low(1);
-
-    TF_ASSIGN_OR_RETURN(first_slice,
-                        MakePadHlo(first_slice, padding, padding_config));
-  }
-
-  HloInstruction* halo_region = nullptr;
-  if (halo_size - low_padding > 0) {
-    std::vector<int64_t> start_indices_halo(rank, 0),
-        end_indices_halo(activations->shape().dimensions().begin(),
-                         activations->shape().dimensions().end());
-
-    start_indices_halo[activations_batch_dim] = 1;
-    end_indices_halo[spatial_dimension_to_split] = halo_size - low_padding;
-
-    TF_ASSIGN_OR_RETURN(halo_region,
-                        MakeSliceHlo(activations, start_indices_halo,
-                                     end_indices_halo, strides));
-    VLOG(1) << "halo_region " << halo_region->ToString();
-    PaddingConfig padding_config_halo =
-        MakeNoPaddingConfig(halo_region->shape().dimensions_size());
-    padding_config_halo.mutable_dimensions(activations_batch_dim)
-        ->set_edge_padding_high(1);
-    TF_ASSIGN_OR_RETURN(halo_region,
-                        MakePadHlo(halo_region, padding, padding_config_halo));
-  }
-
-  if (halo_size == 0 && low_padding != 0) {
-    std::vector<int64_t> start_indices_activations_cut(rank, 0),
-        end_indices_activations_cut(activations->shape().dimensions().begin(),
-                                    activations->shape().dimensions().end());
-    // When no halo is needed, we must slice out activations.
     if (low_padding > 0) {
-      end_indices_activations_cut[spatial_dimension_to_split] =
+      std::vector<int64_t> start_indices(rank, 0),
+          end_indices(activations->shape().dimensions().begin(),
+                      activations->shape().dimensions().end());
+      start_indices[spatial_dimension_to_split] =
           spatial_split_size - low_padding;
-    } else {
-      start_indices_activations_cut[spatial_dimension_to_split] =
-          0 - low_padding;
-      end_indices_activations_cut[spatial_dimension_to_split] =
-          spatial_split_size;
+      end_indices[remapped_batch_dimension] = batch_size - 1;
+      end_indices[spatial_dimension_to_split] = spatial_split_size;
+
+      TF_ASSIGN_OR_RETURN(first_slice, MakeSliceHlo(activations, start_indices,
+                                                    end_indices, strides));
+      VLOG(1) << "first slice " << first_slice->ToString();
+
+      PaddingConfig padding_config =
+          MakeNoPaddingConfig(first_slice->shape().dimensions_size());
+      padding_config.mutable_dimensions(remapped_batch_dimension)
+          ->set_edge_padding_low(1);
+
+      TF_ASSIGN_OR_RETURN(first_slice,
+                          MakePadHlo(first_slice, padding, padding_config));
+    }
+
+    HloInstruction* halo_region = nullptr;
+    if (halo_size - low_padding > 0) {
+      std::vector<int64_t> start_indices_halo(rank, 0),
+          end_indices_halo(activations->shape().dimensions().begin(),
+                           activations->shape().dimensions().end());
+
+      start_indices_halo[remapped_batch_dimension] = 1;
+      end_indices_halo[spatial_dimension_to_split] = halo_size - low_padding;
+
+      TF_ASSIGN_OR_RETURN(halo_region,
+                          MakeSliceHlo(activations, start_indices_halo,
+                                       end_indices_halo, strides));
+      VLOG(1) << "halo_region " << halo_region->ToString();
+      PaddingConfig padding_config_halo =
+          MakeNoPaddingConfig(halo_region->shape().dimensions_size());
+      padding_config_halo.mutable_dimensions(remapped_batch_dimension)
+          ->set_edge_padding_high(1);
+      TF_ASSIGN_OR_RETURN(
+          halo_region, MakePadHlo(halo_region, padding, padding_config_halo));
     }
 
-    TF_ASSIGN_OR_RETURN(activations,
-                        MakeSliceHlo(activations, start_indices_activations_cut,
-                                     end_indices_activations_cut, strides));
-  }
-
-  if (first_slice != nullptr) {
-    TF_ASSIGN_OR_RETURN(activations, MakeConcatHlo({first_slice, activations},
-                                                   spatial_dimension_to_split));
-  }
-
-  if (halo_region != nullptr) {
-    TF_ASSIGN_OR_RETURN(activations, MakeConcatHlo({activations, halo_region},
-                                                   spatial_dimension_to_split));
-  }
+    if (halo_size == 0 && low_padding != 0) {
+      std::vector<int64_t> start_indices_activations_cut(rank, 0),
+          end_indices_activations_cut(activations->shape().dimensions().begin(),
+                                      activations->shape().dimensions().end());
+      // When no halo is needed, we must slice out activations.
+      if (low_padding > 0) {
+        end_indices_activations_cut[spatial_dimension_to_split] =
+            spatial_split_size - low_padding;
+      } else {
+        start_indices_activations_cut[spatial_dimension_to_split] =
+            0 - low_padding;
+        end_indices_activations_cut[spatial_dimension_to_split] =
+            spatial_split_size;
+      }
 
-  if (original_batch_size > 1) {
-    std::vector<int64_t> new_dimensions(
-        activations->shape().dimensions().begin(),
-        activations->shape().dimensions().end());
-    new_dimensions[activations_batch_dim] =
-        original_batch_size * ctrl_.number_of_splits;
-    new_dimensions.erase(new_dimensions.begin() + activations_batch_dim - 1);
+      TF_ASSIGN_OR_RETURN(
+          activations, MakeSliceHlo(activations, start_indices_activations_cut,
+                                    end_indices_activations_cut, strides));
+    }
 
-    // Reshape the output of the new conv into the old convolutions shape.
-    TF_ASSIGN_OR_RETURN(activations,
-                        MakeReshapeHlo(new_dimensions, activations));
+    if (first_slice != nullptr) {
+      TF_ASSIGN_OR_RETURN(activations,
+                          MakeConcatHlo({first_slice, activations},
+                                        spatial_dimension_to_split));
+    }
 
-    spatial_dimension_to_split++;
-    activations_batch_dim++;
+    if (halo_region != nullptr) {
+      TF_ASSIGN_OR_RETURN(activations,
+                          MakeConcatHlo({activations, halo_region},
+                                        spatial_dimension_to_split));
+    }
   }
 
+  TF_ASSIGN_OR_RETURN(
+      activations,
+      TransposeAndMergeBatch(
+          activations,
+          /*final_split_spatial_dim_positioning=*/spatial_dimensions_to_split,
+          activations_batch_dim, original_batch_size));
+
   VLOG(1) << "HaloDuplicated activations " << activations->ToString();
   return activations;
 }
@@ -733,110 +743,140 @@ ConvolutionVisitor::BringSpaceNextToBatch(
   return SpaceNextToBatchDetails{activations, transpose_dims};
 }
 
-StatusOr<HloInstruction*>
-ConvolutionVisitor::IncreaseSpatialSizeOnSpaceToBatchedShape(
+StatusOr<HloInstruction*> ConvolutionVisitor::SplitAndTransposeMergedBatch(
     HloInstruction* activations, int64_t batch_dimension,
-    int64_t old_batch_size, int64_t spatial_dimension,
-    int64_t new_spatial_dim_size) {
-  CHECK_EQ(batch_dimension + 1, spatial_dimension);
+    int64_t old_batch_size, absl::Span<const int64_t> spatial_dimensions) {
+  CHECK_EQ(batch_dimension + 1, spatial_dimensions[0]);
   std::vector<int64_t> new_dimensions(activations->shape().dimensions().begin(),
                                       activations->shape().dimensions().end());
 
   const int64_t new_batch_size =
       activations->shape().dimensions(batch_dimension);
-  int64_t spatial_dim_size = activations->shape().dimensions(spatial_dimension);
-  const int64_t reshaped_space_size =
-      spatial_dim_size * new_batch_size / old_batch_size;
 
-  VLOG(3) << "Increasing the spatial size while propagating new_batch_size "
+  VLOG(3) << "Decreasing the spatial size while propagating new_batch_size "
           << new_batch_size << " old_batch_size " << old_batch_size;
-  new_dimensions[spatial_dimension] = reshaped_space_size;
+
   new_dimensions[batch_dimension] = old_batch_size;
 
+  const int64_t spatial_dim_count = spatial_dimensions.size();
+  // Create additional batch dimensions.
+  for (int64_t i = 0; i < spatial_dim_count; ++i) {
+    new_dimensions.insert(new_dimensions.begin() + spatial_dimensions[0],
+                          ctrl_.number_of_splits);
+  }
+
   // Reshape the output of the new conv into the old convolutions shape.
-  TF_ASSIGN_OR_RETURN(HloInstruction * reshaped_activations,
+  TF_ASSIGN_OR_RETURN(HloInstruction * batch_split_activations,
                       MakeReshapeHlo(new_dimensions, activations));
 
-  VLOG(3) << "First reshape done";
-  PaddingConfig padding_config =
-      MakeNoPaddingConfig(reshaped_activations->shape().dimensions_size());
-  padding_config.mutable_dimensions(spatial_dimension)
-      ->set_edge_padding_high(new_spatial_dim_size * new_batch_size /
-                                  old_batch_size -
-                              reshaped_space_size);
-  padding_config.mutable_dimensions(spatial_dimension)->set_edge_padding_low(0);
-  HloInstruction* padding =
-      computation_->AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::Zero(reshaped_activations->shape().element_type())));
-
-  TF_ASSIGN_OR_RETURN(
-      reshaped_activations,
-      MakePadHlo(reshaped_activations, padding, padding_config));
-
-  std::vector<int64_t> reshape_back_dims(
-      reshaped_activations->shape().dimensions().begin(),
-      reshaped_activations->shape().dimensions().end());
-
-  reshape_back_dims[spatial_dimension] = new_spatial_dim_size;
-  reshape_back_dims[batch_dimension] = new_batch_size;
+  if (spatial_dim_count > 1) {
+    // Transpose such that we get // B, B0, S0, B1, S1,...
+    std::vector<int64_t> transpose_dims(new_dimensions.size());
+    absl::c_iota(transpose_dims, 0);
+    // Transpose such that we get B, B0, S0, B1, S1,...
+    std::vector<int64_t> trans_dims(new_dimensions.size());
+    absl::c_iota(trans_dims, 0);
 
-  TF_ASSIGN_OR_RETURN(HloInstruction * activations_new,
-                      MakeReshapeHlo(reshape_back_dims, reshaped_activations));
+    int64_t start_batch_dim_position = batch_dimension + 1;
+    int64_t start_space_dim_position = batch_dimension + 2;
 
-  VLOG(3) << "Size increased activations " << activations_new->ToString();
+    for (int i = 0; i < spatial_dim_count; ++i) {
+      transpose_dims[start_batch_dim_position + 2 * i] =
+          batch_dimension + spatial_dim_count - i;
+      transpose_dims[start_space_dim_position + 2 * i] =
+          batch_dimension + spatial_dim_count + 1 + i;
+    }
 
-  return activations_new;
+    TF_ASSIGN_OR_RETURN(
+        batch_split_activations,
+        MakeTransposeHlo(batch_split_activations, transpose_dims));
+  }
+  return batch_split_activations;
 }
 
 StatusOr<HloInstruction*>
-ConvolutionVisitor::DecreaseSpatialSizeOnSpaceToBatchedShape(
+ConvolutionVisitor::ChangeSpatialSizeOnSpaceToBatchedShape(
     HloInstruction* activations, int64_t batch_dimension,
-    int64_t old_batch_size, int64_t spatial_dimension,
-    int64_t new_spatial_dim_size) {
-  CHECK_EQ(batch_dimension + 1, spatial_dimension);
+    int64_t old_batch_size, absl::Span<const int64_t> spatial_dimensions,
+    int64_t new_spatial_dim_size, bool increase_spatial_size) {
+  CHECK_EQ(batch_dimension + 1, spatial_dimensions[0]);
   std::vector<int64_t> new_dimensions(activations->shape().dimensions().begin(),
                                       activations->shape().dimensions().end());
 
-  const int64_t new_batch_size =
-      activations->shape().dimensions(batch_dimension);
-  int64_t spatial_dim_size = activations->shape().dimensions(spatial_dimension);
-  const int64_t reshaped_space_size =
-      spatial_dim_size * new_batch_size / old_batch_size;
-
-  VLOG(3) << "Decreasing the spatial size while propagating new_batch_size "
-          << new_batch_size << " old_batch_size " << old_batch_size;
-  new_dimensions[spatial_dimension] = reshaped_space_size;
-  new_dimensions[batch_dimension] = old_batch_size;
+  const int64_t spatial_dim_count = spatial_dimensions.size();
+  const int64_t spatial_dim_size =
+      activations->shape().dimensions(spatial_dimensions[0]);
+  const int64_t reshaped_space_size = spatial_dim_size * ctrl_.number_of_splits;
 
   // Reshape the output of the new conv into the old convolutions shape.
-  TF_ASSIGN_OR_RETURN(HloInstruction * reshaped_activations,
-                      MakeReshapeHlo(new_dimensions, activations));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * batch_split_activations,
+      SplitAndTransposeMergedBatch(activations, batch_dimension, old_batch_size,
+                                   spatial_dimensions));
+
+  // Now merge the individual (split) batch and space dimensions.
+  std::vector<int64_t> batch_space_collapse_reshape_dims(
+      batch_split_activations->shape().dimensions().begin(),
+      batch_split_activations->shape().dimensions().end());
+
+  batch_space_collapse_reshape_dims.erase(
+      batch_space_collapse_reshape_dims.begin() + spatial_dimensions[0],
+      batch_space_collapse_reshape_dims.begin() + spatial_dimensions[0] +
+          spatial_dim_count);
+
+  for (auto spatial_dimension : spatial_dimensions) {
+    batch_space_collapse_reshape_dims[spatial_dimension] = reshaped_space_size;
+  }
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * batch_space_collapsed_reshape,
+                      MakeReshapeHlo(batch_space_collapse_reshape_dims,
+                                     batch_split_activations));
 
   VLOG(3) << "First reshape done";
 
   const int64_t rank = activations->shape().rank();
 
-  std::vector<int64_t> start_indices(rank, 0),
-      end_indices(reshaped_activations->shape().dimensions().begin(),
-                  reshaped_activations->shape().dimensions().end()),
-      strides(rank, 1);
-  end_indices[spatial_dimension] =
-      new_spatial_dim_size * (new_batch_size / old_batch_size);
-
-  // This is the slice from halo padding.
-  TF_ASSIGN_OR_RETURN(
-      reshaped_activations,
-      MakeSliceHlo(reshaped_activations, start_indices, end_indices, strides));
+  // If spatial size is increased, we add padding. If it has shrunk, we slice
+  // out the padding that was added before.
+  if (increase_spatial_size) {
+    PaddingConfig padding_config = MakeNoPaddingConfig(
+        batch_space_collapsed_reshape->shape().dimensions_size());
+    for (auto spatial_dimension : spatial_dimensions) {
+      padding_config.mutable_dimensions(spatial_dimension)
+          ->set_edge_padding_high(new_spatial_dim_size *
+                                      ctrl_.number_of_splits -
+                                  reshaped_space_size);
+      padding_config.mutable_dimensions(spatial_dimension)
+          ->set_edge_padding_low(0);
+    }
+    HloInstruction* padding = computation_->AddInstruction(
+        HloInstruction::CreateConstant(LiteralUtil::Zero(
+            batch_space_collapsed_reshape->shape().element_type())));
 
-  std::vector<int64_t> reshape_back_dims(
-      reshaped_activations->shape().dimensions().begin(),
-      reshaped_activations->shape().dimensions().end());
+    TF_ASSIGN_OR_RETURN(
+        batch_space_collapsed_reshape,
+        MakePadHlo(batch_space_collapsed_reshape, padding, padding_config));
+  } else {
+    std::vector<int64_t> start_indices(rank, 0),
+        end_indices(batch_space_collapsed_reshape->shape().dimensions().begin(),
+                    batch_space_collapsed_reshape->shape().dimensions().end()),
+        strides(rank, 1);
+    for (auto spatial_dimension : spatial_dimensions) {
+      end_indices[spatial_dimension] =
+          new_spatial_dim_size * ctrl_.number_of_splits;
+    }
 
-  reshape_back_dims[spatial_dimension] = new_spatial_dim_size;
-  reshape_back_dims[batch_dimension] = new_batch_size;
+    // This is the slice from halo padding.
+    TF_ASSIGN_OR_RETURN(batch_space_collapsed_reshape,
+                        MakeSliceHlo(batch_space_collapsed_reshape,
+                                     start_indices, end_indices, strides));
+  }
 
-  TF_ASSIGN_OR_RETURN(HloInstruction * activations_new,
-                      MakeReshapeHlo(reshape_back_dims, reshaped_activations));
+  TF_ASSIGN_OR_RETURN(
+      HloInstruction * activations_new,
+      PerformSplitSpace(batch_space_collapsed_reshape, spatial_dimensions,
+                        batch_dimension, new_spatial_dim_size,
+                        ctrl_.number_of_splits));
 
   VLOG(3) << "Size decreased activations " << activations_new->ToString();
 
@@ -2007,16 +2047,17 @@ StatusOr<bool> ConvolutionVisitor::Propagate(HloInstruction* consumer,
       if ((new_space_size - extra_space) * old_batch_size *
               ctrl_.number_of_splits >=
           old_batch_size * old_space_size) {
-        TF_ASSIGN_OR_RETURN(first_operand,
-                            DecreaseSpatialSizeOnSpaceToBatchedShape(
-                                first_operand, new_batch_dim, old_batch_size,
-                                new_space_dim, new_space_size - extra_space));
+        TF_ASSIGN_OR_RETURN(
+            first_operand, ChangeSpatialSizeOnSpaceToBatchedShape(
+                               first_operand, new_batch_dim, old_batch_size,
+                               new_spatial_dims, new_space_size - extra_space));
       } else {
         TF_ASSIGN_OR_RETURN(
             first_operand,
-            IncreaseSpatialSizeOnSpaceToBatchedShape(
-                first_operand, new_batch_dim, old_batch_size, new_space_dim,
-                new_space_size + stride - extra_space));
+            ChangeSpatialSizeOnSpaceToBatchedShape(
+                first_operand, new_batch_dim, old_batch_size, new_spatial_dims,
+                new_space_size + stride - extra_space,
+                /*increase_spatial_size*/ true));
       }
     }
     const int64_t window_size =
@@ -2029,7 +2070,7 @@ StatusOr<bool> ConvolutionVisitor::Propagate(HloInstruction* consumer,
     if (halo_size > 0) {
       TF_ASSIGN_OR_RETURN(
           first_operand,
-          HaloDuplicateWithSlice(first_operand, new_space_dim, new_batch_dim,
+          HaloDuplicateWithSlice(first_operand, new_spatial_dims, new_batch_dim,
                                  /*low_padding=*/0, halo_size, init_val));
     }
 
@@ -2242,7 +2283,6 @@ StatusOr<bool> ConvolutionVisitor::Propagate(HloInstruction* consumer,
   return true;
 }
 
-
 StatusOr<HloInstruction*> ConvolutionVisitor::SelectValidPortion(
     HloInstruction* new_instr, HloInstruction* old_instr,
     HloInstruction* select_val, int64_t new_batch_dim,
@@ -2280,10 +2320,11 @@ StatusOr<HloInstruction*> ConvolutionVisitor::SelectValidPortion(
     auto radix = ToMixedRadix(k, bounds);
 
     bool out_of_bounds = false;
+    int64_t batch_residue = 1;
     for (int i = 0; i < spatial_dim_count; ++i) {
-      const int64_t space_index = radix[spatial_dim_count + 1 + i];
-      const int64_t batch_index = radix[spatial_dim_count - i];
-
+      const int64_t space_index = radix[2 + i];
+      const int64_t batch_index = (radix[1] / batch_residue) % num_splits;
+      batch_residue *= num_splits;
       if (batch_index * new_space_size + space_index >= old_space_size) {
         out_of_bounds = true;
       }
@@ -2348,12 +2389,29 @@ StatusOr<HloInstruction*> ConvolutionVisitor::BatchToSpace(
   auto permute_dims = instr_to_dim_permute_map_[new_instr];
   const int64_t batch_dim = DimLookUp(permute_dims, old_batch_dim);
   const int64_t space_dim = DimLookUp(permute_dims, old_space_dim);
-  const int64_t batch_size = new_instr->shape().dimensions(batch_dim);
+
+  const int64_t spatial_dim_size = new_instr->shape().dimensions(space_dim);
+
+  std::vector<int64_t> split_spatial_dimensions(
+      ctrl_.count_of_dimensions_to_convert);
+  absl::c_iota(split_spatial_dimensions, space_dim);
+
+  TF_ASSIGN_OR_RETURN(new_instr, SplitAndTransposeMergedBatch(
+                                     new_instr, batch_dim, old_batch_size,
+                                     split_spatial_dimensions));
 
   std::vector<int64_t> new_dimensions(new_instr->shape().dimensions().begin(),
                                       new_instr->shape().dimensions().end());
-  new_dimensions[space_dim] *= (batch_size / old_batch_size);
-  new_dimensions[batch_dim] = old_batch_size;
+
+  new_dimensions.erase(new_dimensions.begin() + split_spatial_dimensions[0],
+                       new_dimensions.begin() + split_spatial_dimensions[0] +
+                           ctrl_.count_of_dimensions_to_convert);
+
+  for (auto spatial_dimension : split_spatial_dimensions) {
+    new_dimensions[spatial_dimension] =
+        spatial_dim_size * ctrl_.number_of_splits;
+  }
+
   // Reshape the output of the new conv into the old convolutions shape.
   TF_ASSIGN_OR_RETURN(HloInstruction * reshape,
                       MakeReshapeHlo(new_dimensions, new_instr));
@@ -2363,7 +2421,11 @@ StatusOr<HloInstruction*> ConvolutionVisitor::BatchToSpace(
   std::vector<int64_t> start_indices(rank, 0),
       end_indices(new_dimensions.begin(), new_dimensions.end()),
       strides(rank, 1);
-  end_indices[space_dim] = old_instr->shape().dimensions(old_space_dim);
+
+  for (auto spatial_dimension : split_spatial_dimensions) {
+    end_indices[spatial_dimension] =
+        old_instr->shape().dimensions(old_space_dim);
+  }
 
   // This slicing is getting rid of the padding we added to evenly divide space.
   TF_ASSIGN_OR_RETURN(
@@ -2566,9 +2628,10 @@ Status ConvolutionVisitor::PropagateOnConv(HloInstruction* convolution) {
   if (spatial_split_size > new_space_size) {
     TF_ASSIGN_OR_RETURN(
         activations_new,
-        IncreaseSpatialSizeOnSpaceToBatchedShape(
+        ChangeSpatialSizeOnSpaceToBatchedShape(
             activations_new, activations_batch_dim, old_batch_size,
-            new_spatial_dims[0], spatial_split_size));
+            new_spatial_dims, spatial_split_size,
+            /*increase_spatial_size*/ true));
 
   } else {
     // If the ideal spatial_split_size was smaller than the incoming spatial
@@ -2584,9 +2647,9 @@ Status ConvolutionVisitor::PropagateOnConv(HloInstruction* convolution) {
       if (new_space_size % c.stride != 0 || c.base_dilation_factor != 1) {
         TF_ASSIGN_OR_RETURN(
             activations_new,
-            DecreaseSpatialSizeOnSpaceToBatchedShape(
+            ChangeSpatialSizeOnSpaceToBatchedShape(
                 activations_new, activations_batch_dim, old_batch_size,
-                new_spatial_dims[0], spatial_split_size));
+                new_spatial_dims, spatial_split_size));
       } else {
         const int64_t additional_space_present = spatial_split_size % c.stride;
         spatial_split_size = new_space_size;
@@ -2603,7 +2666,7 @@ Status ConvolutionVisitor::PropagateOnConv(HloInstruction* convolution) {
   TF_ASSIGN_OR_RETURN(
       activations_new,
       HaloDuplicateWithSlice(
-          activations_new, new_spatial_dims[0], activations_batch_dim,
+          activations_new, new_spatial_dims, activations_batch_dim,
           /*low_padding=*/c.base_dilation_factor != 1 &&
                   c.inherent_low_padding != 0
               ? (c.inherent_low_padding == c.base_dilation_factor ? 1 : 0)
@@ -2641,10 +2704,13 @@ Status ConvolutionVisitor::PropagateOnConv(HloInstruction* convolution) {
   }
 
   auto new_window = convolution->window();
-  new_window.mutable_dimensions(get_first_chosen_spatial_dim(convolution))
-      ->set_padding_high(c.high_padding_for_conv);
-  new_window.mutable_dimensions(get_first_chosen_spatial_dim(convolution))
-      ->set_padding_low(c.low_padding_for_conv);
+  const int64_t first_dim = get_first_chosen_spatial_dim(convolution);
+  for (int i = 0; i < ctrl_.count_of_dimensions_to_convert; ++i) {
+    new_window.mutable_dimensions(first_dim + i)
+        ->set_padding_high(c.high_padding_for_conv);
+    new_window.mutable_dimensions(first_dim + i)
+        ->set_padding_low(c.low_padding_for_conv);
+  }
   TF_ASSIGN_OR_RETURN(
       HloInstruction * new_conv,
       MakeConvolveHlo(
@@ -2744,35 +2810,56 @@ Status ConvolutionVisitor::PropagateOnPad(HloInstruction* pad) {
   return Status::OK();
 }
 
-StatusOr<HloInstruction*> ConvolutionVisitor::SplitSpaceHelper(
+StatusOr<HloInstruction*> ConvolutionVisitor::TransposeAndMergeBatch(
     HloInstruction* activations,
-    absl::Span<const int64_t> spatial_dimensions_to_split,
-    int64_t activations_batch_dim, int64_t high_padding, int64_t low_padding,
-    int64_t spatial_split_size, int64_t num_splits) {
-  const int64_t old_batch_size =
-      activations->shape().dimensions(activations_batch_dim);
+    absl::Span<const int64_t> final_split_spatial_dim_positioning,
+    int64_t activations_batch_dim, int64_t old_batch_size) {
+  const int64_t spatial_dim_count = final_split_spatial_dim_positioning.size();
 
-  // Because we are splitting the spatial dimension, if convolution needed
-  // padding in the spatial dimension, we materialize it.
-  if (high_padding || low_padding) {
-    PaddingConfig padding_config =
-        MakeNoPaddingConfig(activations->shape().dimensions_size());
-    for (auto spatial_dimension_to_split : spatial_dimensions_to_split) {
-      padding_config.mutable_dimensions(spatial_dimension_to_split)
-          ->set_edge_padding_high(high_padding);
-      padding_config.mutable_dimensions(spatial_dimension_to_split)
-          ->set_edge_padding_low(low_padding);
+  if (final_split_spatial_dim_positioning.size() > 1) {
+    int64_t start_batch_dim_position = activations_batch_dim + 1;
+    int64_t start_space_dim_position =
+        start_batch_dim_position + spatial_dim_count;
+
+    std::vector<int64_t> trans_dims(activations->shape().dimensions_size());
+    absl::c_iota(trans_dims, 0);
+
+    for (int i = 0; i < spatial_dim_count; ++i) {
+      trans_dims[start_batch_dim_position + i] =
+          start_batch_dim_position + (spatial_dim_count - 1 - i) * 2;
+      trans_dims[start_space_dim_position + i] =
+          start_batch_dim_position + i * 2 + 1;
     }
-    HloInstruction* padding =
-        computation_->AddInstruction(HloInstruction::CreateConstant(
-            LiteralUtil::Zero(activations->shape().element_type())));
-    TF_ASSIGN_OR_RETURN(activations,
-                        MakePadHlo(activations, padding, padding_config));
+
+    TF_ASSIGN_OR_RETURN(activations, MakeTransposeHlo(activations, trans_dims));
   }
-  VLOG(1) << "Initial padded activations shape "
-          << activations->shape().ToString() << " old_batch_size "
-          << old_batch_size << " activations_batch_dim "
-          << activations_batch_dim;
+
+  std::vector<int64_t> batch_collapse_reshape_dims(
+      activations->shape().dimensions().begin(),
+      activations->shape().dimensions().end());
+
+  const int64_t collapsed_batch_size =
+      old_batch_size * tensorflow::MathUtil::IPow<int64_t>(
+                           ctrl_.number_of_splits, spatial_dim_count);
+
+  batch_collapse_reshape_dims.erase(
+      batch_collapse_reshape_dims.begin() + activations_batch_dim,
+      batch_collapse_reshape_dims.begin() + activations_batch_dim +
+          spatial_dim_count);
+  batch_collapse_reshape_dims[activations_batch_dim] = collapsed_batch_size;
+
+  TF_ASSIGN_OR_RETURN(HloInstruction * batch_collapsed_reshape,
+                      MakeReshapeHlo(batch_collapse_reshape_dims, activations));
+  return batch_collapsed_reshape;
+}
+
+StatusOr<HloInstruction*> ConvolutionVisitor::PerformSplitSpace(
+    HloInstruction* activations,
+    absl::Span<const int64_t> spatial_dimensions_to_split,
+    int64_t activations_batch_dim, int64_t spatial_split_size,
+    int64_t num_splits) {
+  const int64_t old_batch_size =
+      activations->shape().dimensions(activations_batch_dim);
 
   // Now we reorganize the activations. E.g. if the shape [B, SPACE] was [1, 16]
   // and 4 splits were needed, we first create [4, 4]. Next, to deal with halo
@@ -2798,7 +2885,7 @@ StatusOr<HloInstruction*> ConvolutionVisitor::SplitSpaceHelper(
   int counter = 0;
   for (auto spatial_dimension_to_split : spatial_dimensions_to_split) {
     reshape_dimensions.insert(
-        reshape_dimensions.begin() + (spatial_dimension_to_split + counter - 1),
+        reshape_dimensions.begin() + (spatial_dimension_to_split + counter),
         num_splits);
     counter++;
   }
@@ -2806,45 +2893,44 @@ StatusOr<HloInstruction*> ConvolutionVisitor::SplitSpaceHelper(
   TF_ASSIGN_OR_RETURN(HloInstruction * batch_increased_reshape,
                       MakeReshapeHlo(reshape_dimensions, activations));
 
-  const int64_t spatial_dim_count = spatial_dimensions_to_split.size();
-
-  if (spatial_dimensions_to_split.size() > 1) {
-    int64_t start_batch_dim_position = activations_batch_dim + 1;
-    int64_t start_space_dim_position =
-        start_batch_dim_position + spatial_dim_count;
+  return TransposeAndMergeBatch(
+      batch_increased_reshape,
+      /*final_split_spatial_dim_positioning=*/spatial_dimensions_to_split,
+      activations_batch_dim, old_batch_size);
+}
 
-    std::vector<int64_t> trans_dims(reshape_dimensions.size());
-    absl::c_iota(trans_dims, 0);
+StatusOr<HloInstruction*> ConvolutionVisitor::PadAndSplitSpace(
+    HloInstruction* activations,
+    absl::Span<const int64_t> spatial_dimensions_to_split,
+    int64_t activations_batch_dim, int64_t high_padding, int64_t low_padding,
+    int64_t spatial_split_size, int64_t num_splits) {
+  const int64_t old_batch_size =
+      activations->shape().dimensions(activations_batch_dim);
 
-    for (int i = 0; i < spatial_dim_count; ++i) {
-      trans_dims[start_batch_dim_position + i] =
-          start_batch_dim_position + i * 2;
-      trans_dims[start_space_dim_position + i] =
-          start_batch_dim_position + i * 2 + 1;
+  // Because we are splitting the spatial dimension, if convolution needed
+  // padding in the spatial dimension, we materialize it.
+  if (high_padding || low_padding) {
+    PaddingConfig padding_config =
+        MakeNoPaddingConfig(activations->shape().dimensions_size());
+    for (auto spatial_dimension_to_split : spatial_dimensions_to_split) {
+      padding_config.mutable_dimensions(spatial_dimension_to_split)
+          ->set_edge_padding_high(high_padding);
+      padding_config.mutable_dimensions(spatial_dimension_to_split)
+          ->set_edge_padding_low(low_padding);
     }
-
-    TF_ASSIGN_OR_RETURN(batch_increased_reshape,
-                        MakeTransposeHlo(batch_increased_reshape, trans_dims));
+    HloInstruction* padding =
+        computation_->AddInstruction(HloInstruction::CreateConstant(
+            LiteralUtil::Zero(activations->shape().element_type())));
+    TF_ASSIGN_OR_RETURN(activations,
+                        MakePadHlo(activations, padding, padding_config));
   }
-
-  std::vector<int64_t> batch_collapse_reshape_dims(
-      batch_increased_reshape->shape().dimensions().begin(),
-      batch_increased_reshape->shape().dimensions().end());
-
-  const int64_t collapsed_batch_size =
-      old_batch_size *
-      tensorflow::MathUtil::IPow<int64_t>(num_splits, spatial_dim_count);
-
-  batch_collapse_reshape_dims.erase(
-      batch_collapse_reshape_dims.begin() + activations_batch_dim,
-      batch_collapse_reshape_dims.begin() + activations_batch_dim +
-          spatial_dim_count);
-  batch_collapse_reshape_dims[activations_batch_dim] = collapsed_batch_size;
-
-  TF_ASSIGN_OR_RETURN(
-      HloInstruction * batch_collapsed_reshape,
-      MakeReshapeHlo(batch_collapse_reshape_dims, batch_increased_reshape));
-  return batch_collapsed_reshape;
+  VLOG(1) << "Initial padded activations shape "
+          << activations->shape().ToString() << " old_batch_size "
+          << old_batch_size << " activations_batch_dim "
+          << activations_batch_dim;
+  return PerformSplitSpace(activations, spatial_dimensions_to_split,
+                           activations_batch_dim, spatial_split_size,
+                           num_splits);
 }
 
 StatusOr<std::pair<HloInstruction*, std::vector<int64_t>>>
@@ -2863,7 +2949,7 @@ ConvolutionVisitor::SplitSpace(
   std::vector<int64_t> transpose_dims = retval.transpose_dims;
   TF_ASSIGN_OR_RETURN(
       auto new_activations,
-      SplitSpaceHelper(activations, *spatial_dimensions_to_split,
+      PadAndSplitSpace(activations, *spatial_dimensions_to_split,
                        activations_batch_dim, high_padding, low_padding,
                        spatial_split_size, num_splits));
   return std::make_pair(new_activations, transpose_dims);
@@ -2899,7 +2985,7 @@ StatusOr<HloInstruction*> ConvolutionVisitor::PropagateOnConstant(
        old_batch_size * producer->shape().dimensions(old_space_dim)) /
       old_batch_size;
 
-  auto new_consumer = SplitSpaceHelper(
+  auto new_consumer = PadAndSplitSpace(
       consumer, new_spatial_dims, new_batch_dim, high_padding,
       /*low_padding=*/0, new_producer->shape().dimensions(new_space_dim),
       ctrl_.number_of_splits);
@@ -3143,27 +3229,22 @@ Status ConvolutionVisitor::PropagateOnBackpropFilterConv(
     new_split_dim_size = expected_split_dim_size;
     TF_ASSIGN_OR_RETURN(
         activations_new,
-        IncreaseSpatialSizeOnSpaceToBatchedShape(
+        ChangeSpatialSizeOnSpaceToBatchedShape(
             activations_new, activations_batch_dim, old_batch_size,
-            spatial_dimension_to_split, new_split_dim_size));
+            spatial_dimensions_to_split, new_split_dim_size, true));
   }
 
+  spatial_dimension_to_split = spatial_dimensions_to_split[0];
+
   auto select_val = computation_->AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::Zero(activations_new->shape().element_type())));
 
   if (!activations_locally_space_to_batched) {
-    std::vector<int64_t> new_split_spatial_dims(
-        ctrl_.dimension_from_end_to_convert);
-
-    // TODO(b/189500737) : Extend this once
-    // IncreaseSpatialSizeOnSpaceToBatchedShape returns all dimensions.
-    new_split_spatial_dims[0] = spatial_dimension_to_split;
-
     // Select activations correctly by masking additional space.
     TF_ASSIGN_OR_RETURN(
         activations_new,
         SelectValidPortion(activations_new, activations_old, select_val,
-                           activations_batch_dim, new_split_spatial_dims,
+                           activations_batch_dim, spatial_dimensions_to_split,
                            old_batch_dim, old_split_spatial_dims));
   }
   if (!kernel_locally_space_to_batched) {
@@ -3214,7 +3295,7 @@ Status ConvolutionVisitor::PropagateOnBackpropFilterConv(
     }
     TF_ASSIGN_OR_RETURN(
         HloInstruction * activations_slice,
-        HaloDuplicateWithSlice(activations_to_use, spatial_dimension_to_split,
+        HaloDuplicateWithSlice(activations_to_use, spatial_dimensions_to_split,
                                activations_batch_dim, /*low_padding=*/1,
                                /*halo_size=*/0));
     activations_chunks.push_back(activations_slice);
@@ -3247,7 +3328,7 @@ Status ConvolutionVisitor::PropagateOnBackpropFilterConv(
         TF_ASSIGN_OR_RETURN(
             activations_slice,
             HaloDuplicateWithSlice(
-                activations_to_use, spatial_dimension_to_split,
+                activations_to_use, spatial_dimensions_to_split,
                 activations_batch_dim,
                 /*low_padding=*/inherent_low_padding, /*halo_size=*/0));
       } else {
@@ -3256,11 +3337,11 @@ Status ConvolutionVisitor::PropagateOnBackpropFilterConv(
     } else {
       activations_to_use = activations_chunks.back();
 
-      TF_ASSIGN_OR_RETURN(
-          activations_slice,
-          HaloDuplicateWithSlice(activations_to_use, spatial_dimension_to_split,
-                                 activations_batch_dim, /*low_padding=*/-1,
-                                 /*halo_size=*/0));
+      TF_ASSIGN_OR_RETURN(activations_slice,
+                          HaloDuplicateWithSlice(
+                              activations_to_use, spatial_dimensions_to_split,
+                              activations_batch_dim, /*low_padding=*/-1,
+                              /*halo_size=*/0));
     }
 
     activations_chunks.push_back(activations_slice);
@@ -3283,7 +3364,7 @@ Status ConvolutionVisitor::PropagateOnBackpropFilterConv(
 
     TF_ASSIGN_OR_RETURN(
         HloInstruction * activations_slice,
-        HaloDuplicateWithSlice(activations_to_use, spatial_dimension_to_split,
+        HaloDuplicateWithSlice(activations_to_use, spatial_dimensions_to_split,
                                activations_batch_dim,
                                /*low_padding=*/-1, /*halo_size=*/0));
     activations_chunks.push_back(activations_slice);
@@ -3653,11 +3734,10 @@ Status ConvolutionVisitor::PerformSpaceToBatchOnConvolution(
 
   VLOG(1) << "First reshape done " << batch_increased_reshape->ToString();
 
-  int64_t spatial_dimension_to_split = spatial_dimensions_to_split[0];
   TF_ASSIGN_OR_RETURN(
       activations,
       HaloDuplicateWithSlice(
-          batch_increased_reshape, spatial_dimension_to_split,
+          batch_increased_reshape, spatial_dimensions_to_split,
           activations_batch_dim,
           /*low_padding=*/
           handle_low_pad_in_first_reshape ? 0 : low_pad_to_handle_base_dilation,
@@ -3698,10 +3778,13 @@ Status ConvolutionVisitor::PerformSpaceToBatchOnConvolution(
   VLOG(1) << "New dim numbers " << new_dim_numbers.DebugString()
           << " batch dim " << new_dim_numbers.input_batch_dimension();
   auto new_window = convolution->window();
-  new_window.mutable_dimensions(get_first_chosen_spatial_dim(convolution))
-      ->set_padding_high(c.high_padding_for_conv);
-  new_window.mutable_dimensions(get_first_chosen_spatial_dim(convolution))
-      ->set_padding_low(c.low_padding_for_conv);
+  const int64_t first_dim = get_first_chosen_spatial_dim(convolution);
+  for (int i = 0; i < ctrl_.count_of_dimensions_to_convert; ++i) {
+    new_window.mutable_dimensions(first_dim + i)
+        ->set_padding_high(c.high_padding_for_conv);
+    new_window.mutable_dimensions(first_dim + i)
+        ->set_padding_low(c.low_padding_for_conv);
+  }
   TF_ASSIGN_OR_RETURN(
       HloInstruction * new_conv,
       MakeConvolveHlo(
@@ -3719,14 +3802,13 @@ Status ConvolutionVisitor::PerformSpaceToBatchOnConvolution(
   VLOG(1) << "Space-to-batched convolution " << new_conv->ToString();
 
   std::vector<int64_t> new_output_split_spatial_dims(
-      ctrl_.dimension_from_end_to_convert),
-      old_output_split_spatial_dims(ctrl_.dimension_from_end_to_convert);
-  for (int i = 0; i < ctrl_.dimension_from_end_to_convert; ++i) {
-    old_output_split_spatial_dims[i] = dim_numbers.output_spatial_dimensions(
-        get_first_chosen_spatial_dim(convolution) + i);
+      ctrl_.count_of_dimensions_to_convert),
+      old_output_split_spatial_dims(ctrl_.count_of_dimensions_to_convert);
+  for (int i = 0; i < ctrl_.count_of_dimensions_to_convert; ++i) {
+    old_output_split_spatial_dims[i] =
+        dim_numbers.output_spatial_dimensions(first_dim + i);
     new_output_split_spatial_dims[i] =
-        new_dim_numbers.output_spatial_dimensions(
-            get_first_chosen_spatial_dim(convolution) + i);
+        new_dim_numbers.output_spatial_dimensions(first_dim + i);
   }
 
   const int64_t output_batch_dim = new_dim_numbers.output_batch_dimension();
diff --git a/tensorflow/compiler/xla/service/space_to_batch_converter_test.cc b/tensorflow/compiler/xla/service/space_to_batch_converter_test.cc
index 71966f8f5a032d..47b9b171c851f0 100644
--- a/tensorflow/compiler/xla/service/space_to_batch_converter_test.cc
+++ b/tensorflow/compiler/xla/service/space_to_batch_converter_test.cc
@@ -56,13 +56,15 @@ ENTRY computation {
   EXPECT_THAT(root->operand(0), op::Slice());
   auto reshape = root->operand(0)->operand(0);
   EXPECT_THAT(reshape, op::Reshape());
-  EXPECT_THAT(reshape->operand(0)->operand(1), op::Convolution());
-  const int64_t batch_dim = reshape->operand(0)
+  auto previous_reshape = reshape->operand(0);
+  EXPECT_THAT(previous_reshape, op::Reshape());
+  EXPECT_THAT(previous_reshape->operand(0)->operand(1), op::Convolution());
+  const int64_t batch_dim = previous_reshape->operand(0)
                                 ->operand(1)
                                 ->convolution_dimension_numbers()
                                 .output_batch_dimension();
   // Verify that the transform has increased the batch size.
-  EXPECT_GT(reshape->operand(0)->shape().dimensions(batch_dim), 1);
+  EXPECT_GT(previous_reshape->operand(0)->shape().dimensions(batch_dim), 1);
 }
 
 TEST_F(SpaceToBatchConverterTest, SimpleBatch1ConvXpose) {
@@ -91,9 +93,11 @@ ENTRY computation {
   EXPECT_THAT(root->operand(0), op::Slice());
   auto reshape = root->operand(0)->operand(0);
   EXPECT_THAT(reshape, op::Reshape());
+  auto previous_reshape = reshape->operand(0);
+  EXPECT_THAT(previous_reshape, op::Reshape());
   // This should be the original root transpose - which we handle transparently.
-  EXPECT_THAT(reshape->operand(0), op::Select());
-  EXPECT_THAT(reshape->operand(0)->operand(1), op::Convolution());
+  EXPECT_THAT(previous_reshape->operand(0), op::Select());
+  EXPECT_THAT(previous_reshape->operand(0)->operand(1), op::Convolution());
 }
 
 TEST_F(SpaceToBatchConverterTest, SimpleBatch1WithReduceWindow) {
@@ -195,13 +199,15 @@ TEST_F(SpaceToBatchConverterTest, Batch1WithStrideAndPad) {
   EXPECT_THAT(root->operand(0), op::Slice());
   auto reshape = root->operand(0)->operand(0);
   EXPECT_THAT(reshape, op::Reshape());
-  EXPECT_THAT(reshape->operand(0)->operand(1), op::Convolution());
-  const int64_t batch_dim = reshape->operand(0)
+  auto previous_reshape = reshape->operand(0);
+  EXPECT_THAT(previous_reshape, op::Reshape());
+  EXPECT_THAT(previous_reshape->operand(0)->operand(1), op::Convolution());
+  const int64_t batch_dim = previous_reshape->operand(0)
                                 ->operand(1)
                                 ->convolution_dimension_numbers()
                                 .output_batch_dimension();
 
-  EXPECT_GT(reshape->operand(0)->shape().dimensions(batch_dim), 4);
+  EXPECT_GT(previous_reshape->operand(0)->shape().dimensions(batch_dim), 4);
 }
 
 TEST_F(SpaceToBatchConverterTest, Batch1WithBaseDilation) {
@@ -230,13 +236,15 @@ ENTRY computation {
   EXPECT_THAT(root->operand(0), op::Slice());
   auto reshape = root->operand(0)->operand(0);
   EXPECT_THAT(reshape, op::Reshape());
-  EXPECT_THAT(reshape->operand(0)->operand(1), op::Convolution());
-  const int64_t batch_dim = reshape->operand(0)
+  auto previous_reshape = reshape->operand(0);
+  EXPECT_THAT(previous_reshape, op::Reshape());
+  EXPECT_THAT(previous_reshape->operand(0)->operand(1), op::Convolution());
+  const int64_t batch_dim = previous_reshape->operand(0)
                                 ->operand(1)
                                 ->convolution_dimension_numbers()
                                 .output_batch_dimension();
 
-  EXPECT_GT(reshape->operand(0)->shape().dimensions(batch_dim), 4);
+  EXPECT_GT(previous_reshape->operand(0)->shape().dimensions(batch_dim), 4);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
index e22324d4e2e2a9..b33c2c28c6d23e 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/container/flat_hash_set.h"
 #include "absl/container/inlined_vector.h"
+#include "tensorflow/compiler/xla/service/compile_time_cap.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/tuple_util.h"
 #include "tensorflow/compiler/xla/service/while_loop_analysis.h"
@@ -128,7 +129,7 @@ bool WhileLoopInvariantCodeMotion::NotWorthHoistingIndividually(
 
 StatusOr<bool>
 WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
-    HloInstruction* while_instr) {
+    HloInstruction* while_instr, BoundNonLinearCompilerAnalysis* allowance) {
   auto print_no_metadata = HloPrintOptions{}.set_print_metadata(false);
 
   if (!while_instr->shape().IsTuple()) {
@@ -205,6 +206,11 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
   std::vector<HloInstruction*> replacement_instructions;
 
   for (auto* instruction : while_body->MakeInstructionPostOrder()) {
+    allowance->DeductCost(1);
+    if (!allowance->ContinueAnalysis()) {
+      return false;
+    }
+
     if (instruction->HasSideEffect() ||
         instruction->opcode() == HloOpcode::kParameter ||
         !instruction->control_predecessors().empty() ||
@@ -216,7 +222,6 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
         instruction->opcode() != HloOpcode::kReshape) {
       continue;
     }
-
     // Constants don't inflate, so size inflation check doesn't make sense for
     // constants.
     if (hoist_size_inflation_ratio_ &&
@@ -323,6 +328,7 @@ StatusOr<bool> WhileLoopInvariantCodeMotion::Run(HloModule* module) {
                       return instr->opcode() == HloOpcode::kWhile;
                     });
   }
+  BoundNonLinearCompilerAnalysis allowance(module, name(), 10);
 
   for (HloInstruction* while_instr : while_instrs) {
     // Right now we only hoist computations from the while body, but
@@ -337,9 +343,12 @@ StatusOr<bool> WhileLoopInvariantCodeMotion::Run(HloModule* module) {
     // * We delete while loops that have a zero trip count, so this would have
     //   to be a while loop with a somewhat opaque condition expression.
 
+    if (!allowance.ContinueAnalysis()) {
+      break;
+    }
     TF_ASSIGN_OR_RETURN(
         bool result,
-        TryHoistingInvariantInstructionsFromWhileBody(while_instr));
+        TryHoistingInvariantInstructionsFromWhileBody(while_instr, &allowance));
     changed |= result;
   }
 
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
index 976ca3b8210bb5..317ebdae5729e7 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_INVARIANT_CODE_MOTION_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_INVARIANT_CODE_MOTION_H_
 
+#include "tensorflow/compiler/xla/service/compile_time_cap.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -70,7 +71,7 @@ class WhileLoopInvariantCodeMotion : public HloModulePass {
  private:
   bool NotWorthHoistingIndividually(const HloInstruction& instruction);
   StatusOr<bool> TryHoistingInvariantInstructionsFromWhileBody(
-      HloInstruction* while_instr);
+      HloInstruction* while_instr, BoundNonLinearCompilerAnalysis* allowance);
 
   bool hoist_constants_;
   bool hoist_reshapes_;
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesAggregateStats.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesAggregateStats.pbtxt
index d5a5502565d50a..cc957ded22f027 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesAggregateStats.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesAggregateStats.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesAggregateStats"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "node_ids"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt
index bfaf3d2ea5912b..af28071a7e12ac 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesBucketize.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesBucketize"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "float_values"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplit.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplit.pbtxt
index 661394a1cc574e..fa32e29058b43e 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplit.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplit.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesCalculateBestFeatureSplit"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "node_id_range"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplitV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplitV2.pbtxt
index 84382d8a99ce99..51c639752fab0c 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplitV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestFeatureSplitV2.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesCalculateBestFeatureSplitV2"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "node_id_range"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
index ad273f7fca18b6..30b007c295b43e 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCalculateBestGainsPerFeature.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesCalculateBestGainsPerFeature"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "node_id_range"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCenterBias.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCenterBias.pbtxt
index b58b974eb4e43b..127df0513c62a0 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCenterBias.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCenterBias.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesCenterBias"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "tree_ensemble_handle"
@@ -38,4 +39,4 @@ Bool, whether to continue bias centering.
 END
   }
   summary: "Calculates the prior from the training data (the bias) and fills in the first node with the logits' prior. Returns a boolean indicating whether to continue centering."
-}
\ No newline at end of file
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt
index aee73b910f0ae8..86d13b616d2a0a 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateEnsemble.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesCreateEnsemble"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "tree_ensemble_handle"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateQuantileStreamResource.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateQuantileStreamResource.pbtxt
index 20da1295f6a39a..1a457d29b0d7a9 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateQuantileStreamResource.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesCreateQuantileStreamResource.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesCreateQuantileStreamResource"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "quantile_stream_resource_handle"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
index b1602ba045b95d..28fba0ed3b2cac 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesDeserializeEnsemble.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesDeserializeEnsemble"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "tree_ensemble_handle"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
index 1bce5639a2049d..5ff4a9adc7faab 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesEnsembleResourceHandleOp.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesEnsembleResourceHandleOp"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   summary: "Creates a handle to a BoostedTreesEnsembleResource"
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesExampleDebugOutputs.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesExampleDebugOutputs.pbtxt
index 2f87b6f8f1e65f..42c443534608d2 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesExampleDebugOutputs.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesExampleDebugOutputs.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesExampleDebugOutputs"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "bucketized_features"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesFlushQuantileSummaries.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesFlushQuantileSummaries.pbtxt
index 87cbe1bc39c503..52626122a0ffa1 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesFlushQuantileSummaries.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesFlushQuantileSummaries.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesFlushQuantileSummaries"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "quantile_stream_resource_handle"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
index 4377125224979a..878083c7858d70 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesGetEnsembleStates.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesGetEnsembleStates"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "tree_ensemble_handle"
@@ -40,4 +41,4 @@ END
 
   }
   summary: "Retrieves the tree ensemble resource stamp token, number of trees and growing statistics."
-}
\ No newline at end of file
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
index e7a3ca3d9fd051..9427b5b4df2cef 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeQuantileSummaries.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesMakeQuantileSummaries"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "float_values"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
index dc0856c900d1b1..b954aaa05b4e08 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesMakeStatsSummary.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesMakeStatsSummary"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "node_ids"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
index 60ad9b4640f77a..2bb14b70d61931 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesPredict.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesPredict"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "bucketized_features"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceAddSummaries.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceAddSummaries.pbtxt
index bbeecbf32bd96c..544edba35b9705 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceAddSummaries.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceAddSummaries.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesQuantileStreamResourceAddSummaries"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "quantile_stream_resource_handle"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt
index 7e61e5fa93aae4..15f0ce0dc2fe36 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceDeserialize.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesQuantileStreamResourceDeserialize"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "quantile_stream_resource_handle"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceFlush.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceFlush.pbtxt
index 2fd94efa1073c2..2584c373f39d45 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceFlush.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceFlush.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesQuantileStreamResourceFlush"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "quantile_stream_resource_handle"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceGetBucketBoundaries.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceGetBucketBoundaries.pbtxt
index 206672802f2041..aa3f67ce687bb5 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceGetBucketBoundaries.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceGetBucketBoundaries.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesQuantileStreamResourceGetBucketBoundaries"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "quantile_stream_resource_handle"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceHandleOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceHandleOp.pbtxt
index cb7786c051df8d..c598fb91e55fc8 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceHandleOp.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesQuantileStreamResourceHandleOp.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesQuantileStreamResourceHandleOp"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   summary: "Creates a handle to a BoostedTreesQuantileStreamResource."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
index c0b3688d8a3f23..d7ea7e5768e004 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSerializeEnsemble.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesSerializeEnsemble"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "tree_ensemble_handle"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesSparseAggregateStats.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSparseAggregateStats.pbtxt
index d282ce0111d5d7..8d8221f7ab84fe 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesSparseAggregateStats.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSparseAggregateStats.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesSparseAggregateStats"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "node_ids"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesSparseCalculateBestFeatureSplit.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSparseCalculateBestFeatureSplit.pbtxt
index 8ca52f168dda23..2c1bb86b1e56bb 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesSparseCalculateBestFeatureSplit.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesSparseCalculateBestFeatureSplit.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesSparseCalculateBestFeatureSplit"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "node_id_range"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
index f8a3639c9b715c..40860081a0b8df 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesTrainingPredict.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesTrainingPredict"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "cached_tree_ids"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
index 3cf486d087d518..79e1a40e950650 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsemble.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesUpdateEnsemble"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "tree_ensemble_handle"
diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsembleV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsembleV2.pbtxt
index 66404dca4e584b..d5668840dcd572 100644
--- a/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsembleV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesUpdateEnsembleV2.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BoostedTreesUpdateEnsembleV2"
+  deprecation_message: "Boosted Trees kernels in TF are deprecated. Please use TensorFlow Decision Forests instead (https://github.com/tensorflow/decision-forests)"
   visibility: HIDDEN
   in_arg {
     name: "tree_ensemble_handle"
diff --git a/tensorflow/core/common_runtime/control_flow_deps_to_chains.cc b/tensorflow/core/common_runtime/control_flow_deps_to_chains.cc
index 4411c3b59bc722..dcc630e325bb4d 100644
--- a/tensorflow/core/common_runtime/control_flow_deps_to_chains.cc
+++ b/tensorflow/core/common_runtime/control_flow_deps_to_chains.cc
@@ -193,7 +193,7 @@ Status ControlFlowDepsToChainsPass::Run(
       modified_body.mutable_ret()->insert(
           {c_ret_name, strings::StrCat(c_out_name, ":output:0")});
       AttrValue attr_val;
-      attr_val.mutable_list()->mutable_shape();
+      attr_val.mutable_list()->add_shape();
       FunctionDef_ArgAttrs arg_attrs;
       arg_attrs.mutable_attr()->insert({"_output_shapes", attr_val});
       modified_body.mutable_arg_attr()->insert(
@@ -260,7 +260,7 @@ Status ControlFlowDepsToChainsPass::Run(
 
       // TODO(mdan): Return values on the cond function? Most likely a bug.
       AttrValue attr_val;
-      attr_val.mutable_list()->mutable_shape();
+      attr_val.mutable_list()->add_shape();
       FunctionDef_ArgAttrs arg_attrs;
       arg_attrs.mutable_attr()->insert({"_output_shapes", attr_val});
       modified_cond.mutable_arg_attr()->insert(
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index e804ff403a27bf..a4dc05286ce71b 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -431,6 +431,16 @@ void EagerContext::SetExecutorForThread(EagerExecutor* executor) {
             thread_local_executor_.erase(thread_id);
           }
           has_cleanup_[thread_id].erase(executor);
+          // Clears the global rendezvous after cleaning up the executor. This
+          // is needed when running in eager op as function mode because it
+          // re-uses the EagerContext's global_rendezvous_for_functions. The
+          // global rendezvous can end up in a bad state if any op ends in a
+          // bad state after execution.
+          if (!GetGlobalRendezvousForFunctionLocalRendezvousStatus().ok()) {
+            VLOG(6) << "global_rendezvous_for_functions_ is in bad state. "
+                       "Resetting.";
+            ResetGlobalRendezvousForFunction();
+          }
         }
       });
       executor->AddCleanup(reinterpret_cast<intptr_t>(this),
diff --git a/tensorflow/core/common_runtime/eager/context_distributed_manager.cc b/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
index 6f38dcb46a2567..eba81a8232f449 100644
--- a/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
+++ b/tensorflow/core/common_runtime/eager/context_distributed_manager.cc
@@ -622,8 +622,7 @@ Status UpdateContextWithServerDef(EagerContext* context,
     LOG_AND_RETURN_IF_ERROR(server->Start());
   } else {
     sg.Update(server->worker_env()->session_mgr->UpdateSession(
-        session_name, server_def, base_request.cluster_device_attributes(),
-        /*isolate_session_state=*/true));
+        session_name, server_def, base_request.cluster_device_attributes()));
     sg.Update(context->UpdateRemoteMaster(context_id,
                                           std::move(remote_eager_workers),
                                           added_workers, removed_workers));
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index bd14dca049ae23..222a8b9cbec934 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -482,6 +482,18 @@ Status BuildWrappedOpName(EagerOperation* op, const OpDef& opdef,
   return Status::OK();
 }
 
+// Validates the node def. This is required when running in eager op as function
+// mode because this code path does not go through the _apply_op_helper's
+// validation (which is reached when executing in graph mode)
+// or the eager execution's validation (which is reached via the CreateOpKernel
+// call).
+Status ValidateOp(EagerOperation* op) {
+  const NodeDef& node_def = op->MutableAttrs()->BuildNodeDef();
+  const OpDef* op_def;
+  TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(node_def.op(), &op_def));
+  return ValidateNodeDef(node_def, *op_def);
+}
+
 // Builds the signature of the wrapping FunctionDef for an eager op.
 //
 // For ops without variadic inputs/outputs, the signature is the same as the
@@ -631,7 +643,7 @@ Status BuildWrappedOpSignature(EagerOperation* op, const OpDef& opdef,
           return errors::Internal("Unable to read attr ", arg.number_attr(),
                                   " for op ", op->Name());
         }
-        for (size_t i = 0; i < number_attr; i++) {
+        for (int64_t i = 0; i < number_attr; i++) {
           auto arg_def = sig_args->Add();
           arg_def->set_name(GetFlatName(arg.name(), i));
           if (!arg.type_attr().empty()) {
@@ -819,6 +831,19 @@ Status WrapInCallOp(EagerOperation* op, EagerOperation** wrapped_op) {
   return AddMixedTypeListAttrs(*wrapped_op, op_attrs, opdef);
 }
 
+bool IntArgsAndRetvalsOnDevice(EagerOperation* op) {
+  // Most TF ops expect and generate int32 tensors on the host (or a TPU/XLA
+  // device). This is not the case with IteratorGetNext since it is possible to
+  // build int32 datasets that produce outputs on device when using
+  // prefetch_to_device.
+  // When running call ops, by default we assume that the int32 outputs are on a
+  // host (except for the XLA/TPU case). So we need to special case
+  // IteratorGetNext such that its eager behavior matches the wrapped one.
+  // TODO(b/208435025): Remove this if we end up deciding that int32 outputs
+  // from IteratorGetNext should indeed live on host.
+  return op->Name() == "IteratorGetNext";
+}
+
 Status GetOrCreateKernelAndDevice(
     EagerOperation* op, TensorHandle** retvals, int* num_retvals,
     core::RefCountPtr<KernelAndDevice>* out_kernel) {
@@ -988,15 +1013,18 @@ Status GetOrCreateKernelAndDevice(
     // expect unsupported ops to be outside compiled but that is not supported
     // on GPUs right now.
     bool allow_small_function_optimizations = false;
+    bool int_args_and_retvals_on_device = false;
     if (ctx.RunEagerOpAsFunction() && !op->is_function()) {
       EagerOperation* wrapped_op = nullptr;
+      TF_RETURN_IF_ERROR(ValidateOp(op));
       TF_RETURN_IF_ERROR(WrapInCallOp(op, &wrapped_op));
       DCHECK(wrapped_op);
       DCHECK(wrapped_op->is_function());
       wrapped_op_releaser.reset(wrapped_op);
-      op = wrapped_op;
       run_function_with_flr = true;
       allow_small_function_optimizations = true;
+      int_args_and_retvals_on_device = IntArgsAndRetvalsOnDevice(op);
+      op = wrapped_op;
     }
     const NodeDef& ndef = op->MutableAttrs()->BuildNodeDef();
 
@@ -1041,7 +1069,8 @@ Status GetOrCreateKernelAndDevice(
           std::move(input_resource_variable_dtypes_and_shapes), runner,
           ctx.GetCollectiveExecutorHandle(), ctx.HostCPU(), op->Name(),
           function_outputs_on_op_device, allow_small_function_optimizations,
-          std::move(rendezvous_creator), get_op_id));
+          int_args_and_retvals_on_device, std::move(rendezvous_creator),
+          get_op_id));
     } else {
       VLOG(2) << "Running " << ndef.op() << " using op kernel. "
               << ". Full node_def=" << ndef.DebugString();
diff --git a/tensorflow/core/common_runtime/eager/execute_node_test.cc b/tensorflow/core/common_runtime/eager/execute_node_test.cc
index 4564fec059279a..3ff0c4e964da57 100644
--- a/tensorflow/core/common_runtime/eager/execute_node_test.cc
+++ b/tensorflow/core/common_runtime/eager/execute_node_test.cc
@@ -39,6 +39,7 @@ class TestKernelAndDeviceFunc final : public KernelAndDeviceFunc {
             /*runner=*/nullptr, /*collective_executor=*/nullptr,
             host_cpu_device, /*name=*/"", /*outputs_on_op_device=*/false,
             /*allow_small_function_optimizations=*/false,
+            /*int_args_and_retvals_on_device=*/false,
             /*rendezvous_creator=*/nullptr, /*get_op_id=*/nullptr),
         test_input_devices_(std::move(input_devices)) {}
 
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index 80170d8bea4699..f1e928511c5c7a 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -224,6 +224,8 @@ Status KernelAndDeviceFunc::InstantiateFunc(const bool log_device_placement,
 
   options.config_proto.set_log_device_placement(log_device_placement);
 
+  options.int_args_and_retvals_on_device = int_args_and_retvals_on_device_;
+
   TF_RETURN_IF_ERROR(
       pflr_->Instantiate(ndef.op(), AttrSlice(ndef), options, &handle_));
   return pflr_->IsCrossProcess(handle_, &is_cross_process_);
@@ -423,7 +425,12 @@ Status KernelAndDeviceFunc::Run(
                     eager_func_params, stack_trace, coordination_service_agent);
 
   std::vector<Tensor> rets;
-  Status s = pflr_->RunSync(*opts, handle_, inputs.GetLocalTensors(), &rets);
+  Status s;
+  {
+    port::ScopedFlushDenormal flush;
+    port::ScopedSetRound round(FE_TONEAREST);
+    s.Update(pflr_->RunSync(*opts, handle_, inputs.GetLocalTensors(), &rets));
+  }
 
   if (cancellation_manager == nullptr) {
     delete opts->cancellation_manager;
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index 4d01dcb8dd0dab..cf7686069d1788 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -276,6 +276,7 @@ class KernelAndDeviceFunc : public KernelAndDevice {
       Device* host_cpu_device, const string& name,
       const bool outputs_on_op_device,
       const bool allow_small_function_optimizations,
+      const bool int_args_and_retvals_on_device,
       std::function<Rendezvous*(const int64_t)> rendezvous_creator,
       std::function<int64_t()> get_op_id)
       : KernelAndDevice(flr, runner, std::move(collective_executor),
@@ -284,6 +285,7 @@ class KernelAndDeviceFunc : public KernelAndDevice {
         handle_(kInvalidHandle),
         outputs_on_op_device_(outputs_on_op_device),
         allow_small_function_optimizations_(allow_small_function_optimizations),
+        int_args_and_retvals_on_device_(int_args_and_retvals_on_device),
         input_devices_(std::move(input_devices)),
         composite_devices_(std::move(composite_devices)),
         input_resource_dtypes_and_shapes_(
@@ -355,6 +357,8 @@ class KernelAndDeviceFunc : public KernelAndDevice {
   // be faster under some conditions.)
   const bool allow_small_function_optimizations_;
 
+  const bool int_args_and_retvals_on_device_;
+
   // CPU devices are null. Resource handles' devices are actual backing
   // devices.
   std::vector<Device*> output_devices_;
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index 9459c6797570f1..1fa59b8c53b651 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -114,7 +114,7 @@ class GPUDeviceTest : public ::testing::Test {
   }
 };
 
-TEST_F(GPUDeviceTest, CudaMallocAsync) {
+TEST_F(GPUDeviceTest, DISABLED_ON_GPU_ROCM(CudaMallocAsync)) {
   // cudaMallocAsync supported only when cuda toolkit and driver supporting
   // CUDA 11.2+
 #ifndef GOOGLE_CUDA
diff --git a/tensorflow/core/common_runtime/partitioning_utils.cc b/tensorflow/core/common_runtime/partitioning_utils.cc
index 1a004330274223..0c23128b3d5b27 100644
--- a/tensorflow/core/common_runtime/partitioning_utils.cc
+++ b/tensorflow/core/common_runtime/partitioning_utils.cc
@@ -73,10 +73,10 @@ Status PartitionFunctionGraph(
 }
 
 Status UpdateArgAndRetvalMetadata(
-    Graph* graph, const string& device_type,
-    std::vector<FunctionArgIndex>* arg_indices, std::vector<int>* ret_indices,
+    Graph* graph, std::vector<FunctionArgIndex>* arg_indices,
+    std::vector<int>* ret_indices,
     std::vector<AllocatorAttributes>* arg_alloc_attrs,
-    std::vector<AllocatorAttributes>* ret_alloc_attrs) {
+    std::vector<AllocatorAttributes>* ret_alloc_attrs, bool ints_on_device) {
   std::vector<std::pair<Node*, FunctionArgIndex>> arg_nodes;
   std::vector<std::pair<Node*, int>> ret_nodes;
   const AttrValue* attr_value;
@@ -126,10 +126,8 @@ Status UpdateArgAndRetvalMetadata(
     if (arg_alloc_attrs != nullptr) {
       AllocatorAttributes alloc_attr;
       DataType type = attr_value->type();
-      MemoryType mtype = (device_type == "TPU" || device_type == "XLA_CPU" ||
-                          device_type == "XLA_GPU")
-                             ? MTypeFromDTypeIntsOnDevice(type)
-                             : MTypeFromDType(type);
+      MemoryType mtype = ints_on_device ? MTypeFromDTypeIntsOnDevice(type)
+                                        : MTypeFromDType(type);
       if (mtype == HOST_MEMORY) {
         alloc_attr.set_on_host(true);
       }
@@ -143,10 +141,8 @@ Status UpdateArgAndRetvalMetadata(
     if (ret_alloc_attrs) {
       AllocatorAttributes alloc_attr;
       DataType type = attr_value->type();
-      MemoryType mtype = (device_type == "TPU" || device_type == "XLA_CPU" ||
-                          device_type == "XLA_GPU")
-                             ? MTypeFromDTypeIntsOnDevice(type)
-                             : MTypeFromDType(type);
+      MemoryType mtype = ints_on_device ? MTypeFromDTypeIntsOnDevice(type)
+                                        : MTypeFromDType(type);
       if (mtype == HOST_MEMORY) {
         alloc_attr.set_on_host(true);
       }
diff --git a/tensorflow/core/common_runtime/partitioning_utils.h b/tensorflow/core/common_runtime/partitioning_utils.h
index 45866a8200741e..32ac871ff7a289 100644
--- a/tensorflow/core/common_runtime/partitioning_utils.h
+++ b/tensorflow/core/common_runtime/partitioning_utils.h
@@ -63,12 +63,16 @@ Status PartitionFunctionGraph(
 //  (2) records the subsets of `Arg` and `Retval` nodes assigned to the
 //      device in `*_indices`, and
 //  (3) records which `Arg` and `Retval` nodes live in host memory in
-//      `*_alloc_attrs`. If these vectors are NULL, do nothing here.
+//      `*_alloc_attrs`. If these vectors are NULL, do nothing here. If
+//      `ints_on_device` is false, int32 `Arg` and `Retval` nodes are placed on
+//      host else not. This is needed because in certain special cases e.g.
+//      when graph is placed on TPU/XLA device or when the `Retval` is an output
+//      of an iterator, int32 tensors live on device.
 Status UpdateArgAndRetvalMetadata(
-    Graph* graph, const string& device_type,
-    std::vector<FunctionArgIndex>* arg_indices, std::vector<int>* ret_indices,
+    Graph* graph, std::vector<FunctionArgIndex>* arg_indices,
+    std::vector<int>* ret_indices,
     std::vector<AllocatorAttributes>* arg_alloc_attrs,
-    std::vector<AllocatorAttributes>* ret_alloc_attrs);
+    std::vector<AllocatorAttributes>* ret_alloc_attrs, bool ints_on_device);
 
 // Utility for generating function names not present in `flib_def`, using
 // given `name` as the base for the name.
diff --git a/tensorflow/core/common_runtime/partitioning_utils_test.cc b/tensorflow/core/common_runtime/partitioning_utils_test.cc
index 6639740f59ddcc..acf6898d9db5c9 100644
--- a/tensorflow/core/common_runtime/partitioning_utils_test.cc
+++ b/tensorflow/core/common_runtime/partitioning_utils_test.cc
@@ -202,11 +202,9 @@ TEST_F(PartitioningUtilsTest, UpdateArgsAndRets) {
   std::vector<AllocatorAttributes> arg_alloc_attrs;
   std::vector<AllocatorAttributes> ret_alloc_attrs;
 
-  string device_type = "CPU";
-
   Status status = UpdateArgAndRetvalMetadata(
-      graph.get(), device_type, &arg_indices, &ret_indices, &arg_alloc_attrs,
-      &ret_alloc_attrs);
+      graph.get(), &arg_indices, &ret_indices, &arg_alloc_attrs,
+      &ret_alloc_attrs, /*ints_on_device=*/false);
   ASSERT_TRUE(status.ok()) << status.ToString();
 
   CheckArgIndices({{3, -1}}, arg_indices);
@@ -221,6 +219,42 @@ TEST_F(PartitioningUtilsTest, UpdateArgsAndRets) {
   CheckIndex(*nodes["retval1"], 0);
 }
 
+TEST_F(PartitioningUtilsTest, UpdateArgsAndRetsIntsNotOnDevice) {
+  auto graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SubGraph(graph.get(), DT_INT32, {3}, {5});
+
+  std::vector<FunctionArgIndex> arg_indices;
+  std::vector<int> ret_indices;
+  std::vector<AllocatorAttributes> arg_alloc_attrs;
+  std::vector<AllocatorAttributes> ret_alloc_attrs;
+
+  Status status = UpdateArgAndRetvalMetadata(
+      graph.get(), &arg_indices, &ret_indices, &arg_alloc_attrs,
+      &ret_alloc_attrs, /*ints_on_device=*/false);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  CheckAlloc({true}, arg_alloc_attrs);
+  CheckAlloc({true}, ret_alloc_attrs);
+}
+
+TEST_F(PartitioningUtilsTest, UpdateArgsAndRetsIntsOnDevice) {
+  auto graph = absl::make_unique<Graph>(OpRegistry::Global());
+  SubGraph(graph.get(), DT_INT32, {3}, {5});
+
+  std::vector<FunctionArgIndex> arg_indices;
+  std::vector<int> ret_indices;
+  std::vector<AllocatorAttributes> arg_alloc_attrs;
+  std::vector<AllocatorAttributes> ret_alloc_attrs;
+
+  Status status = UpdateArgAndRetvalMetadata(
+      graph.get(), &arg_indices, &ret_indices, &arg_alloc_attrs,
+      &ret_alloc_attrs, /*ints_on_device=*/true);
+  ASSERT_TRUE(status.ok()) << status.ToString();
+
+  CheckAlloc({false}, arg_alloc_attrs);
+  CheckAlloc({false}, ret_alloc_attrs);
+}
+
 TEST_F(PartitioningUtilsTest, UpdateArgsAndRets_Order) {
   auto graph = absl::make_unique<Graph>(OpRegistry::Global());
   SubGraph(graph.get(), DT_FLOAT, {9, 7, 5, 3, 1}, {2, 4, 6, 8, 10});
@@ -241,11 +275,9 @@ TEST_F(PartitioningUtilsTest, UpdateArgsAndRets_Order) {
   std::vector<AllocatorAttributes> arg_alloc_attrs;
   std::vector<AllocatorAttributes> ret_alloc_attrs;
 
-  string device_type = "CPU";
-
   Status status = UpdateArgAndRetvalMetadata(
-      graph.get(), device_type, &arg_indices, &ret_indices, &arg_alloc_attrs,
-      &ret_alloc_attrs);
+      graph.get(), &arg_indices, &ret_indices, &arg_alloc_attrs,
+      &ret_alloc_attrs, /*ints_on_device=*/false);
   ASSERT_TRUE(status.ok()) << status.ToString();
 
   CheckArgIndices({{1, 0}, {3, 1}, {5, 2}, {7, 2}, {9, 0}}, arg_indices);
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc
index 974dfc622efa1f..c29a1ee93314af 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc
@@ -1055,10 +1055,13 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
           dev_set->FindDeviceByName(target)->device_type();
       Graph* subgraph = pair.second.get();
 
+      bool ints_on_device =
+          (device_type == "TPU" || device_type == "XLA_CPU" ||
+           device_type == "XLA_GPU" || options.int_args_and_retvals_on_device);
       status->Update(UpdateArgAndRetvalMetadata(
-          subgraph, device_type, &comp_data->arg_indices,
-          &comp_data->ret_indices, &comp_data->arg_alloc_attrs,
-          &comp_data->ret_alloc_attrs));
+          subgraph, &comp_data->arg_indices, &comp_data->ret_indices,
+          &comp_data->arg_alloc_attrs, &comp_data->ret_alloc_attrs,
+          ints_on_device));
       if (!status->ok()) {
         counter.DecrementCount();
         return;
@@ -1300,6 +1303,7 @@ Status ProcessFunctionLibraryRuntime::RunMultiDeviceSync(
         const string function_and_msg = strings::StrCat(
             errors::FormatFunctionForError(data->function_name_), " ",
             run_status.error_message());
+        if (opts.rendezvous != nullptr) opts.rendezvous->StartAbort(run_status);
         return errors::CreateWithUpdatedMessage(run_status, function_and_msg);
       } else {
         VLOG(2) << "Component function execution succeeded.";
diff --git a/tensorflow/core/data/BUILD b/tensorflow/core/data/BUILD
index ea0828fcf46a06..432651c9084893 100644
--- a/tensorflow/core/data/BUILD
+++ b/tensorflow/core/data/BUILD
@@ -24,8 +24,6 @@ exports_files([
     "captured_function.h",
     "dataset_utils.cc",
     "dataset_utils.h",
-    "file_utils.cc",
-    "file_utils.h",
     "name_utils.cc",
     "name_utils.h",
     "rewrite_utils.cc",
@@ -40,6 +38,8 @@ exports_files([
     "stats_utils.h",
     "unbounded_thread_pool.cc",
     "unbounded_thread_pool.h",
+    "utils.cc",
+    "utils.h",
 ])
 
 cc_library(
@@ -455,7 +455,10 @@ tf_cc_test(
 )
 
 cc_library(
-    name = "file_utils",
-    srcs = ["file_utils.cc"],
-    hdrs = ["file_utils.h"],
+    name = "utils",
+    srcs = ["utils.cc"],
+    hdrs = ["utils.h"],
+    deps = [
+        "//tensorflow/core:framework",
+    ],
 )
diff --git a/tensorflow/core/data/dataset_utils.cc b/tensorflow/core/data/dataset_utils.cc
index a5440b06b3cfcd..9c0b3ef77166a0 100644
--- a/tensorflow/core/data/dataset_utils.cc
+++ b/tensorflow/core/data/dataset_utils.cc
@@ -881,7 +881,7 @@ absl::flat_hash_map<string, int64_t> DatasetExperimentRegistry::Experiments() {
 
 namespace {
 
-REGISTER_DATASET_EXPERIMENT("enable_bufferedio_v2", 0);
+REGISTER_DATASET_EXPERIMENT("enable_bufferedio_v2", 5);
 REGISTER_DATASET_EXPERIMENT("inject_prefetch", 5);
 REGISTER_DATASET_EXPERIMENT("max_parallelism", 100);
 REGISTER_DATASET_EXPERIMENT("max_parallelism_v2", 50);
diff --git a/tensorflow/core/data/root_dataset.cc b/tensorflow/core/data/root_dataset.cc
index e48b31540ac572..8077297fefbc9f 100644
--- a/tensorflow/core/data/root_dataset.cc
+++ b/tensorflow/core/data/root_dataset.cc
@@ -280,6 +280,10 @@ int64_t RootDataset::CardinalityInternal() const {
   return input_->Cardinality();
 }
 
+int64_t RootDataset::CardinalityInternal(CardinalityOptions options) const {
+  return input_->Cardinality(options);
+}
+
 Status RootDataset::Get(OpKernelContext* ctx, int64 index,
                         std::vector<Tensor>* out_tensors) const {
   std::vector<const DatasetBase*> inputs;
diff --git a/tensorflow/core/data/root_dataset.h b/tensorflow/core/data/root_dataset.h
index c234994910f8f5..6f966a8c1bb977 100644
--- a/tensorflow/core/data/root_dataset.h
+++ b/tensorflow/core/data/root_dataset.h
@@ -43,6 +43,7 @@ class RootDataset : public DatasetBase {
   const std::vector<PartialTensorShape>& output_shapes() const override;
 
   int64_t CardinalityInternal() const override;
+  int64_t CardinalityInternal(CardinalityOptions options) const override;
   Status Get(OpKernelContext* ctx, int64 index,
              std::vector<Tensor>* out_tensors) const override;
   Status CheckExternalState() const override;
diff --git a/tensorflow/core/data/file_utils.cc b/tensorflow/core/data/utils.cc
similarity index 75%
rename from tensorflow/core/data/file_utils.cc
rename to tensorflow/core/data/utils.cc
index 642fa377d7015a..aa54305d6e77f1 100644
--- a/tensorflow/core/data/file_utils.cc
+++ b/tensorflow/core/data/utils.cc
@@ -12,16 +12,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/data/file_utils.h"
+#include "tensorflow/core/data/utils.h"
 
 #include <string>
 
+#include "tensorflow/core/framework/metrics.h"
+
 namespace tensorflow {
 namespace data {
-namespace file_utils {
+
+void AddLatencySample(int64_t microseconds) {
+  metrics::RecordTFDataGetNextDuration(microseconds);
+}
+
+void IncrementThroughput(int64_t bytes) {
+  metrics::RecordTFDataBytesFetched(bytes);
+}
 
 std::string TranslateFileName(const std::string& fname) { return fname; }
 
-}  // namespace file_utils
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/data/file_utils.h b/tensorflow/core/data/utils.h
similarity index 75%
rename from tensorflow/core/data/file_utils.h
rename to tensorflow/core/data/utils.h
index 1265e8dc28321b..ac7c45f520f7be 100644
--- a/tensorflow/core/data/file_utils.h
+++ b/tensorflow/core/data/utils.h
@@ -12,21 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_DATA_FILE_UTILS_H_
-#define TENSORFLOW_CORE_DATA_FILE_UTILS_H_
+#ifndef TENSORFLOW_CORE_DATA_UTILS_H_
+#define TENSORFLOW_CORE_DATA_UTILS_H_
 
 #include <string>
 
 namespace tensorflow {
 namespace data {
-namespace file_utils {
+
+// Records latency of fetching data from tf.data iterator.
+void AddLatencySample(int64_t microseconds);
+
+// Records bytes produced by a tf.data iterator.
+void IncrementThroughput(int64_t bytes);
 
 // Returns a modified file name that can be used to do implementation specific
 // file name manipulation/optimization.
 std::string TranslateFileName(const std::string& fname);
 
-}  // namespace file_utils
 }  // namespace data
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_DATA_PORT_UTILS_H_
+#endif  // TENSORFLOW_CORE_DATA_UTILS_H_
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
index e3be9953ba53a4..b400c5e1c979f3 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.cc
@@ -414,28 +414,28 @@ void BaseRemoteRendezvous::RegisterCall(BaseRecvTensorCall* call,
   bool already_cancelled = false;
   InactiveCallback callback = [] {};
   {
-    mutex_lock l(mu_);
+    tf_shared_lock l(mu_);
     if (!status_.ok()) {
       call->StartAbort(status_);
       return;
     }
-    if (cm != nullptr) {
-      auto token = cm->get_cancellation_token();
-      already_cancelled = !cm->RegisterCallback(token, [this, call] {
-        {
-          mutex_lock l(mu_);
-          if (active_.find(call) == active_.end()) return;
-          call->StartAbort(
-              errors::Cancelled("RecvFromRemoteAsync is cancelled."));
-        }
-      });
-      callback = [cm, token] { cm->TryDeregisterCallback(token); };
-    }
-    if (already_cancelled) {
+  }
+  if (cm != nullptr) {
+    auto token = cm->get_cancellation_token();
+    already_cancelled = !cm->RegisterCallback(token, [this, call] {
+      {
+        tf_shared_lock l(mu_);
+        if (active_.find(call) == active_.end()) return;
+      }
       call->StartAbort(errors::Cancelled("RecvFromRemoteAsync is cancelled."));
-    } else {
-      CHECK(active_.emplace(call, callback).second);
-    }
+    });
+    callback = [cm, token] { cm->TryDeregisterCallback(token); };
+  }
+  if (already_cancelled) {
+    call->StartAbort(errors::Cancelled("RecvFromRemoteAsync is cancelled."));
+  } else {
+    mutex_lock l(mu_);
+    CHECK(active_.emplace(call, callback).second);  // Crash OK.
   }
 }
 
diff --git a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
index c0b2c8152668de..343cd713f817df 100644
--- a/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
+++ b/tensorflow/core/distributed_runtime/base_rendezvous_mgr.h
@@ -123,6 +123,13 @@ class BaseRemoteRendezvous : public RemoteRendezvous {
   // Upgrades the BaseRemoteRendezvous to full initialization.
   Status Initialize(WorkerSession* session) override;
 
+  void SetRemoteEagerContextDefault() override {
+    remote_eager_context_default_ = true;
+  }
+  bool IsRemoteEagerContextDefault() override {
+    return remote_eager_context_default_;
+  }
+
   // Forwards to local_, where the Tensor "val" will be buffered and
   // any waiting callback stored.
   Status Send(const ParsedKey& key, const Rendezvous::Args& args,
@@ -176,6 +183,12 @@ class BaseRemoteRendezvous : public RemoteRendezvous {
 
  private:
   Rendezvous* local_;  // Owns a Ref on this object.
+  // Indicates whether this remote rendezvous instance is used as the default
+  // rendezvous for remote eager op-by-op execution. Errors in eager op-by-op
+  // execution should not abort the rendezvous since it is a context-wide
+  // instance and needs to be reused; instead, the errors are propagated through
+  // eager executors.
+  bool remote_eager_context_default_ = false;
 
   mutable mutex mu_;
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
index 5c1c37995f507c..b781263f216d92 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
@@ -262,6 +262,8 @@ Status EagerServiceImpl::CreateContext(const CreateContextRequest* request,
 
   // Initialize remote tensor communication based on worker session.
   TF_RETURN_IF_ERROR(r->Initialize(worker_session.get()));
+  // Set the rendezvous as context-global instance for eager op-by-op execution.
+  r->SetRemoteEagerContextDefault();
 
   std::function<Rendezvous*(const int64_t)> rendezvous_creator =
       [worker_session, this](const int64_t step_id) {
@@ -387,9 +389,9 @@ Status EagerServiceImpl::UpdateContext(const UpdateContextRequest* request,
   auto session_name =
       tensorflow::strings::StrCat("eager_", request->context_id());
 
-  TF_RETURN_IF_ERROR(env_->session_mgr->UpdateSession(
-      session_name, request->server_def(), request->cluster_device_attributes(),
-      true));
+  TF_RETURN_IF_ERROR(
+      env_->session_mgr->UpdateSession(session_name, request->server_def(),
+                                       request->cluster_device_attributes()));
 
   std::shared_ptr<WorkerSession> worker_session;
   TF_RETURN_IF_ERROR(env_->session_mgr->WorkerSessionForSession(
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
index 20d7cdd2a222e1..089373ba1a393b 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
@@ -955,7 +955,8 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncTest) {
       /*runner=*/nullptr,
       /*collective_executor=*/nullptr, local_device, fdef_.signature().name(),
       /*outputs_on_op_device=*/false,
-      /*allow_small_function_optimizations=*/false, ctx->RendezvousCreator(),
+      /*allow_small_function_optimizations=*/false,
+      /*int_args_and_retvals_on_device=*/false, ctx->RendezvousCreator(),
       [=]() { return op_id; }));
 
   // Instantiate MatMulFunction on remote_device.
@@ -1005,7 +1006,8 @@ TEST_F(FunctionWithRemoteInputsTest, KernelAndDeviceFuncAsyncTest) {
       /*runner=*/nullptr,
       /*collective_executor=*/nullptr, local_device, fdef_.signature().name(),
       /*outputs_on_op_device=*/false,
-      /*allow_small_function_optimizations=*/false, ctx->RendezvousCreator(),
+      /*allow_small_function_optimizations=*/false,
+      /*int_args_and_retvals_on_device=*/false, ctx->RendezvousCreator(),
       [=]() { return op_id; }));
 
   // Instantiate MatMulFunction on remote_device.
diff --git a/tensorflow/core/distributed_runtime/integration_test/BUILD b/tensorflow/core/distributed_runtime/integration_test/BUILD
new file mode 100644
index 00000000000000..8d919b7c11c2f4
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/integration_test/BUILD
@@ -0,0 +1,141 @@
+load(
+    "//tensorflow:tensorflow.bzl",
+    "tf_cc_test",
+    "tf_cuda_cc_test",
+    "tf_cuda_library",
+)
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+    licenses = ["notice"],
+)
+
+tf_cuda_library(
+    name = "coordination_test_opkernel_registration",
+    testonly = 1,
+    srcs = ["coordination_test_opkernel_registration.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core/distributed_runtime/coordination:coordination_service_agent",
+    ],
+    alwayslink = 1,
+)
+
+tf_cuda_cc_test(
+    name = "c_api_coordination_test",
+    size = "small",
+    srcs = ["c_api_coordination_test.cc"],
+    tags = [
+        "no_cuda_asan",  # TODO(b/193450885)
+        "no_windows",  # TODO(b/207281588)
+    ],
+    deps = [
+        ":coordination_test_opkernel_registration",
+        "//tensorflow/c:c_api_experimental",
+        "//tensorflow/c:c_test_util",
+        "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_experimental",
+        "//tensorflow/c/eager:c_api_internal",
+        "//tensorflow/c/eager:c_api_test_util",
+        "//tensorflow/c/eager:tfe_tensorhandle_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/distributed_runtime/coordination:coordination_service",
+        "//tensorflow/core/distributed_runtime/coordination:coordination_service_agent",
+        "//tensorflow/core/platform:blocking_counter",
+        "//tensorflow/core/platform:env",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "c_api_session_coordination_test",
+    size = "small",
+    srcs = ["c_api_session_coordination_test.cc"],
+    tags = [
+        "no_windows",  # TODO(b/207281588)
+    ],
+    deps = [
+        ":coordination_test_opkernel_registration",
+        "//tensorflow/c:c_api",
+        "//tensorflow/c:c_api_experimental",
+        "//tensorflow/c:c_test_util",
+        "//tensorflow/c/eager:c_api_test_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/distributed_runtime/coordination:coordination_service",
+        "//tensorflow/core/distributed_runtime/coordination:coordination_service_agent",
+        "//tensorflow/core/distributed_runtime/rpc:grpc_session",
+        "//tensorflow/core/platform:env",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "c_api_multi_client_test",
+    size = "small",
+    srcs = ["c_api_multi_client_test.cc"],
+    tags = [
+        "no_windows",  # TODO(b/207281588)
+    ],
+    deps = [
+        "//tensorflow/c:c_api_experimental",
+        "//tensorflow/c:c_test_util",
+        "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_experimental",
+        "//tensorflow/c/eager:c_api_internal",
+        "//tensorflow/c/eager:c_api_test_util",
+        "//tensorflow/c/eager:tfe_context_internal",
+        "//tensorflow/c/eager:tfe_tensorhandle_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/distributed_runtime/coordination:coordination_service",
+        "//tensorflow/core/distributed_runtime/coordination:coordination_service_agent",
+        "//tensorflow/core/platform:blocking_counter",
+        "//tensorflow/core/platform:env",
+    ],
+)
+
+tf_cc_test(
+    name = "c_api_multi_client_function_test",
+    size = "small",
+    srcs = ["c_api_multi_client_function_test.cc"],
+    tags = [
+        "no_oss",  # test uses TFRT
+    ],
+    deps = [
+        "//tensorflow/c:c_api_experimental",
+        "//tensorflow/c:c_test_util",
+        "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_experimental",
+        "//tensorflow/c/eager:c_api_internal",
+        "//tensorflow/c/eager:c_api_test_util",
+        "//tensorflow/c/eager:tfe_context_internal",
+        "//tensorflow/c/eager:tfe_op_internal",
+        "//tensorflow/c/eager:tfe_tensorhandle_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/common_runtime/eager:context",
+        "//tensorflow/core/common_runtime/eager:eager_operation",
+        "//tensorflow/core/common_runtime/eager:kernel_and_device",
+        "//tensorflow/core/distributed_runtime:server_lib",
+        "//tensorflow/core/distributed_runtime/coordination:coordination_service",
+        "//tensorflow/core/distributed_runtime/coordination:coordination_service_agent",
+        "//tensorflow/core/platform:blocking_counter",
+        "//tensorflow/core/platform:env",
+    ],
+)
diff --git a/tensorflow/core/distributed_runtime/integration_test/c_api_coordination_test.cc b/tensorflow/core/distributed_runtime/integration_test/c_api_coordination_test.cc
new file mode 100644
index 00000000000000..cb4a8fd4230910
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/integration_test/c_api_coordination_test.cc
@@ -0,0 +1,545 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_experimental.h"
+#include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/c/eager/c_api_test_util.h"
+#include "tensorflow/c/eager/tfe_tensorhandle_internal.h"
+#include "tensorflow/core/distributed_runtime/coordination/coordination_service.h"
+#include "tensorflow/core/distributed_runtime/coordination/coordination_service_agent.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/platform/blocking_counter.h"
+#include "tensorflow/core/platform/casts.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+#include "tensorflow/core/protobuf/coordination_config.pb.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+
+namespace tensorflow {
+namespace {
+
+constexpr char kCoordinationServiceType[] = "standalone";
+
+void EnableCoordinationService(tensorflow::ServerDef* server_def) {
+  auto coord_config = server_def->mutable_default_session_config()
+                          ->mutable_experimental()
+                          ->mutable_coordination_config();
+  coord_config->set_service_type(kCoordinationServiceType);
+  coord_config->set_service_leader("/job:worker/replica:0/task:0");
+  coord_config->set_heartbeat_timeout_in_ms(5 * 1000);  // 5 seconds
+}
+
+string SetConfigKeyValueFn() {
+  FunctionDef fdef;
+  tensorflow::protobuf::TextFormat::ParseFromString(
+      "    signature {"
+      "      name: 'SetConfigKeyValueFn'"
+      "      input_arg {"
+      "        name: 'config_key'"
+      "        type: DT_STRING"
+      "      }"
+      "      input_arg {"
+      "        name: 'config_value'"
+      "        type: DT_STRING"
+      "      }"
+      "    }"
+      "    node_def {"
+      "      name: 'set0'"
+      "      op: 'TestSetConfigKeyValue'"
+      "      input: 'config_key'"
+      "      input: 'config_value'"
+      "    }"
+      "    ret {"
+      "    }",
+      &fdef);
+  return fdef.SerializeAsString();
+}
+
+string GetConfigKeyValueFn() {
+  FunctionDef fdef;
+  tensorflow::protobuf::TextFormat::ParseFromString(
+      "    signature {"
+      "      name: 'GetConfigKeyValueFn'"
+      "      input_arg {"
+      "        name: 'config_key'"
+      "        type: DT_STRING"
+      "      }"
+      "      output_arg {"
+      "        name: 'config_value'"
+      "        type: DT_STRING"
+      "      }"
+      "    }"
+      "    node_def {"
+      "      name: 'get0'"
+      "      op: 'TestGetConfigKeyValue'"
+      "      input: 'config_key'"
+      "    }"
+      "    ret {"
+      "      key: 'config_value'"
+      "      value: 'get0:value:0'"
+      "    }",
+      &fdef);
+  return fdef.SerializeAsString();
+}
+
+TEST(CAPI, MultiClientCoordinationService) {
+  const int cluster_size = 3;
+  tensorflow::ServerDef server_def =
+      GetMultiClientServerDef("worker", cluster_size);
+  EnableCoordinationService(&server_def);
+  auto worker_thread_fn = [&](int worker_id) {
+    tensorflow::ServerDef server_def_copy = server_def;
+    // By default, server_def has task index set to 0.
+    server_def_copy.set_task_index(worker_id);
+    std::string serialized = server_def_copy.SerializeAsString();
+
+    TF_Status* status = TF_NewStatus();
+    TFE_ContextOptions* opts = TFE_NewContextOptions();
+    TFE_ContextOptionsSetAsync(opts,
+                               static_cast<unsigned char>(/*enable=*/true));
+    TFE_ContextOptionsSetDevicePlacementPolicy(opts,
+                                               TFE_DEVICE_PLACEMENT_SILENT);
+    TFE_Context* ctx = TFE_NewContext(opts, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_DeleteContextOptions(opts);
+
+    TFE_EnableCollectiveOps(ctx, serialized.data(), serialized.size(), status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+    // Normal execution: all cluster members are online.
+    std::this_thread::sleep_for(std::chrono::seconds(5));
+    TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx);
+    TFE_ExecutorWaitForAllPendingNodes(executor, status);
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+    // Sleep for 10 seconds and run colletive ops on cluster except worker/1.
+    // Since worker/1 thread directly exits here, its heartbeat will expire,
+    // leading to UnavailableError on leader and then propagate to all other
+    // members in cluster.
+    if (worker_id != 1) {
+      // Wait for 10 seconds, during this period of time worker/1 exits and
+      // its heartbeat will expire.
+      std::this_thread::sleep_for(std::chrono::seconds(10));
+      TFE_TensorHandle* in = TestMatrixTensorHandle(ctx);
+      TFE_Op* allreduce = AllReduceOp(ctx, in, cluster_size);
+      TFE_TensorHandle* retvals[1];
+      int num_retvals = 1;
+      TFE_Execute(allreduce, &retvals[0], &num_retvals, status);
+      EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+      TFE_DeleteTensorHandle(in);
+      TFE_DeleteTensorHandle(retvals[0]);
+      TFE_DeleteOp(allreduce);
+
+      // Since we created async executor, op status is eventually reported at
+      // the sync barrier.
+      TFE_ExecutorWaitForAllPendingNodes(executor, status);
+      ASSERT_EQ(TF_UNAVAILABLE, TF_GetCode(status)) << TF_Message(status);
+    }
+    TFE_DeleteExecutor(executor);
+    TFE_DeleteContext(ctx);
+    TF_DeleteStatus(status);
+  };
+  std::thread thread_worker1([&] { worker_thread_fn(0); });
+  std::thread thread_worker2([&] { worker_thread_fn(1); });
+  std::thread thread_worker3([&] { worker_thread_fn(2); });
+  thread_worker1.join();
+  thread_worker2.join();
+  thread_worker3.join();
+}
+
+TEST(CAPI, MultiClientSetGetConfigInOp) {
+  const int cluster_size = 3;
+  tensorflow::ServerDef server_def =
+      GetMultiClientServerDef("worker", cluster_size);
+  EnableCoordinationService(&server_def);
+  BlockingCounter finish_counter(cluster_size);
+  auto worker_thread_fn = [&](int worker_id) {
+    tensorflow::ServerDef server_def_copy = server_def;
+    // By default, server_def has task index set to 0.
+    server_def_copy.set_task_index(worker_id);
+    std::string serialized = server_def_copy.SerializeAsString();
+
+    TF_Status* status = TF_NewStatus();
+    TFE_ContextOptions* opts = TFE_NewContextOptions();
+    TFE_ContextOptionsSetAsync(opts,
+                               static_cast<unsigned char>(/*enable=*/true));
+    TFE_ContextOptionsSetDevicePlacementPolicy(opts,
+                                               TFE_DEVICE_PLACEMENT_SILENT);
+    TFE_Context* ctx = TFE_NewContext(opts, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_DeleteContextOptions(opts);
+
+    TFE_EnableCollectiveOps(ctx, serialized.data(), serialized.size(), status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+    TFE_Op* set_op = TFE_NewOp(ctx, "TestSetConfigKeyValue", status);
+    CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_TensorHandle* my_key = TestScalarTensorHandle(
+        ctx, tstring(strings::StrCat("worker_", worker_id)));
+    TFE_OpAddInput(set_op, my_key, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_TensorHandle* my_val = TestScalarTensorHandle(
+        ctx, tstring(strings::StrCat("value_", worker_id)));
+    TFE_OpAddInput(set_op, my_val, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    int num_retvals = 0;
+    TFE_Execute(set_op, nullptr, &num_retvals, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_DeleteTensorHandle(my_key);
+    TFE_DeleteTensorHandle(my_val);
+    TFE_DeleteOp(set_op);
+
+    TFE_Op* get_op = TFE_NewOp(ctx, "TestGetConfigKeyValue", status);
+    CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_TensorHandle* next_key = TestScalarTensorHandle(
+        ctx,
+        tstring(strings::StrCat("worker_", (worker_id + 1) % cluster_size)));
+    TFE_OpAddInput(get_op, next_key, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+    TFE_TensorHandle* retvals[1];
+    num_retvals = 1;
+    TFE_Execute(get_op, retvals, &num_retvals, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+    TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    const tstring& next_val = *static_cast<tstring*>(TF_TensorData(t));
+    const tstring& expected_val =
+        tstring(strings::StrCat("value_", (worker_id + 1) % cluster_size));
+    EXPECT_EQ(next_val, expected_val) << strings::StrCat(
+        "Expecting value ", expected_val, ", but got ", next_val);
+
+    TFE_DeleteTensorHandle(next_key);
+    TFE_DeleteTensorHandle(retvals[0]);
+    TF_DeleteTensor(t);
+    TFE_DeleteOp(get_op);
+
+    // Since we created async executor, op status is eventually reported at
+    // the sync barrier.
+    TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx);
+    TFE_ExecutorWaitForAllPendingNodes(executor, status);
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TF_DeleteStatus(status);
+    finish_counter.DecrementCount();
+    finish_counter.Wait();
+    TFE_DeleteExecutor(executor);
+    TFE_DeleteContext(ctx);
+  };
+  std::thread thread_worker1([&] { worker_thread_fn(0); });
+  std::thread thread_worker2([&] { worker_thread_fn(1); });
+  std::thread thread_worker3([&] { worker_thread_fn(2); });
+  thread_worker1.join();
+  thread_worker2.join();
+  thread_worker3.join();
+}
+
+TEST(CAPI, MultiClientCoordinationSetGetConfigs) {
+  const int cluster_size = 3;
+  tensorflow::ServerDef server_def =
+      GetMultiClientServerDef("worker", cluster_size);
+  EnableCoordinationService(&server_def);
+  tensorflow::BlockingCounter counter1(cluster_size);
+  tensorflow::BlockingCounter counter2(cluster_size);
+  tensorflow::BlockingCounter counter3(cluster_size);
+
+  auto worker_thread_fn = [&](int worker_id) {
+    tensorflow::ServerDef server_def_copy = server_def;
+    // By default, server_def has task index set to 0.
+    server_def_copy.set_task_index(worker_id);
+    std::string serialized = server_def_copy.SerializeAsString();
+
+    TF_Status* status = TF_NewStatus();
+    TFE_ContextOptions* opts = TFE_NewContextOptions();
+    TFE_ContextOptionsSetAsync(opts,
+                               static_cast<unsigned char>(/*enable=*/true));
+    TFE_ContextOptionsSetDevicePlacementPolicy(opts,
+                                               TFE_DEVICE_PLACEMENT_SILENT);
+    TFE_Context* ctx = TFE_NewContext(opts, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_DeleteContextOptions(opts);
+
+    TFE_EnableCollectiveOps(ctx, serialized.data(), serialized.size(), status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+    // For each worker i, set (keyi, valuei)
+    const std::string& key = tensorflow::strings::StrCat("key", worker_id);
+    TFE_InsertConfigKeyValue(
+        ctx, key.c_str(),
+        tensorflow::strings::StrCat("value", worker_id).c_str(), status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    counter1.DecrementCount();
+    counter1.Wait();
+
+    const int next_id = (worker_id + 1) % cluster_size;
+    // Setting next_key errors out because it has been set by another worker
+    const std::string& next_key = tensorflow::strings::StrCat("key", next_id);
+    TFE_InsertConfigKeyValue(ctx, next_key.c_str(), "some_value", status);
+    EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status)) << TF_Message(status);
+    // Getting next_key returns the value set by another worker
+    TF_Buffer* value_buf = TF_NewBuffer();
+    TFE_GetConfigKeyValue(ctx, next_key.c_str(), value_buf, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    std::string value_str{static_cast<const char*>(value_buf->data),
+                          value_buf->length};
+    EXPECT_EQ(value_str, tensorflow::strings::StrCat("value", next_id));
+    TF_DeleteBuffer(value_buf);
+    counter2.DecrementCount();
+    counter2.Wait();
+
+    // Delete key
+    TFE_DeleteConfigKeyValue(ctx, key.c_str(), status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    counter3.DecrementCount();
+    counter3.Wait();
+
+    TFE_DeleteContext(ctx);
+    TF_DeleteStatus(status);
+  };
+  std::thread thread_worker1([&] { worker_thread_fn(0); });
+  std::thread thread_worker2([&] { worker_thread_fn(1); });
+  std::thread thread_worker3([&] { worker_thread_fn(2); });
+  thread_worker1.join();
+  thread_worker2.join();
+  thread_worker3.join();
+}
+
+TEST(CAPI, MultiClientPropagateError) {
+  const int cluster_size = 3;
+  tensorflow::ServerDef server_def =
+      GetMultiClientServerDef("worker", cluster_size);
+  EnableCoordinationService(&server_def);
+  // Barrier for initializing the cluster.
+  tensorflow::BlockingCounter counter1(cluster_size);
+  // Barrier for finishing executing operations on all workers.
+  tensorflow::BlockingCounter counter2(cluster_size);
+
+  auto worker_thread_fn = [&](int worker_id) {
+    tensorflow::ServerDef server_def_copy = server_def;
+    // By default, server_def has task index set to 0.
+    server_def_copy.set_task_index(worker_id);
+    std::string serialized = server_def_copy.SerializeAsString();
+
+    TF_Status* status = TF_NewStatus();
+    TFE_ContextOptions* opts = TFE_NewContextOptions();
+    TFE_ContextOptionsSetAsync(opts,
+                               static_cast<unsigned char>(/*enable=*/false));
+    TFE_ContextOptionsSetDevicePlacementPolicy(opts,
+                                               TFE_DEVICE_PLACEMENT_SILENT);
+    TFE_Context* ctx = TFE_NewContext(opts, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_DeleteContextOptions(opts);
+
+    TFE_EnableCollectiveOps(ctx, serialized.data(), serialized.size(), status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    counter1.DecrementCount();
+    counter1.Wait();
+
+    // Set error from worker/1
+    if (worker_id == 1) {
+      TFE_ReportErrorToCluster(ctx, TF_INVALID_ARGUMENT, "my_error", status);
+      EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    }
+
+    // Run collective on all workers. The collective will not finish because
+    // worker/1 already in error status. Check that all workers get the same
+    // error reported from running the collective ops.
+    TFE_TensorHandle* in = TestMatrixTensorHandle(ctx);
+    TFE_Op* allreduce = AllReduceOp(ctx, in, cluster_size);
+    TFE_TensorHandle* retvals[1];
+    int num_retvals = 1;
+    TFE_Execute(allreduce, &retvals[0], &num_retvals, status);
+    EXPECT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status)) << TF_Message(status);
+
+    TFE_DeleteTensorHandle(in);
+    TFE_DeleteTensorHandle(retvals[0]);
+    TFE_DeleteOp(allreduce);
+    counter2.DecrementCount();
+    counter2.Wait();
+
+    TFE_DeleteContext(ctx);
+    TF_DeleteStatus(status);
+  };
+  std::thread thread_worker1([&] { worker_thread_fn(0); });
+  std::thread thread_worker2([&] { worker_thread_fn(1); });
+  std::thread thread_worker3([&] { worker_thread_fn(2); });
+  thread_worker1.join();
+  thread_worker2.join();
+  thread_worker3.join();
+}
+
+TEST(CAPI, SingleClientSetGetConfigInOp) {
+  tensorflow::ServerDef server_def = GetServerDef("worker", 3);
+  const char task0_name[] = "/job:worker/replica:0/task:0/device:CPU:0";
+  const char task1_name[] = "/job:worker/replica:0/task:1/device:CPU:0";
+  const char task2_name[] = "/job:worker/replica:0/task:2/device:CPU:0";
+
+  EnableCoordinationService(&server_def);
+  // Add localhost job for the remote client task
+  auto cluster = server_def.mutable_cluster();
+  auto client_job = cluster->add_job();
+  client_job->set_name("localhost");
+  const int client_port = tensorflow::testing::PickUnusedPortOrDie();
+  client_job->mutable_tasks()->insert(
+      {0, strings::StrCat("localhost:", client_port)});
+  server_def.set_job_name("localhost");
+  server_def.mutable_default_session_config()
+      ->mutable_experimental()
+      ->mutable_coordination_config()
+      ->set_service_leader(task0_name);
+  string serialized = server_def.SerializeAsString();
+
+  ServerFactory* factory;
+  ASSERT_TRUE(ServerFactory::GetFactory(server_def, &factory).ok());
+  server_def.set_job_name("worker");
+  server_def.set_task_index(0);
+  std::unique_ptr<tensorflow::ServerInterface> w0;
+  ASSERT_TRUE(
+      factory->NewServer(server_def, ServerFactory::Options(), &w0).ok());
+  ASSERT_TRUE(w0->Start().ok());
+  server_def.set_task_index(1);
+  std::unique_ptr<tensorflow::ServerInterface> w1;
+  ASSERT_TRUE(
+      factory->NewServer(server_def, ServerFactory::Options(), &w1).ok());
+  ASSERT_TRUE(w1->Start().ok());
+  server_def.set_task_index(2);
+  std::unique_ptr<tensorflow::ServerInterface> w2;
+  ASSERT_TRUE(
+      factory->NewServer(server_def, ServerFactory::Options(), &w2).ok());
+  ASSERT_TRUE(w2->Start().ok());
+
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(/*enable=*/true));
+  TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT);
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status);
+  EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+
+  TFE_Op* set_op = TFE_NewOp(ctx, "TestSetConfigKeyValue", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_TensorHandle* set_key = TestScalarTensorHandle(ctx, tstring("test_key"));
+  TFE_OpAddInput(set_op, set_key, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_TensorHandle* set_val = TestScalarTensorHandle(ctx, tstring("test_val"));
+  TFE_OpAddInput(set_op, set_val, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  // Run set op from task1
+  TFE_OpSetDevice(set_op, task1_name, status);
+  ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+  int num_retvals = 0;
+  TFE_Execute(set_op, nullptr, &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteTensorHandle(set_key);
+  TFE_DeleteTensorHandle(set_val);
+  TFE_DeleteOp(set_op);
+
+  TFE_Op* get_op = TFE_NewOp(ctx, "TestGetConfigKeyValue", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_TensorHandle* get_key = TestScalarTensorHandle(ctx, tstring("test_key"));
+  TFE_OpAddInput(get_op, get_key, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+  TFE_TensorHandle* retvals[1];
+  num_retvals = 1;
+  // Run get op from task2
+  TFE_OpSetDevice(get_op, task2_name, status);
+  TFE_Execute(get_op, retvals, &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  const tstring& get_val = *static_cast<tstring*>(TF_TensorData(t));
+  EXPECT_EQ(get_val, "test_val")
+      << strings::StrCat("Expecting value test_val but got ", get_val);
+  TFE_DeleteTensorHandle(get_key);
+  TFE_DeleteTensorHandle(retvals[0]);
+  TF_DeleteTensor(t);
+  TFE_DeleteOp(get_op);
+
+  const string& set_fdef = SetConfigKeyValueFn();
+  TFE_ContextAddFunctionDef(ctx, set_fdef.data(), set_fdef.size(), status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_Op* set_fn = TFE_NewOp(ctx, "SetConfigKeyValueFn", status);
+  ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+  set_key = TestScalarTensorHandle(ctx, tstring("test_fn_key"));
+  TFE_OpAddInput(set_fn, set_key, status);
+  ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+  set_val = TestScalarTensorHandle(ctx, tstring("test_fn_val"));
+  TFE_OpAddInput(set_fn, set_val, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  // Run set fn on task2
+  TFE_OpSetDevice(set_fn, task2_name, status);
+  ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+  num_retvals = 0;
+  TFE_Execute(set_fn, nullptr, &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteTensorHandle(set_key);
+  TFE_DeleteTensorHandle(set_val);
+  TFE_DeleteOp(set_fn);
+
+  const string& get_fdef = GetConfigKeyValueFn();
+  TFE_ContextAddFunctionDef(ctx, get_fdef.data(), get_fdef.size(), status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_Op* get_fn = TFE_NewOp(ctx, "GetConfigKeyValueFn", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  get_key = TestScalarTensorHandle(ctx, tstring("test_fn_key"));
+  TFE_OpAddInput(get_fn, get_key, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+  TFE_TensorHandle* fn_retvals[1];
+  num_retvals = 1;
+  // Run get fn on task1
+  TFE_OpSetDevice(get_fn, task2_name, status);
+  TFE_Execute(get_fn, fn_retvals, &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  t = TFE_TensorHandleResolve(fn_retvals[0], status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  const tstring& get_fn_val = *static_cast<tstring*>(TF_TensorData(t));
+  EXPECT_EQ(get_fn_val, "test_fn_val")
+      << strings::StrCat("Expecting value test_fn_val but got ", get_fn_val);
+  TFE_DeleteTensorHandle(get_key);
+  TFE_DeleteTensorHandle(fn_retvals[0]);
+  TF_DeleteTensor(t);
+  TFE_DeleteOp(get_fn);
+
+  // Since we created async executor, op status is eventually reported at
+  // the sync barrier.
+  TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx);
+  TFE_ExecutorWaitForAllPendingNodes(executor, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+  TFE_DeleteExecutor(executor);
+  TFE_DeleteContext(ctx);
+
+  // Grpc servers do not support clean down.
+  w0.release();
+  w1.release();
+  w2.release();
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_function_test.cc b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_function_test.cc
new file mode 100644
index 00000000000000..0514e3cb2eb2a4
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_function_test.cc
@@ -0,0 +1,384 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_experimental.h"
+#include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/c/eager/c_api_test_util.h"
+#include "tensorflow/c/eager/tfe_context_internal.h"
+#include "tensorflow/c/eager/tfe_op_internal.h"
+#include "tensorflow/c/eager/tfe_tensorhandle_internal.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/eager_operation.h"
+#include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/platform/blocking_counter.h"
+#include "tensorflow/core/platform/casts.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+#include "tensorflow/core/protobuf/coordination_config.pb.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+
+namespace {
+
+std::string SendFunction(const std::string& send_device,
+                         const std::string& recv_device,
+                         const tensorflow::int64 send_device_incarnation) {
+  tensorflow::FunctionDef def;
+  CHECK(tensorflow::protobuf::TextFormat::ParseFromString(
+      absl::StrCat("    signature {"
+                   "      name: 'SendFunction'"
+                   "      input_arg {"
+                   "        name: 'in'"
+                   "        type: DT_FLOAT"
+                   "      }"
+                   "      control_output: 'send_tensor'"
+                   "    }"
+                   "    node_def {"
+                   "      name: 'send'"
+                   "      op: '_Send'"
+                   "      input: 'in'"
+                   "      device: '",
+                   send_device, "'",
+                   "      attr {"
+                   "        key: 'T'"
+                   "        value {"
+                   "          type: DT_FLOAT"
+                   "        }"
+                   "      }"
+                   "      attr {"
+                   "        key: 'tensor_name'"
+                   "        value {"
+                   "          s: 'dummy'"
+                   "        }"
+                   "      }"
+                   "      attr {"
+                   "        key: 'send_device'"
+                   "        value {"
+                   "          s: '",
+                   send_device, "'",
+                   "        }"
+                   "      }"
+                   "      attr {"
+                   "        key: 'recv_device'"
+                   "        value {"
+                   "          s: '",
+                   recv_device, "'",
+                   "        }"
+                   "      }"
+                   "      attr {"
+                   "        key: 'send_device_incarnation'"
+                   "        value {"
+                   "          i: ",
+                   absl::StrCat(send_device_incarnation),
+                   "        }"
+                   "      }"
+                   "    }"
+                   "    control_ret {"
+                   "      key: 'send_tensor'"
+                   "      value: 'send'"
+                   "    }"),
+      &def));
+  return def.SerializeAsString();
+}
+
+std::string RecvFunction(const std::string& send_device,
+                         const std::string& recv_device,
+                         const tensorflow::int64 send_device_incarnation) {
+  tensorflow::FunctionDef def;
+  CHECK(tensorflow::protobuf::TextFormat::ParseFromString(
+      absl::StrCat("    signature {"
+                   "      name: 'RecvFunction'"
+                   "      output_arg {"
+                   "        name: 'out'"
+                   "        type: DT_FLOAT"
+                   "      }"
+                   "    }"
+                   "    node_def {"
+                   "      name: 'recv'"
+                   "      op: '_Recv'"
+                   "      device: '",
+                   recv_device, "'",
+                   "      attr {"
+                   "        key: 'tensor_type'"
+                   "        value {"
+                   "          type: DT_FLOAT"
+                   "        }"
+                   "      }"
+                   "      attr {"
+                   "        key: 'tensor_name'"
+                   "        value {"
+                   "          s: 'dummy'"
+                   "        }"
+                   "      }"
+                   "      attr {"
+                   "        key: 'send_device'"
+                   "        value {"
+                   "          s: '",
+                   send_device, "'",
+                   "        }"
+                   "      }"
+                   "      attr {"
+                   "        key: 'recv_device'"
+                   "        value {"
+                   "          s: '",
+                   recv_device, "'",
+                   "        }"
+                   "      }"
+                   "      attr {"
+                   "        key: 'send_device_incarnation'"
+                   "        value {"
+                   "          i: ",
+                   absl::StrCat(send_device_incarnation),
+                   "        }"
+                   "      }"
+                   "    }"
+                   "    ret {"
+                   "      key: 'out'"
+                   "      value: 'recv:tensor'"
+                   "    }"),
+      &def));
+  return def.SerializeAsString();
+}
+
+TFE_TensorHandle* DummyTensorHandleWithValue(TFE_Context* ctx, float v) {
+  // Initialize matrix values.
+  int64_t dims[] = {2, 2};
+  float data[4];
+  for (int i = 0; i < 4; i++) {
+    data[i] = v * (i + 1);
+  }
+
+  return TestTensorHandleWithDimsFloat(ctx, data, &dims[0],
+                                       sizeof(dims) / sizeof(int64_t));
+}
+
+struct MultiClientSendRecvTestParams {
+  std::string test_name;
+  bool use_tfrt = false;
+  uint num_steps = 1;
+  uint delay_recv_sec = 0;
+  uint delay_send_sec = 0;
+};
+
+using MultiClientSendRecvTest =
+    testing::TestWithParam<MultiClientSendRecvTestParams>;
+
+TEST_P(MultiClientSendRecvTest, TestMultiClientSendRecv) {
+  const MultiClientSendRecvTestParams& params = GetParam();
+  // Use a mutex to enforce a serialized operation between the two
+  // worker-threads since some of their operations involve updating the global
+  // singleton instances (in TFRT scenario), which otherwise would cause a data
+  // race.
+  tensorflow::mutex mu;
+
+  const int cluster_size = 2;
+  tensorflow::ServerDef server_def =
+      GetMultiClientServerDef("worker", cluster_size);
+
+  // Enable coordination service for propagating remote device attributess
+  auto* coord_config = server_def.mutable_default_session_config()
+                           ->mutable_experimental()
+                           ->mutable_coordination_config();
+  coord_config->set_service_type("standalone");
+  coord_config->set_service_leader("/job:worker/replica:0/task:0");
+
+  // The blocking counter makes sure that worker/0 thread (leader that starts
+  // the coordination service) does not exit early while other workers are still
+  // interacting with the coordination service.
+  tensorflow::BlockingCounter counter(cluster_size);
+
+  auto worker_thread_fn = [&](int worker_id) {
+    tensorflow::ServerDef server_def_copy = server_def;
+    server_def_copy.set_task_index(worker_id);
+    std::string serialized = server_def_copy.SerializeAsString();
+
+    TF_Status* status = TF_NewStatus();
+    TFE_ContextOptions* context_opts = TFE_NewContextOptions();
+    TFE_ContextOptionsSetAsync(context_opts,
+                               static_cast<unsigned char>(/*enable=*/true));
+    TFE_ContextOptionsSetDevicePlacementPolicy(context_opts,
+                                               TFE_DEVICE_PLACEMENT_SILENT);
+    // use-tfrt flag.
+    context_opts->use_tfrt = params.use_tfrt;
+    tensorflow::SessionOptions session_opts;
+    session_opts.config = server_def_copy.default_session_config();
+    context_opts->session_options.options = session_opts;
+
+    TFE_Context* ctx;
+    {
+      tensorflow::mutex_lock l(mu);
+      ctx = TFE_NewContext(context_opts, status);
+    }
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_DeleteContextOptions(context_opts);
+
+    TFE_EnableCollectiveOps(ctx, serialized.data(), serialized.size(), status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+    const std::string& send_device =
+        "/job:worker/replica:0/task:0/device:CPU:0";
+    const std::string& recv_device =
+        "/job:worker/replica:0/task:1/device:CPU:0";
+
+    std::vector<tensorflow::DeviceAttributes> device_attrs;
+    tensorflow::unwrap(ctx)->ListDevices(&device_attrs);
+    tensorflow::uint64 send_device_incarnation = 0;
+    for (const auto& device_attr : device_attrs) {
+      if (device_attr.name() == send_device) {
+        send_device_incarnation = device_attr.incarnation();
+        break;
+      }
+    }
+
+    if (worker_id == 0) {
+      // Sender worker.
+      tensorflow::Env::Default()->SleepForMicroseconds(params.delay_send_sec *
+                                                       1000);
+
+      const std::string& fdef =
+          SendFunction(send_device, recv_device, send_device_incarnation);
+      TFE_ContextAddFunctionDef(ctx, fdef.data(), fdef.size(), status);
+      EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+      // Run multiple steps.
+      for (int s = 1; s <= params.num_steps; s++) {
+        TFE_Op* send_func = TFE_NewOp(ctx, "SendFunction", status);
+        EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+
+        if (params.use_tfrt) {
+          // TODO (@chienchunh): Add support for step id configuration in TFRT.
+          EXPECT_TRUE(tensorflow::unwrap(send_func)
+                          ->Reset("SendFunction", send_device.c_str())
+                          .ok());
+        } else {
+          tensorflow::EagerOperation* op =
+              tensorflow::OperationFromInterface(tensorflow::unwrap(send_func));
+          EXPECT_TRUE(op->Reset("SendFunction", send_device.c_str(),
+                                /*remote=*/false, /*executor=*/nullptr,
+                                tensorflow::EagerFunctionParams{/*op_id=*/s,
+                                                                /*step_id=*/s})
+                          .ok());
+        }
+
+        TFE_TensorHandle* in = DummyTensorHandleWithValue(ctx, 1.0f * s);
+        TFE_OpAddInput(send_func, in, status);
+        EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+        int num_retvals = 0;
+        {
+          tensorflow::mutex_lock l(mu);
+          TFE_Execute(send_func, nullptr, &num_retvals, status);
+        }
+        EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+        TFE_DeleteOp(send_func);
+        TFE_DeleteTensorHandle(in);
+      }
+    } else {
+      // Receiver worker.
+      tensorflow::Env::Default()->SleepForMicroseconds(params.delay_recv_sec *
+                                                       1000);
+
+      const std::string& fdef =
+          RecvFunction(send_device, recv_device, send_device_incarnation);
+      TFE_ContextAddFunctionDef(ctx, fdef.data(), fdef.size(), status);
+      EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+      // Run multiple steps.
+      for (int s = 1; s <= params.num_steps; s++) {
+        TFE_Op* recv_func = TFE_NewOp(ctx, "RecvFunction", status);
+        EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+
+        if (params.use_tfrt) {
+          // TODO (@chienchunh): Add support for step id configuration in TFRT.
+          EXPECT_TRUE(tensorflow::unwrap(recv_func)
+                          ->Reset("RecvFunction", recv_device.c_str())
+                          .ok());
+        } else {
+          tensorflow::EagerOperation* op =
+              tensorflow::OperationFromInterface(tensorflow::unwrap(recv_func));
+          EXPECT_TRUE(op->Reset("RecvFunction", recv_device.c_str(),
+                                /*remote=*/false, /*executor=*/nullptr,
+                                tensorflow::EagerFunctionParams{/*op_id=*/s,
+                                                                /*step_id=*/s})
+                          .ok());
+        }
+
+        TFE_TensorHandle* retvals[1];
+        int num_retvals = 1;
+        {
+          tensorflow::mutex_lock l(mu);
+          TFE_Execute(recv_func, &retvals[0], &num_retvals, status);
+        }
+        EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+        TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+        TFE_DeleteOp(recv_func);
+        TFE_DeleteTensorHandle(retvals[0]);
+
+        float result[4] = {0};
+        EXPECT_EQ(sizeof(result), TF_TensorByteSize(t));
+        memcpy(&result[0], TF_TensorData(t), TF_TensorByteSize(t));
+        TF_DeleteTensor(t);
+        for (int i = 0; i < 4; i++) {
+          EXPECT_EQ(result[i], 1.0 * s * (i + 1));
+        }
+      }
+    }
+
+    // To make sure the sender won't delete the data it sent before the receiver
+    // retrieves it, we need to do the following steps:
+    // 1. Since we created async EagerContext, we need to force each worker to
+    //    wait until all pening operations finish before deleting the context.
+    // 2. In addition, use the blocking counter to notify the 2 workers when
+    //    it is safe to clean up all the data.
+    TFE_ContextAsyncWait(ctx, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    counter.DecrementCount();
+    counter.Wait();
+
+    {
+      tensorflow::mutex_lock l(mu);
+      TFE_DeleteContext(ctx);
+    }
+    TF_DeleteStatus(status);
+  };
+
+  std::thread thread_worker1([&] { worker_thread_fn(0); });
+  std::thread thread_worker2([&] { worker_thread_fn(1); });
+
+  thread_worker1.join();
+  thread_worker2.join();
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    MultiClientSendRecvTests, MultiClientSendRecvTest,
+    testing::ValuesIn<MultiClientSendRecvTestParams>({
+        {"MultiClientSingleStepFunction", false, 1, 0, 0},
+        {"MultiClientMultiStepFunction", false, 3, 0, 0},
+        {"MultiClientMultiStepFunctionWithRecvDelay", false, 5, 2, 0},
+        {"MultiClientMultiStepFunctionWithSendDelay", false, 5, 0, 2},
+        {"MultiClientSingleStepFunctionTfrt", true, 1, 0, 0},
+        {"MultiClientMultiStepFunctionTfrt", true, 3, 0, 0},
+        {"MultiClientMultiStepFunctionWithRecvDelayTfrt", true, 5, 2, 0},
+        {"MultiClientMultiStepFunctionWithSendDelayTfrt", true, 5, 0, 2},
+    }),
+    [](const testing::TestParamInfo<MultiClientSendRecvTest::ParamType>& info) {
+      return info.param.test_name;
+    });
+
+}  // namespace
diff --git a/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_test.cc b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_test.cc
new file mode 100644
index 00000000000000..db844b39f56791
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/integration_test/c_api_multi_client_test.cc
@@ -0,0 +1,222 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_experimental.h"
+#include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/c/eager/c_api_test_util.h"
+#include "tensorflow/c/eager/tfe_context_internal.h"
+#include "tensorflow/c/eager/tfe_tensorhandle_internal.h"
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
+#include "tensorflow/core/platform/blocking_counter.h"
+#include "tensorflow/core/platform/casts.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+#include "tensorflow/core/protobuf/coordination_config.pb.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+
+namespace {
+
+void StartWorkers(int cluster_size,
+                  std::function<void(TFE_Context* ctx, TF_Status* status,
+                                     int worker_id, int cluster_size)>
+                      fn) {
+  tensorflow::ServerDef server_def =
+      GetMultiClientServerDef("worker", cluster_size, /*num_virtual_gpus=*/2);
+  // Enable coordination service for propagating remote device attributess
+  auto* config = server_def.mutable_default_session_config()
+                     ->mutable_experimental()
+                     ->mutable_coordination_config();
+  config->set_service_type("standalone");
+  config->set_service_leader("/job:worker/replica:0/task:0");
+
+  // The blocking counter makes sure that worker/0 thread (leader that starts
+  // the coordination service) does not exit early while other workers are still
+  // interacting with the coordination service.
+  tensorflow::BlockingCounter counter(cluster_size);
+  auto worker_thread_fn = [&](int worker_id) {
+    tensorflow::ServerDef server_def_copy = server_def;
+    // By default, server_def has task index set to 0.
+    server_def_copy.set_task_index(worker_id);
+    std::string serialized = server_def_copy.SerializeAsString();
+
+    TF_Status* status = TF_NewStatus();
+    TFE_ContextOptions* opts = TFE_NewContextOptions();
+    TFE_ContextOptionsSetAsync(opts,
+                               static_cast<unsigned char>(/*enable=*/true));
+    TFE_ContextOptionsSetDevicePlacementPolicy(opts,
+                                               TFE_DEVICE_PLACEMENT_SILENT);
+
+    tensorflow::SessionOptions options;
+    options.config = server_def_copy.default_session_config();
+    opts->session_options.options = options;
+    TFE_Context* ctx = TFE_NewContext(opts, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_DeleteContextOptions(opts);
+
+    TFE_EnableCollectiveOps(ctx, serialized.data(), serialized.size(), status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+    fn(ctx, status, worker_id, cluster_size);
+    counter.DecrementCount();
+    counter.Wait();
+
+    // Since we created an async EagerContext, wait for all pending operations
+    // to finish before deleting the context.
+    TFE_Executor* executor = TFE_ContextGetExecutorForThread(ctx);
+    TFE_ExecutorWaitForAllPendingNodes(executor, status);
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    TFE_DeleteExecutor(executor);
+
+    TFE_DeleteContext(ctx);
+    TF_DeleteStatus(status);
+  };
+
+  std::vector<std::thread> worker_threads;
+  for (int i = 0; i < cluster_size; ++i) {
+    worker_threads.emplace_back([i, worker_thread_fn] { worker_thread_fn(i); });
+  }
+  for (auto i = 0; i < cluster_size; ++i) {
+    worker_threads[i].join();
+  }
+}
+
+TEST(CAPI, MultiClientCollectiveOps) {
+  auto fn = [](TFE_Context* ctx, TF_Status* status, int worker_id,
+               int cluster_size) {
+    TFE_TensorHandle* in = TestMatrixTensorHandle(ctx);
+    TFE_Op* allreduce = AllReduceOp(ctx, in, cluster_size);
+    TFE_TensorHandle* retvals[1];
+    int num_retvals = 1;
+    TFE_Execute(allreduce, &retvals[0], &num_retvals, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+    TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+    ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+    float result[4] = {0};
+    EXPECT_EQ(sizeof(result), TF_TensorByteSize(t));
+    memcpy(&result[0], TF_TensorData(t), TF_TensorByteSize(t));
+    TF_DeleteTensor(t);
+    EXPECT_EQ(2.0, result[0]);
+    EXPECT_EQ(4.0, result[1]);
+    EXPECT_EQ(6.0, result[2]);
+    EXPECT_EQ(8.0, result[3]);
+
+    TFE_DeleteTensorHandle(in);
+    TFE_DeleteTensorHandle(retvals[0]);
+    TFE_DeleteOp(allreduce);
+  };
+  StartWorkers(2, fn);
+}
+
+TEST(CAPI, MultiClientRemoteDevices) {
+  auto fn = [](TFE_Context* ctx, TF_Status* status, int worker_id,
+               int cluster_size) {
+    std::vector<tensorflow::DeviceAttributes> device_attrs;
+    tensorflow::EagerContext* context =
+        tensorflow::ContextFromInterface(tensorflow::unwrap(ctx));
+    context->ListDevices(&device_attrs);
+    std::vector<std::string> device_names;
+    for (const auto& device_attr : device_attrs) {
+      device_names.push_back(device_attr.name());
+    }
+
+    bool has_gpu_devices = false;
+    std::string unused_gpu_device_name;
+    if (GetDeviceName(ctx, &unused_gpu_device_name, "GPU")) {
+      has_gpu_devices = true;
+    }
+
+    std::vector<std::string> expected_device_names;
+    for (int i = 0; i < cluster_size; ++i) {
+      expected_device_names.push_back(tensorflow::strings::StrCat(
+          "/job:worker/replica:0/task:", i, "/device:CPU:0"));
+      if (has_gpu_devices) {
+        expected_device_names.push_back(tensorflow::strings::StrCat(
+            "/job:worker/replica:0/task:", i, "/device:GPU:0"));
+        expected_device_names.push_back(tensorflow::strings::StrCat(
+            "/job:worker/replica:0/task:", i, "/device:GPU:1"));
+      }
+    }
+
+    EXPECT_THAT(device_names,
+                testing::UnorderedElementsAreArray(expected_device_names));
+  };
+  StartWorkers(3, fn);
+}
+
+TEST(CAPI, MultiClientSendRecv) {
+  auto fn = [](TFE_Context* ctx, TF_Status* status, int worker_id,
+               int cluster_size) {
+    // Test with GPUs if present (based on test configuration) and CPUs
+    // otherwise.
+    auto send_device = "/job:worker/replica:0/task:0/device:CPU:0";
+    auto recv_device = "/job:worker/replica:0/task:1/device:CPU:0";
+    std::string unused_gpu_device_name;
+    if (GetDeviceName(ctx, &unused_gpu_device_name, "GPU")) {
+      send_device = "/job:worker/replica:0/task:0/device:GPU:0";
+      recv_device = "/job:worker/replica:0/task:1/device:GPU:0";
+    }
+
+    std::vector<tensorflow::DeviceAttributes> device_attrs;
+    tensorflow::EagerContext* context =
+        tensorflow::ContextFromInterface(tensorflow::unwrap(ctx));
+    context->ListDevices(&device_attrs);
+
+    tensorflow::uint64 send_device_incarnation = 0;
+    for (const auto& device_attr : device_attrs) {
+      if (device_attr.name() == send_device) {
+        send_device_incarnation = device_attr.incarnation();
+        break;
+      }
+    }
+
+    if (worker_id == 0) {
+      TFE_TensorHandle* in = TestMatrixTensorHandle(ctx);
+      const std::string& op_name =
+          tensorflow::str_util::StrContains(send_device, "GPU") ? "Send"
+                                                                : "_HostSend";
+      TFE_Op* sendop = SendOp(ctx, in, op_name, send_device, recv_device,
+                              send_device_incarnation);
+      TFE_TensorHandle* retvals[1];
+      int num_retvals = 1;
+      TFE_Execute(sendop, &retvals[0], &num_retvals, status);
+      EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+      TFE_DeleteOp(sendop);
+      TFE_DeleteTensorHandle(in);
+    } else {
+      const std::string& op_name =
+          tensorflow::str_util::StrContains(send_device, "GPU") ? "Recv"
+                                                                : "_HostRecv";
+      TFE_Op* recvop = RecvOp(ctx, op_name, send_device, recv_device,
+                              send_device_incarnation);
+      TFE_TensorHandle* retvals[1];
+      int num_retvals = 1;
+      TFE_Execute(recvop, &retvals[0], &num_retvals, status);
+      TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+      TF_DeleteTensor(t);
+      EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+      TFE_DeleteTensorHandle(retvals[0]);
+      TFE_DeleteOp(recvop);
+    }
+  };
+  StartWorkers(2, fn);
+}
+
+}  // namespace
diff --git a/tensorflow/core/distributed_runtime/integration_test/c_api_session_coordination_test.cc b/tensorflow/core/distributed_runtime/integration_test/c_api_session_coordination_test.cc
new file mode 100644
index 00000000000000..a9164c37dbb794
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/integration_test/c_api_session_coordination_test.cc
@@ -0,0 +1,189 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/c_api_experimental.h"
+#include "tensorflow/c/c_test_util.h"
+#include "tensorflow/c/eager/c_api_test_util.h"
+#include "tensorflow/c/tf_datatype.h"
+#include "tensorflow/core/distributed_runtime/coordination/coordination_service.h"
+#include "tensorflow/core/distributed_runtime/coordination/coordination_service_agent.h"
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/platform/casts.h"
+#include "tensorflow/core/platform/protobuf.h"
+#include "tensorflow/core/platform/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/cluster.pb.h"
+#include "tensorflow/core/protobuf/config.pb.h"
+#include "tensorflow/core/protobuf/coordination_config.pb.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+#include "tensorflow/core/protobuf/tensorflow_server.pb.h"
+
+namespace tensorflow {
+namespace {
+
+constexpr char kCoordinationServiceType[] = "standalone";
+
+void EnableCoordinationService(tensorflow::ServerDef* server_def) {
+  auto coord_config = server_def->mutable_default_session_config()
+                          ->mutable_experimental()
+                          ->mutable_coordination_config();
+  coord_config->set_service_type(kCoordinationServiceType);
+  coord_config->set_service_leader("/job:worker/replica:0/task:0");
+}
+
+struct SessionParams {
+  std::string test_name;
+  bool enable_clusterspec_propagation = false;
+};
+
+using SingleClientTest = ::testing::TestWithParam<SessionParams>;
+
+TEST_P(SingleClientTest, SetGetConfigInOpTest) {
+  bool enable_clusterspec_propagation =
+      GetParam().enable_clusterspec_propagation;
+  const int num_workers = 3;
+  std::string job_name = "worker";
+  // NOTE(b/37868888#comment4): Set a different initial name for the job due to
+  // the limitation in ClusterSpec propagation.
+  if (enable_clusterspec_propagation) {
+    job_name = "server_init";
+  }
+  tensorflow::ServerDef server_def = GetServerDef(job_name, num_workers);
+  const char task0_name[] = "/job:worker/replica:0/task:0/device:CPU:0";
+  const char task1_name[] = "/job:worker/replica:0/task:1/device:CPU:0";
+  const char task2_name[] = "/job:worker/replica:0/task:2/device:CPU:0";
+  const std::string& master =
+      strings::StrCat("grpc://", server_def.cluster().job(0).tasks().at(0));
+
+  EnableCoordinationService(&server_def);
+  server_def.mutable_default_session_config()
+      ->mutable_experimental()
+      ->mutable_coordination_config()
+      ->set_service_leader(task0_name);
+
+  // Start server instances for the workers.
+  ServerFactory* factory;
+  ASSERT_TRUE(ServerFactory::GetFactory(server_def, &factory).ok());
+  std::unique_ptr<tensorflow::ServerInterface> workers[3];
+  for (int worker_id = 0; worker_id < num_workers; worker_id++) {
+    server_def.set_task_index(worker_id);
+    ASSERT_TRUE(factory
+                    ->NewServer(server_def, ServerFactory::Options(),
+                                &workers[worker_id])
+                    .ok());
+    ASSERT_TRUE(workers[worker_id]->Start().ok());
+  }
+
+  // Build graph with a TestSetConfigKeyValue op on worker/1, and a
+  // TestGetConfigKeyValue on worker/2.
+  TF_Status* status = TF_NewStatus();
+  TF_Graph* graph = TF_NewGraph();
+  TF_Operation* feed_key = Placeholder(graph, status, "key", TF_STRING, {});
+  ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+  TF_Operation* feed_val = Placeholder(graph, status, "val", TF_STRING, {});
+  ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+  TF_OperationDescription* set_desc =
+      TF_NewOperation(graph, "TestSetConfigKeyValue", "set");
+  TF_AddInput(set_desc, {feed_key, 0});
+  TF_AddInput(set_desc, {feed_val, 0});
+  TF_SetDevice(set_desc, task1_name);
+  TF_Operation* set_op = TF_FinishOperation(set_desc, status);
+  ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+
+  TF_OperationDescription* get_desc =
+      TF_NewOperation(graph, "TestGetConfigKeyValue", "get");
+  TF_Output get_input = {feed_key, 0};
+  TF_AddInput(get_desc, get_input);
+  // Add control dependency to make sure "get" runs after "set"
+  TF_AddControlInput(get_desc, set_op);
+  TF_SetDevice(get_desc, task2_name);
+  TF_Operation* get_op = TF_FinishOperation(get_desc, status);
+  ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+
+  // Prepare feeds and fetches for running the graph
+  const char test_key[] = "test_key";
+  const char test_val[] = "test_val";
+  TF_TString tstr_key;
+  TF_TString_Init(&tstr_key);
+  TF_TString_Copy(&tstr_key, test_key, sizeof(test_key) - 1);
+  TF_TString tstr_val;
+  TF_TString_Init(&tstr_val);
+  TF_TString_Copy(&tstr_val, test_val, sizeof(test_val) - 1);
+  TF_Output inputs[2] = {{feed_key, 0}, {feed_val, 0}};
+  TF_Tensor* input_values[2];
+  auto deallocator = [](void* data, size_t len, void* arg) {};
+  input_values[0] = TF_NewTensor(TF_STRING, nullptr, 0, &tstr_key,
+                                 sizeof(tstr_key), deallocator, nullptr);
+  input_values[1] = TF_NewTensor(TF_STRING, nullptr, 0, &tstr_val,
+                                 sizeof(tstr_val), deallocator, nullptr);
+  TF_Output outputs[1] = {{get_op, 0}};
+  TF_Tensor* output_values[1] = {nullptr};
+
+  // Create session to run the graph
+  TF_SessionOptions* opts = TF_NewSessionOptions();
+  TF_SetTarget(opts, master.c_str());
+  ConfigProto configs = server_def.default_session_config();
+  if (enable_clusterspec_propagation) {
+    // NOTE(b/37868888#comment4): Reset name of the job due to the limitation in
+    // ClusterSpec propagation.
+    server_def.mutable_cluster()->mutable_job(0)->set_name("worker");
+    *configs.mutable_cluster_def() = server_def.cluster();
+  }
+  const std::string& serialized = configs.SerializeAsString();
+  TF_SetConfig(opts, serialized.data(), serialized.size(), status);
+  ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+  TF_Session* sess = TF_NewSession(graph, opts, status);
+  TF_SessionRun(sess, nullptr, inputs, input_values, 2, outputs, output_values,
+                1, nullptr, 0, nullptr, status);
+  // Verify that the test value was set and extracted from the coordination
+  // service correctly.
+  EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status);
+  const tstring& output_val =
+      *static_cast<tstring*>(TF_TensorData(output_values[0]));
+  EXPECT_EQ(output_val, test_val);
+
+  // Clean up.
+  TF_CloseSession(sess, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteTensor(input_values[0]);
+  TF_DeleteTensor(input_values[1]);
+  TF_DeleteTensor(output_values[0]);
+  TF_DeleteSessionOptions(opts);
+  TF_DeleteSession(sess, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteGraph(graph);
+  TF_DeleteStatus(status);
+
+  // Grpc servers do not support clean down.
+  for (int worker_id = 0; worker_id < num_workers; worker_id++) {
+    workers[worker_id].release();
+  }
+}
+
+INSTANTIATE_TEST_SUITE_P(
+    SessionCoordinationTests, SingleClientTest,
+    ::testing::ValuesIn<SessionParams>({
+        {"EnableClusterSpecPropagation", true},
+        {"DisableClusterSpecPropagation", false},
+    }),
+    [](const ::testing::TestParamInfo<SingleClientTest::ParamType>& info) {
+      return info.param.test_name;
+    });
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/integration_test/coordination_test_opkernel_registration.cc b/tensorflow/core/distributed_runtime/integration_test/coordination_test_opkernel_registration.cc
new file mode 100644
index 00000000000000..ccc02249477171
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/integration_test/coordination_test_opkernel_registration.cc
@@ -0,0 +1,104 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/coordination/coordination_service_agent.h"
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+REGISTER_OP("TestSetConfigKeyValue")
+    .Input("key: string")
+    .Input("value: string")
+    .SetIsStateful()  // side-effective op
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
+    .Doc(R"doc(
+Test op setting distributed configs using coordination service.
+)doc");
+
+// Kernel that sets distributed configures using coordination service.
+class TestSetConfigKeyValueOp : public OpKernel {
+ public:
+  explicit TestSetConfigKeyValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* key_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("key", &key_tensor));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(key_tensor->shape()),
+                errors::InvalidArgument("Key must be scalar."));
+    const string& config_key = key_tensor->scalar<tstring>()();
+    const Tensor* val_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("value", &val_tensor));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(key_tensor->shape()),
+                errors::InvalidArgument("Value must be scalar."));
+    const string& config_value = val_tensor->scalar<tstring>()();
+    LOG(INFO) << "TestSetConfigKeyValueOp key=" << config_key
+              << "value=" << config_value;
+    auto* coord_agent = ctx->coordination_service_agent();
+    if (coord_agent == nullptr || !coord_agent->IsInitialized()) {
+      ctx->SetStatus(
+          errors::Internal("Coordination service agent is not instantiated or "
+                           "initialized properly."));
+      return;
+    }
+    OP_REQUIRES_OK(ctx, coord_agent->InsertKeyValue(config_key, config_value));
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("TestSetConfigKeyValue").Device(DEVICE_DEFAULT),
+                        TestSetConfigKeyValueOp);
+
+REGISTER_OP("TestGetConfigKeyValue")
+    .Input("key: string")
+    .Output("value: string")
+    .SetShapeFn(tensorflow::shape_inference::UnknownShape)
+    .Doc(R"doc(
+Test op getting distributed configs using coordination service.
+)doc");
+
+// Kernel that gets distributed configures using coordination service.
+class TestGetConfigKeyValueOp : public OpKernel {
+ public:
+  explicit TestGetConfigKeyValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* key_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("key", &key_tensor));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(key_tensor->shape()),
+                errors::InvalidArgument("Key must be scalar."));
+    const string& config_key = key_tensor->scalar<tstring>()();
+    LOG(INFO) << "TestGetConfigKeyValueOp key=" << config_key;
+
+    auto* coord_agent = ctx->coordination_service_agent();
+    if (coord_agent == nullptr || !coord_agent->IsInitialized()) {
+      ctx->SetStatus(
+          errors::Internal("Coordination service agent is not instantiated or "
+                           "initialized properly."));
+      return;
+    }
+    auto status_or_val = coord_agent->GetKeyValue(config_key);
+    OP_REQUIRES_OK(ctx, status_or_val.status());
+
+    Tensor* val_tensor;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output("value", key_tensor->shape(), &val_tensor));
+    auto value = val_tensor->scalar<tstring>()();
+    val_tensor->scalar<tstring>()() = status_or_val.ValueOrDie();
+  }
+};
+REGISTER_KERNEL_BUILDER(Name("TestGetConfigKeyValue").Device(DEVICE_DEFAULT),
+                        TestGetConfigKeyValueOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h
index ae3a90ff89f86d..bb56f927fc0378 100644
--- a/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h
+++ b/tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h
@@ -41,6 +41,12 @@ class RemoteRendezvous : public Rendezvous {
   // Fully construct the RemoteRendezvous.
   virtual Status Initialize(WorkerSession* session) = 0;
 
+  // In remote eager, set current instance as context default rendezvous which
+  // will be used for eager op-by-op execution.
+  virtual void SetRemoteEagerContextDefault() = 0;
+  // In remote eager, get if current instance is context default rendezvous.
+  virtual bool IsRemoteEagerContextDefault() = 0;
+
  protected:
   bool is_cross_process() override { return true; }
 };
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 1ac21ece25295c..07f990a72b6970 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -341,6 +341,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime:session_mgr",
         "//tensorflow/core/distributed_runtime:worker_cache_wrapper",
         "//tensorflow/core/distributed_runtime:worker_env",
+        "//tensorflow/core/distributed_runtime/rpc/coordination:grpc_coordination_service_impl",
         "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_service_impl",
         "//tensorflow/core/profiler/rpc:profiler_service_impl",
     ] + tf_protos_profiler_service() + tf_grpc_dependencies() + tf_grpc_cc_dependencies(),
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index b08c62b83d9c5d..de3980009b8908 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/master_env.h"
 #include "tensorflow/core/distributed_runtime/master_session.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
+#include "tensorflow/core/distributed_runtime/rpc/coordination/grpc_coordination_service_impl.h"
 #include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_master_service.h"
@@ -253,6 +254,8 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) {
                                          opts.worker_service_options)
                         .release();
   eager_service_ = new eager::GrpcEagerServiceImpl(&worker_env_, &builder);
+  coordination_service_ =
+      new GrpcCoordinationServiceImpl(&worker_env_, &builder);
 
   profiler_service_ = profiler::CreateProfilerService();
   builder.RegisterService(profiler_service_.get());
@@ -411,6 +414,9 @@ Status GrpcServer::Start() {
       eager_thread_.reset(
           env_->StartThread(ThreadOptions(), "TF_eager_service",
                             [this] { eager_service_->HandleRPCsLoop(); }));
+      coordination_thread_.reset(env_->StartThread(
+          ThreadOptions(), "TF_coordination_service",
+          [this] { coordination_service_->HandleRPCsLoop(); }));
 
       for (const auto& kv : extra_services_) {
         const std::string& service_name = kv.first;
@@ -479,7 +485,9 @@ Status GrpcServer::UpdateServerDef(const ServerDef& server_def) {
 // field inside the RPC coordination service handler.
 Status GrpcServer::SetCoordinationServiceAgentInstance(
     CoordinationServiceAgent* agent) {
-  // No op, coordination service is not implemented in open source.
+  auto* coord_service =
+      static_cast<GrpcCoordinationServiceImpl*>(coordination_service_);
+  coord_service->SetCoordinationServiceAgentInstance(agent);
   return Status::OK();
 }
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index 0b22d16155ce31..daafdb127676b3 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -214,6 +214,10 @@ class GrpcServer : public ServerInterface {
   std::unique_ptr<Thread> eager_thread_ TF_GUARDED_BY(mu_);
   std::shared_ptr<WorkerSession> worker_session_;
 
+  // Experimental coordination service implementation, and RPC polling thread.
+  AsyncServiceInterface* coordination_service_ = nullptr;
+  std::unique_ptr<Thread> coordination_thread_ TF_GUARDED_BY(mu_);
+
   // TensorFlow profiler service implementation.
   std::unique_ptr<grpc::ProfilerService::Service> profiler_service_ = nullptr;
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
index ce85b2f7b42862..d89ecda10811ce 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
@@ -83,14 +83,9 @@ class GrpcWorkerCache : public WorkerCachePartial {
 
   Status GetCoordinationClientCache(std::unique_ptr<CoordinationClientCache>*
                                         coordination_client_cache) override {
-#if defined(PLATFORM_GOOGLE)
     coordination_client_cache->reset(
         NewGrpcCoordinationClientCache(channel_cache_));
     return Status::OK();
-#else
-    return errors::Unimplemented(
-        "Coordination service in open source is not yet implemented.");
-#endif
   }
 
   void SetLogging(bool v) override { logger_.SetLogging(v); }
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 72a3fbb44bd1d5..c71fb32c3b23ae 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -221,8 +221,7 @@ void SessionMgr::ResetDefaultWorkerCache(WorkerCacheInterface* worker_cache) {
 Status SessionMgr::UpdateSession(
     const string& session, const ServerDef& server_def,
     const protobuf::RepeatedPtrField<DeviceAttributes>&
-        cluster_device_attributes,
-    bool isolate_session_state) {
+        cluster_device_attributes) {
   mutex_lock l(mu_);
   if (session.empty()) {
     return errors::InvalidArgument("Session must be non-empty.");
diff --git a/tensorflow/core/distributed_runtime/session_mgr.h b/tensorflow/core/distributed_runtime/session_mgr.h
index fc6e8762075ac5..b4a0d2045a08f1 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.h
+++ b/tensorflow/core/distributed_runtime/session_mgr.h
@@ -77,8 +77,7 @@ class SessionMgr {
   // session name (`session`) based on a new server_def and set of devices.
   Status UpdateSession(const string& session, const ServerDef& server_def,
                        const protobuf::RepeatedPtrField<DeviceAttributes>&
-                           cluster_device_attributes,
-                       bool isolate_session_state);
+                           cluster_device_attributes);
 
   // Locates the worker session for a given session handle
   Status WorkerSessionForSession(const string& session_handle,
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 3991657ebe1260..ee8a9b91f20303 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -110,7 +110,9 @@ void Worker::DeregisterGraphAsync(const DeregisterGraphRequest* request,
 }
 
 void Worker::AbortStep(int64_t step_id) {
-  Rendezvous* rendez = env_->rendezvous_mgr->Find(step_id);
+  RemoteRendezvous* rendez = env_->rendezvous_mgr->Find(step_id);
+  // Do not abort if it's a context global instance for eager op-by-op execution
+  if (rendez->IsRemoteEagerContextDefault()) return;
   SchedNonBlockingClosureAfter(1000000, [rendez, step_id]() {
     // Delay a bit before aborting the step. This way, the root
     // cause may return first back to the client instead of this
diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD
index 3dd93fd9028665..2bdb65e29bc774 100644
--- a/tensorflow/core/framework/BUILD
+++ b/tensorflow/core/framework/BUILD
@@ -1059,6 +1059,7 @@ cc_library(
         ":op_def_proto_cc",
         ":tensor",
         ":types_proto_cc",
+        "//tensorflow/core/platform:errors",
         "//tensorflow/core/platform:statusor",
         "//tensorflow/core/protobuf:error_codes_proto_impl_cc",
     ],
@@ -1288,6 +1289,7 @@ tf_cc_tests(
         "device_base_test.cc",
         "disable_jit_test.cc",
         "full_type_inference_util_test.cc",
+        "full_type_util_test.cc",
         "function_test.cc",
         "graph_def_util_test.cc",
         "graph_to_functiondef_test.cc",
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
index 6165984f3fd538..79c89a64d0c61b 100644
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -621,7 +621,9 @@ Status DatasetBase::ComputeNumSources() {
 }
 
 Status DatasetBase::CheckRandomAccessCompatible(const int64 index) const {
-  int64 cardinality = Cardinality();
+  CardinalityOptions options;
+  options.set_compute_level(CardinalityOptions::CARDINALITY_COMPUTE_MODERATE);
+  int64 cardinality = Cardinality(options);
   if (cardinality == kInfiniteCardinality ||
       cardinality == kUnknownCardinality) {
     return tensorflow::errors::FailedPrecondition(
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 251c3ef4a85136..36db2390e3d9f9 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -983,14 +983,19 @@ class DatasetBase : public core::RefCounted {
   // Returns the estimated number of bytes used for tensors of this dataset.
   virtual int64_t TotalBytes() const { return 0; }
 
-  // Returns the cardinality of this dataset. This should be removed once
-  // all callers are migrated to use Cardinality(CardinalityOptions).
+  // Returns the cardinality of this dataset.
+  // TODO(shilpakrish): Remove this overload once all callers are migrated
+  // to the API which passes in the options parameter.
+  ABSL_DEPRECATED("Use the overload that passes in the options parameter.")
   int64_t Cardinality() const;
 
   // Returns the cardinality of this dataset based on the options.
   int64_t Cardinality(CardinalityOptions options) const;
 
   // Internal implementation of cardinality for a dataset.
+  // TODO(shilpakrish): Remove this overload once all callers are migrated
+  // to the API which passes in the options parameter.
+  ABSL_DEPRECATED("Use the overload that passes in the options parameter.")
   virtual int64_t CardinalityInternal() const { return kUnknownCardinality; }
 
   // Internal implementation of cardinality for a dataset based on the options.
diff --git a/tensorflow/core/framework/full_type_util.cc b/tensorflow/core/framework/full_type_util.cc
index 89617dc97f2496..e1efbc3c9eeed8 100644
--- a/tensorflow/core/framework/full_type_util.cc
+++ b/tensorflow/core/framework/full_type_util.cc
@@ -15,12 +15,16 @@ limitations under the License.
 
 #include "tensorflow/core/framework/full_type_util.h"
 
+#include <algorithm>
+#include <string>
+
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/full_type.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/statusor.h"
 #include "tensorflow/core/protobuf/error_codes.pb.h"
 
@@ -79,6 +83,47 @@ OpTypeConstructor UnaryTensorContainer(FullTypeId t, FullTypeId dtype) {
   };
 }
 
+OpTypeConstructor UnaryTensorContainer(FullTypeId t, const string& var_name) {
+  return [t, var_name](OpDef* op_def) {
+    FullTypeDef* tdef =
+        op_def->mutable_output_arg(0)->mutable_experimental_full_type();
+    tdef->set_type_id(t);
+
+    FullTypeDef* targ = tdef->add_args();
+    targ->set_type_id(TFT_TENSOR);
+    FullTypeDef* varg = targ->add_args();
+    varg->set_type_id(TFT_VAR);
+    varg->set_s(var_name);
+
+    return Status::OK();
+  };
+}
+
+OpTypeConstructor VariadicTensorContainer(FullTypeId t,
+                                          const string& var_name) {
+  return [t, var_name](OpDef* op_def) {
+    FullTypeDef* tdef =
+        op_def->mutable_output_arg(0)->mutable_experimental_full_type();
+    tdef->set_type_id(t);
+
+    FullTypeDef* for_each = tdef->add_args();
+    for_each->set_type_id(TFT_FOR_EACH);
+    for_each->add_args()->set_type_id(TFT_PRODUCT);
+
+    FullTypeDef* tpl = for_each->add_args();
+    tpl->set_type_id(TFT_TENSOR);
+    FullTypeDef* targ = tpl->add_args();
+    targ->set_type_id(TFT_VAR);
+    targ->set_s(var_name);
+
+    FullTypeDef* tvar = for_each->add_args();
+    tvar->set_type_id(TFT_VAR);
+    tvar->set_s(var_name);
+
+    return Status::OK();
+  };
+}
+
 StatusOr<FullTypeDef> SpecializeType(const AttrSlice& attrs,
                                      const OpDef& op_def) {
   FullTypeDef ft;
@@ -132,6 +177,92 @@ StatusOr<FullTypeDef> SpecializeType(const AttrSlice& attrs,
   return ft;
 }
 
+const FullTypeDef& GetArgDefaultUnset(const FullTypeDef& t, int i) {
+  static FullTypeDef* unset_type = []() {
+    FullTypeDef* t = new FullTypeDef();
+    return t;
+  }();
+
+  if (i < t.args_size()) {
+    return t.args(i);
+  }
+  return *unset_type;
+}
+
+const FullTypeDef& GetArgDefaultAny(const FullTypeDef& t, int i) {
+  static FullTypeDef* any_type = []() {
+    FullTypeDef* t = new FullTypeDef();
+    t->set_type_id(TFT_ANY);
+    return t;
+  }();
+
+  if (i < t.args_size()) {
+    const FullTypeDef& f_val = t.args(i);
+    if (f_val.type_id() == TFT_UNSET) {
+      return *any_type;
+    }
+    return f_val;
+  }
+  return *any_type;
+}
+
+bool IsEqual(const FullTypeDef& lhs, const FullTypeDef& rhs) {
+  if (lhs.type_id() != rhs.type_id()) {
+    return false;
+  }
+  const auto& lhs_s = lhs.s();
+  const auto& rhs_s = rhs.s();
+  if (lhs_s.empty()) {
+    if (!rhs_s.empty()) {
+      return false;
+    }
+  } else if (rhs_s != lhs_s) {
+    return false;
+  }
+  for (int i = 0; i < std::max(lhs.args_size(), rhs.args_size()); i++) {
+    const FullTypeDef& lhs_arg = GetArgDefaultAny(lhs, i);
+    const FullTypeDef& rhs_arg = GetArgDefaultAny(rhs, i);
+
+    if (!IsEqual(lhs_arg, rhs_arg)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool IsSubtype(const FullTypeDef& lhs, const FullTypeDef& rhs, bool covariant) {
+  // Rule: ANY is a supertype of all types.
+  if (rhs.type_id() == TFT_ANY) {
+    return true;
+  }
+  // Compatibility rule: UNSET is treated as ANY for the purpose of subtyping.
+  if (rhs.type_id() == TFT_UNSET) {
+    return true;
+  }
+  // Default rule: type IDs must match.
+  if (lhs.type_id() != rhs.type_id()) {
+    return false;
+  }
+
+  for (int i = 0; i < std::max(lhs.args_size(), rhs.args_size()); i++) {
+    const FullTypeDef& lhs_arg = GetArgDefaultAny(lhs, i);
+    const FullTypeDef& rhs_arg = GetArgDefaultAny(rhs, i);
+
+    if (covariant) {
+      if (!IsSubtype(lhs_arg, rhs_arg)) {
+        return false;
+      }
+    } else {
+      if (!IsSubtype(rhs_arg, lhs_arg)) {
+        return false;
+      }
+    }
+  }
+
+  // Invariant: type IDs are eaqual, and all args are subtype of one another.
+  return true;
+}
+
 }  // namespace full_type
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/full_type_util.h b/tensorflow/core/framework/full_type_util.h
index 44d6ad6d93ce11..f6fe84ee9636b0 100644
--- a/tensorflow/core/framework/full_type_util.h
+++ b/tensorflow/core/framework/full_type_util.h
@@ -31,16 +31,19 @@ namespace tensorflow {
 namespace full_type {
 
 // TODO(mdan): Specific helpers won't get too far. Use a parser instead.
+// TODO(mdan): Move constructors into a separate file.
 
 // Helpers that allow shorthand expression for the more common kinds of type
 // constructors.
 // Note: The arity below refers to the number of arguments of parametric types,
 // not to the number of return values from a particular op.
+// Note: Type constructors are meant to create static type definitions in the
+// op definition (i.e. the OpDef proto).
 
 // Helper for a type constructor of <t>[] (with no parameters).
 OpTypeConstructor Nullary(FullTypeId t);
 
-// Helper for a type constructor of <t>[FT_VAR[<param_name>]].
+// Helper for a type constructor of <t>[FT_VAR[<var_name>]].
 OpTypeConstructor Unary(FullTypeId t, const string& var_name);
 
 // Helper for a type constructor of <t>[FT_ANY].
@@ -49,6 +52,17 @@ OpTypeConstructor UnaryGeneric(FullTypeId t);
 // Helper for a type constructor of <t>[FT_TENSOR[<dtype>]].
 OpTypeConstructor UnaryTensorContainer(FullTypeId t, FullTypeId dtype);
 
+// Helper for a type constructor of <t>[FT_VAR[<var_name>]].
+OpTypeConstructor UnaryTensorContainer(FullTypeId t, const string& var_name);
+
+// Helper for a type constructor of
+// <t>[FT_FOR_EACH[
+//     FT_PRODUCT,
+//     FT_TENSOR[FT_VAR[<var_name>]],
+//     FT_VAR[<var_name>]].
+// Multi-valued type variables will expand the template (see full_type.proto).
+OpTypeConstructor VariadicTensorContainer(FullTypeId t, const string& var_name);
+
 // Type specialization and inference logic. This function narrows the type
 // specified in an op definition. Such types are usually generic and dependent
 // on input types. This function resolves the output types based on the input
@@ -56,6 +70,14 @@ OpTypeConstructor UnaryTensorContainer(FullTypeId t, FullTypeId dtype);
 StatusOr<FullTypeDef> SpecializeType(const AttrSlice& attrs,
                                      const OpDef& op_def);
 
+const FullTypeDef& GetArgDefaultUnset(const FullTypeDef& t, int i);
+const FullTypeDef& GetArgDefaultAny(const FullTypeDef& t, int i);
+
+bool IsEqual(const FullTypeDef& lhs, const FullTypeDef& rhs);
+
+bool IsSubtype(const FullTypeDef& lhs, const FullTypeDef& rhs,
+               bool covariant = true);
+
 }  // namespace full_type
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/full_type_util_test.cc b/tensorflow/core/framework/full_type_util_test.cc
new file mode 100644
index 00000000000000..6cea3cfad6400c
--- /dev/null
+++ b/tensorflow/core/framework/full_type_util_test.cc
@@ -0,0 +1,401 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/full_type.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+namespace full_type {
+
+namespace {
+
+// TODO(mdan): Use ParseTextProto, ProtoEquals when available in a clean lib.
+
+TEST(Nullary, Basic) {
+  OpTypeConstructor ctor = Nullary(TFT_TENSOR);
+
+  OpDef op;
+  op.add_output_arg();
+
+  TF_ASSERT_OK(ctor(&op));
+
+  const FullTypeDef& t = op.output_arg(0).experimental_full_type();
+  EXPECT_EQ(t.type_id(), TFT_TENSOR);
+  EXPECT_EQ(t.args_size(), 0);
+}
+
+TEST(Unary, Basic) {
+  OpTypeConstructor ctor = Unary(TFT_TENSOR, "T");
+
+  OpDef op;
+  op.add_output_arg();
+
+  TF_ASSERT_OK(ctor(&op));
+
+  const FullTypeDef& t = op.output_arg(0).experimental_full_type();
+  EXPECT_EQ(t.type_id(), TFT_TENSOR);
+  EXPECT_EQ(t.args_size(), 1);
+  EXPECT_EQ(t.args(0).type_id(), TFT_VAR);
+  EXPECT_EQ(t.args(0).args_size(), 0);
+  EXPECT_EQ(t.args(0).s(), "T");
+}
+
+TEST(UnaryGeneric, Basic) {
+  OpTypeConstructor ctor = UnaryGeneric(TFT_TENSOR);
+
+  OpDef op;
+  op.add_output_arg();
+
+  TF_ASSERT_OK(ctor(&op));
+
+  const FullTypeDef& t = op.output_arg(0).experimental_full_type();
+  EXPECT_EQ(t.type_id(), TFT_TENSOR);
+  EXPECT_EQ(t.args_size(), 1);
+  EXPECT_EQ(t.args(0).type_id(), TFT_ANY);
+  EXPECT_EQ(t.args(0).args_size(), 0);
+}
+
+TEST(UnaryTensorContainer, Fixed) {
+  OpTypeConstructor ctor = UnaryTensorContainer(TFT_ARRAY, TFT_INT32);
+
+  OpDef op;
+  op.add_output_arg();
+
+  TF_ASSERT_OK(ctor(&op));
+
+  const FullTypeDef& t = op.output_arg(0).experimental_full_type();
+  EXPECT_EQ(t.type_id(), TFT_ARRAY);
+  EXPECT_EQ(t.args_size(), 1);
+  EXPECT_EQ(t.args(0).type_id(), TFT_TENSOR);
+  EXPECT_EQ(t.args(0).args_size(), 1);
+  EXPECT_EQ(t.args(0).args(0).type_id(), TFT_INT32);
+  EXPECT_EQ(t.args(0).args(0).args_size(), 0);
+}
+
+TEST(UnaryTensorContainer, Dependent) {
+  OpTypeConstructor ctor = UnaryTensorContainer(TFT_ARRAY, "T");
+
+  OpDef op;
+  op.add_output_arg();
+
+  TF_ASSERT_OK(ctor(&op));
+
+  const FullTypeDef& t = op.output_arg(0).experimental_full_type();
+  EXPECT_EQ(t.type_id(), TFT_ARRAY);
+  EXPECT_EQ(t.args_size(), 1);
+  EXPECT_EQ(t.args(0).type_id(), TFT_TENSOR);
+  EXPECT_EQ(t.args(0).args_size(), 1);
+  EXPECT_EQ(t.args(0).args(0).type_id(), TFT_VAR);
+  EXPECT_EQ(t.args(0).args(0).args_size(), 0);
+  EXPECT_EQ(t.args(0).args(0).s(), "T");
+}
+
+TEST(VariadicTensorContainer, Basic) {
+  OpTypeConstructor ctor = VariadicTensorContainer(TFT_ARRAY, "T");
+
+  OpDef op;
+  op.add_output_arg();
+
+  TF_ASSERT_OK(ctor(&op));
+
+  const FullTypeDef& t = op.output_arg(0).experimental_full_type();
+  EXPECT_EQ(t.type_id(), TFT_ARRAY);
+  EXPECT_EQ(t.args_size(), 1);
+  EXPECT_EQ(t.args(0).type_id(), TFT_FOR_EACH);
+  EXPECT_EQ(t.args(0).args_size(), 3);
+  EXPECT_EQ(t.args(0).args(0).type_id(), TFT_PRODUCT);
+  EXPECT_EQ(t.args(0).args(0).args_size(), 0);
+  EXPECT_EQ(t.args(0).args(1).type_id(), TFT_TENSOR);
+  EXPECT_EQ(t.args(0).args(1).args_size(), 1);
+  EXPECT_EQ(t.args(0).args(1).args(0).type_id(), TFT_VAR);
+  EXPECT_EQ(t.args(0).args(1).args(0).args_size(), 0);
+  EXPECT_EQ(t.args(0).args(1).args(0).s(), "T");
+  EXPECT_EQ(t.args(0).args(2).type_id(), TFT_VAR);
+  EXPECT_EQ(t.args(0).args(2).args_size(), 0);
+  EXPECT_EQ(t.args(0).args(2).s(), "T");
+}
+
+TEST(GetArgDefaults, DefaultUnsetFromNoArgs) {
+  FullTypeDef t;
+
+  const auto& d = GetArgDefaultUnset(t, 0);
+
+  EXPECT_EQ(d.type_id(), TFT_UNSET);
+}
+
+TEST(GetArgDefaults, DefaultUnsetFromOutOfBounds) {
+  FullTypeDef t;
+  t.add_args()->set_type_id(TFT_TENSOR);
+
+  const auto& d = GetArgDefaultUnset(t, 1);
+
+  EXPECT_EQ(d.type_id(), TFT_UNSET);
+}
+
+TEST(GetArgDefaults, NoDefaultUnsetFromArg) {
+  FullTypeDef t;
+  t.add_args()->set_type_id(TFT_TENSOR);
+  t.mutable_args(0)->add_args();
+
+  const auto& d = GetArgDefaultUnset(t, 0);
+
+  EXPECT_EQ(d.type_id(), TFT_TENSOR);
+  EXPECT_EQ(d.args_size(), 1);
+}
+
+TEST(GetArgDefaults, DefaultAnyFromNoArgs) {
+  FullTypeDef t;
+
+  const auto& d = GetArgDefaultAny(t, 0);
+
+  EXPECT_EQ(d.type_id(), TFT_ANY);
+}
+
+TEST(GetArgDefaults, DefaultAnyFromOutOfBounds) {
+  FullTypeDef t;
+  t.add_args()->set_type_id(TFT_TENSOR);
+
+  const auto& d = GetArgDefaultAny(t, 1);
+
+  EXPECT_EQ(d.type_id(), TFT_ANY);
+}
+
+TEST(GetArgDefaults, DefaultAnyFromUnset) {
+  FullTypeDef t;
+  t.add_args();
+
+  const auto& d = GetArgDefaultAny(t, 0);
+
+  EXPECT_EQ(d.type_id(), TFT_ANY);
+}
+
+TEST(GetArgDefaults, NoDefaultAnyFromArg) {
+  FullTypeDef t;
+  t.add_args()->set_type_id(TFT_TENSOR);
+  t.mutable_args(0)->add_args();
+
+  const auto& d = GetArgDefaultAny(t, 0);
+
+  EXPECT_EQ(d.type_id(), TFT_TENSOR);
+  EXPECT_EQ(d.args_size(), 1);
+}
+
+TEST(IsEqual, Reflexivity) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  EXPECT_TRUE(IsEqual(t, t));
+}
+
+TEST(IsEqual, Copy) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  FullTypeDef u;
+  u = t;
+  EXPECT_TRUE(IsEqual(t, u));
+  EXPECT_TRUE(IsEqual(u, t));
+}
+
+TEST(IsEqual, DifferentTypesNotEqual) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  FullTypeDef u;
+  u = t;
+  u.set_type_id(TFT_ARRAY);
+
+  EXPECT_FALSE(IsEqual(t, u));
+  EXPECT_FALSE(IsEqual(u, t));
+}
+
+TEST(IsEqual, DifferentAritiesNotEqual) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  FullTypeDef u;
+  u = t;
+  u.add_args()->set_type_id(TFT_FLOAT);
+
+  EXPECT_FALSE(IsEqual(t, u));
+  EXPECT_FALSE(IsEqual(u, t));
+}
+
+TEST(IsEqual, MissingArgsEquivalentToAny) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+
+  FullTypeDef u;
+  u = t;
+  u.add_args()->set_type_id(TFT_ANY);
+
+  EXPECT_TRUE(IsEqual(t, u));
+  EXPECT_TRUE(IsEqual(u, t));
+}
+
+TEST(IsEqual, DifferentArgsNotEqual) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  FullTypeDef u;
+  u = t;
+  u.mutable_args(1)->set_type_id(TFT_FLOAT);
+
+  EXPECT_FALSE(IsEqual(t, u));
+  EXPECT_FALSE(IsEqual(u, t));
+}
+
+TEST(IsEqual, DifferentStringValuesNotEqual) {
+  FullTypeDef t;
+  t.set_type_id(TFT_VAR);
+  t.set_s("T");
+
+  FullTypeDef u;
+  u = t;
+  u.set_type_id(TFT_VAR);
+  u.set_s("U");
+
+  EXPECT_FALSE(IsEqual(t, u));
+  EXPECT_FALSE(IsEqual(u, t));
+}
+
+TEST(IsSubtype, Reflexivity) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  EXPECT_TRUE(IsSubtype(t, t));
+}
+
+TEST(IsSubtype, Copy) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  FullTypeDef u;
+  u = t;
+  EXPECT_TRUE(IsSubtype(t, u));
+}
+
+TEST(IsSubtype, Any) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  FullTypeDef u;
+  u.set_type_id(TFT_ANY);
+
+  EXPECT_TRUE(IsSubtype(t, u));
+  EXPECT_FALSE(IsSubtype(u, t));
+}
+
+TEST(IsSubtype, Unset) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  FullTypeDef u;
+  u.set_type_id(TFT_UNSET);
+
+  EXPECT_TRUE(IsSubtype(t, u));
+  EXPECT_FALSE(IsSubtype(u, t));
+}
+
+TEST(IsSubtype, Covariance) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_ARRAY);
+  t.mutable_args(0)->add_args()->set_type_id(TFT_INT32);
+
+  FullTypeDef u;
+  u.set_type_id(TFT_TENSOR);
+  u.add_args()->set_type_id(TFT_ANY);
+
+  EXPECT_TRUE(IsSubtype(t, u, /*covariant=*/true));
+  EXPECT_FALSE(IsSubtype(u, t, /*covariant=*/true));
+
+  EXPECT_FALSE(IsSubtype(t, u, /*covariant=*/false));
+  EXPECT_TRUE(IsSubtype(u, t, /*covariant=*/false));
+}
+
+TEST(IsSubtype, DifferentTypesNotSubtype) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  FullTypeDef u;
+  u = t;
+  u.set_type_id(TFT_ARRAY);
+
+  EXPECT_FALSE(IsSubtype(t, u));
+  EXPECT_FALSE(IsSubtype(u, t));
+}
+
+TEST(IsSubtype, DifferentAritiesDefaultToAny) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  FullTypeDef u;
+  u = t;
+  u.add_args()->set_type_id(TFT_FLOAT);
+
+  EXPECT_FALSE(IsSubtype(t, u));
+  EXPECT_TRUE(IsSubtype(u, t));
+}
+
+TEST(IsSubtype, DifferentArgsNotSubtype) {
+  FullTypeDef t;
+  t.set_type_id(TFT_TENSOR);
+  t.add_args()->set_type_id(TFT_INT32);
+  t.add_args()->set_type_id(TFT_INT64);
+
+  FullTypeDef u;
+  u = t;
+  u.mutable_args(1)->set_type_id(TFT_FLOAT);
+
+  EXPECT_FALSE(IsSubtype(t, u));
+  EXPECT_FALSE(IsSubtype(u, t));
+}
+
+}  // namespace
+
+}  // namespace full_type
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 0dbc703cbe7301..5b03da2658a0d1 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -768,6 +768,14 @@ class FunctionLibraryRuntime {
     // set of small functions.  For example, running kernels synchronously can
     // be faster under some conditions.
     bool allow_small_function_optimizations = false;
+
+    // Force int32 _Arg and _Retvals nodes to be left on device instead of
+    // pinning to host.
+    //
+    // Note that we do not pin int32 nodes to host for subgraphs running in
+    // TPU/XLA devices. So this is mainly used to handle the case of multi-CPU
+    // and GPU (non-XLA) graphs.
+    bool int_args_and_retvals_on_device = false;
   };
   typedef uint64 Handle;
   virtual Status Instantiate(const std::string& function_name, AttrSlice attrs,
diff --git a/tensorflow/core/framework/tensor_shape.cc b/tensorflow/core/framework/tensor_shape.cc
index d3b2f68a8fb890..e30872f6366317 100644
--- a/tensorflow/core/framework/tensor_shape.cc
+++ b/tensorflow/core/framework/tensor_shape.cc
@@ -17,7 +17,6 @@ limitations under the License.
 
 #include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
-#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/errors.h"
diff --git a/tensorflow/core/framework/tensor_shape.h b/tensorflow/core/framework/tensor_shape.h
index f0e84f58e3ce7c..d677467cee8871 100644
--- a/tensorflow/core/framework/tensor_shape.h
+++ b/tensorflow/core/framework/tensor_shape.h
@@ -20,9 +20,6 @@ limitations under the License.
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/types.pb.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/core/status.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/strings/str_util.h"
diff --git a/tensorflow/core/ir/importexport/BUILD b/tensorflow/core/ir/importexport/BUILD
index ef271ed923b384..a6df09ce66df2d 100644
--- a/tensorflow/core/ir/importexport/BUILD
+++ b/tensorflow/core/ir/importexport/BUILD
@@ -17,7 +17,10 @@ cc_library(
     hdrs = [
         "import.h",
     ],
-    visibility = ["//tensorflow/core:__subpackages__"],
+    visibility = [
+        "//tensorflow/compiler/mlir/tensorflow:__subpackages__",
+        "//tensorflow/core:__subpackages__",
+    ],
     deps = [
         ":convert_attributes",
         ":convert_tensor",
diff --git a/tensorflow/core/ir/importexport/convert_attributes.cc b/tensorflow/core/ir/importexport/convert_attributes.cc
index 847643a7f4d670..fd2edf483ee137 100644
--- a/tensorflow/core/ir/importexport/convert_attributes.cc
+++ b/tensorflow/core/ir/importexport/convert_attributes.cc
@@ -365,6 +365,8 @@ tensorflow::StatusOr<Attribute> ConvertNonFuncAttributeValue(
           TF_ASSIGN_OR_RETURN(
               auto attr,
               ConvertAttributeValue(subattr.second, builder, tfgDialect));
+          if (subattr.first.empty())
+            return InvalidArgument("empty func_attr name");
           subattrs.push_back(builder.getNamedAttr(subattr.first, attr));
         }
         attrs.push_back(FuncAttr::get(builder.getContext(), func_attr.name(),
@@ -389,6 +391,7 @@ tensorflow::StatusOr<Attribute> ConvertAttributeValue(
     case AttrValue::kFunc: {
       NamedAttrList attrs;
       for (const auto& func_attr : value.func().attr()) {
+        if (func_attr.first.empty()) return InvalidArgument("empty attr name");
         TF_ASSIGN_OR_RETURN(
             auto attr,
             ConvertAttributeValue(func_attr.second, builder, tfgDialect));
diff --git a/tensorflow/core/ir/importexport/functiondef_import.cc b/tensorflow/core/ir/importexport/functiondef_import.cc
index 929bff2ee0cfbf..ad63e805dded4d 100644
--- a/tensorflow/core/ir/importexport/functiondef_import.cc
+++ b/tensorflow/core/ir/importexport/functiondef_import.cc
@@ -165,6 +165,7 @@ Status ImportNodes(ValueMapManager value_manager,
   // Process every node and create a matching MLIR operation
   for (const NodeDef& node : nodes) {
     DVLOG(0) << "Processing node " << node.name() << "\n";
+    if (node.op().empty()) return InvalidArgument("empty op type");
     OperationState state(unknown_loc, absl::StrCat("tfg.", node.op()));
     // Fetch the inputs, creating placeholder if an input hasn't been visited.
     for (const std::string& input : node.input())
@@ -255,6 +256,8 @@ Status ImportGenericFunction(
   TFGraphDialect* tfgDialect = cast<TFGraphDialect>(func_op->getDialect());
   NamedAttrList attrs;
   DictionaryAttr func_attrs = builder.getDictionaryAttr({});
+  if (signature.name().empty())
+    return InvalidArgument("generic function without a name");
   attrs.append("sym_name", builder.getStringAttr(signature.name()));
   attrs.append("generic", builder.getUnitAttr());
   if (!signature.description().empty())
diff --git a/tensorflow/core/ir/importexport/import.cc b/tensorflow/core/ir/importexport/import.cc
index 57b1a2174b9690..7542d2e2ebb39e 100644
--- a/tensorflow/core/ir/importexport/import.cc
+++ b/tensorflow/core/ir/importexport/import.cc
@@ -588,6 +588,7 @@ Status GraphImporter::ConvertNode(const Node& node) {
                       StringAttr::get(context_, node.name()));
   for (const auto& namedAttr : node.attrs()) {
     const std::string& name = namedAttr.first;
+    if (name.empty()) return InvalidArgument("empty attr name");
     const AttrValue& tf_attr = namedAttr.second;
     TF_ASSIGN_OR_RETURN(Attribute attr,
                         ConvertAttributeValue(tf_attr, builder_, dialect_));
@@ -729,6 +730,8 @@ tensorflow::StatusOr<GraphFuncOp> ImportFunctionDef(
                         ConvertAttributeValue(tf_attr, builder, tfgDialect));
     attrs.append(name, attr);
   }
+  if (signature.name().empty())
+    return InvalidArgument("function without a name");
   attrs.append("sym_name", builder.getStringAttr(name));
 
   if (!signature.description().empty())
@@ -918,6 +921,32 @@ bool IsGenericFunction(FunctionDef fdef) {
   return false;
 }
 
+}  // namespace
+
+// Convert an array of "handle_data" (a DType and a Shape) to an MLIR array
+// attribute. Each entry will be itself an ArrayAttribute containing a TypeAttr
+// and a ShapeAttr
+tensorflow::StatusOr<ArrayAttr> ConvertHandleData(
+    Builder builder,
+    const RepeatedPtrField<ResourceHandleProto_DtypeAndShape>& handle_data) {
+  // Two entries: a type and a shape.
+  SmallVector<Attribute> dtype_and_shape;
+  for (const auto& handle : handle_data) {
+    if (handle.dtype() == tensorflow::DT_INVALID)
+      return InvalidArgument("Invalid dtype for handle_data");
+    Type dtype;
+    TF_RETURN_IF_ERROR(ConvertDataType(handle.dtype(), builder, &dtype));
+    TF_ASSIGN_OR_RETURN(
+        Attribute shape,
+        ConvertTensorShapeProto(handle.shape(), builder.getContext()));
+
+    dtype_and_shape.push_back(
+        builder.getArrayAttr({TypeAttr::get(dtype), shape}));
+  }
+  return builder.getArrayAttr(dtype_and_shape);
+}
+// Convert a Graph and function libs to a MLIR module containing the graph and
+// expressed in TFG dialect.
 tensorflow::StatusOr<OwningModuleRef> ImportGraphAndFunctionsToMlir(
     MLIRContext* context, const Graph& graph, const GraphDebugInfo& debug_info,
     const FunctionLibraryDefinition& flib_def) {
@@ -951,30 +980,8 @@ tensorflow::StatusOr<OwningModuleRef> ImportGraphAndFunctionsToMlir(
   return module;
 }
 
-}  // namespace
-
-// Convert an array of "handle_data" (a DType and a Shape) to an MLIR array
-// attribute. Each entry will be itself an ArrayAttribute containing a TypeAttr
-// and a ShapeAttr
-tensorflow::StatusOr<ArrayAttr> ConvertHandleData(
-    Builder builder,
-    const RepeatedPtrField<ResourceHandleProto_DtypeAndShape>& handle_data) {
-  // Two entries: a type and a shape.
-  SmallVector<Attribute> dtype_and_shape;
-  for (const auto& handle : handle_data) {
-    Type dtype;
-    if (handle.dtype() != tensorflow::DT_INVALID)
-      TF_RETURN_IF_ERROR(ConvertDataType(handle.dtype(), builder, &dtype));
-    TF_ASSIGN_OR_RETURN(
-        Attribute shape,
-        ConvertTensorShapeProto(handle.shape(), builder.getContext()));
-
-    dtype_and_shape.push_back(
-        builder.getArrayAttr({TypeAttr::get(dtype), shape}));
-  }
-  return builder.getArrayAttr(dtype_and_shape);
-}
-
+// Convert a GraphDef to a MLIR module containing the graph and expressed in TFG
+// dialect.
 tensorflow::StatusOr<OwningModuleRef> ImportGraphDefToMlir(
     MLIRContext* context, const GraphDebugInfo& debug_info,
     const GraphDef& graphdef) {
diff --git a/tensorflow/core/ir/importexport/import.h b/tensorflow/core/ir/importexport/import.h
index 9b3666a8fb05b1..8551764d3f4f2d 100644
--- a/tensorflow/core/ir/importexport/import.h
+++ b/tensorflow/core/ir/importexport/import.h
@@ -33,6 +33,15 @@ limitations under the License.
 namespace mlir {
 namespace tfg {
 
+// Convert a Graph and function libs to a MLIR module containing the graph and
+// expressed in TFG dialect.
+tensorflow::StatusOr<OwningModuleRef> ImportGraphAndFunctionsToMlir(
+    MLIRContext* context, const tensorflow::Graph& graph,
+    const tensorflow::GraphDebugInfo& debug_info,
+    const tensorflow::FunctionLibraryDefinition& flib_def);
+
+// Convert a GraphDef to a MLIR module containing the graph and expressed in TFG
+// dialect.
 tensorflow::StatusOr<OwningModuleRef> ImportGraphDefToMlir(
     MLIRContext* context, const tensorflow::GraphDebugInfo& debug_info,
     const tensorflow::GraphDef& graphdef);
diff --git a/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_attr_key.pbtxt b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_attr_key.pbtxt
new file mode 100644
index 00000000000000..5e9c003b010451
--- /dev/null
+++ b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_attr_key.pbtxt
@@ -0,0 +1,55 @@
+# RUN: not tfg-translate -graphdef-to-mlir %s 2>&1 | FileCheck %s
+
+# CHECK: empty attr name
+
+node {
+  name: "SaveV/"
+  op: "PartitionedCall"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: ""
+    value {
+    }
+  }
+  attr {
+    key: "Tin"
+    value {
+    }
+  }
+  attr {
+    key: "Tout"
+    value {
+    }
+  }
+  attr {
+    key: "config"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "config_proto"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "executor_type"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "f"
+    value {
+      func {
+      }
+    }
+  }
+}
+library {
+  function {
+  }
+}
+versions {
+}
diff --git a/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_func_attr_key.pbtxt b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_func_attr_key.pbtxt
new file mode 100644
index 00000000000000..cc4742efcc05ab
--- /dev/null
+++ b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_func_attr_key.pbtxt
@@ -0,0 +1,165 @@
+# RUN: not tfg-translate -graphdef-to-mlir %s 2>&1 | FileCheck %s
+
+# CHECK: empty attr name
+
+node {
+  name: "SaveV/"
+  op: "PartitionedCall"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "T"
+    value {
+      func {
+        attr {
+          key: ""
+          value {
+          }
+        }
+        attr {
+          key: "\036"
+          value {
+          }
+        }
+        attr {
+          key: " "
+          value {
+          }
+        }
+        attr {
+          key: "1"
+          value {
+          }
+        }
+        attr {
+          key: "2"
+          value {
+          }
+        }
+        attr {
+          key: "loc:@c"
+          value {
+          }
+        }
+        attr {
+          key: "\177"
+          value {
+            func {
+              attr {
+                key: ""
+                value {
+                }
+              }
+              attr {
+                key: "\036"
+                value {
+                }
+              }
+              attr {
+                key: "1"
+                value {
+                }
+              }
+              attr {
+                key: "2"
+                value {
+                }
+              }
+              attr {
+                key: "_class"
+                value {
+                }
+              }
+              attr {
+                key: "\177"
+                value {
+                }
+              }
+              attr {
+                key: "\177\177"
+                value {
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "Tin"
+    value {
+    }
+  }
+  attr {
+    key: "Tout"
+    value {
+    }
+  }
+  attr {
+    key: "config"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "config_proto"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "executor_type"
+    value {
+      s: ""
+    }
+  }
+  attr {
+    key: "f"
+    value {
+      func {
+        attr {
+          key: "E"
+          value {
+          }
+        }
+      }
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+    }
+  }
+}
+node {
+  name: "serving_default_x"
+  op: "Placeholder"
+  device: "/job:localhost/replica:0/task:0/device:CPU:0"
+  attr {
+    key: "dtype"
+    value {
+      type: DT_STRING
+    }
+  }
+  attr {
+    key: "shape"
+    value {
+      shape {
+        unknown_rank: true
+      }
+    }
+  }
+}
+library {
+  function {
+    signature {
+      is_distributed_communication: true
+    }
+  }
+  gradient {
+    gradient_func: "\001\000\000\000\000\000\000\000"
+  }
+}
+versions {
+}
+
diff --git a/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_func_attr_name.pbtxt b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_func_attr_name.pbtxt
new file mode 100644
index 00000000000000..40be7f8b0ced67
--- /dev/null
+++ b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_func_attr_name.pbtxt
@@ -0,0 +1,38 @@
+# RUN: not tfg-translate -graphdef-to-mlir %s 2>&1 | FileCheck %s
+
+# CHECK: empty func_attr name
+
+node {
+  name: "NoOp"
+  op: "NoOp"
+  attr {
+    key: "dense_shapes"
+    value {
+      list {
+        shape {
+          dim {
+            size: 1
+          }
+        }
+        shape {
+          dim {
+            size: 1
+          }
+        }
+        func {
+          attr {
+            key: ""
+            value {
+              type: DT_QINT16
+            }
+          }
+        }
+      }
+    }
+  }
+}
+library {
+}
+versions {
+  producer: 27
+}
diff --git a/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_op_type.pbtxt b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_op_type.pbtxt
new file mode 100644
index 00000000000000..0ed0763eef75c2
--- /dev/null
+++ b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_empty_op_type.pbtxt
@@ -0,0 +1,22 @@
+# RUN: not tfg-translate -graphdef-to-mlir %s 2>&1 | FileCheck %s
+
+# CHECK: empty op type
+
+library {
+  function {
+    signature {
+      name: "XTimesTwo"
+    }
+    node_def {
+      name: "value"
+      attr {
+        key: "Tin"
+        value {
+          placeholder: "\t"
+        }
+      }
+    }
+  }
+}
+versions {
+}
diff --git a/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_func_with_empty_name.pbtxt b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_func_with_empty_name.pbtxt
new file mode 100644
index 00000000000000..327da3f2153240
--- /dev/null
+++ b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_func_with_empty_name.pbtxt
@@ -0,0 +1,10 @@
+# RUN: not tfg-translate -graphdef-to-mlir %s 2>&1 | FileCheck %s
+
+# CHECK: function without a name
+
+library {
+  function {
+    signature {
+    }
+  }
+}
diff --git a/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_generic_func_with_empty_name.pbtxt b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_generic_func_with_empty_name.pbtxt
new file mode 100644
index 00000000000000..e396699069d6fa
--- /dev/null
+++ b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_generic_func_with_empty_name.pbtxt
@@ -0,0 +1,20 @@
+# RUN: not tfg-translate -graphdef-to-mlir %s 2>&1 | FileCheck %s
+
+# CHECK: generic function without a name
+
+library {
+  function {
+    signature {
+    }
+    node_def {
+      name: "y"
+      op: "NoOp"
+      attr {
+        key: "T"
+        value {
+          placeholder: "T"
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_handle_data.pbtxt b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_handle_data.pbtxt
new file mode 100644
index 00000000000000..2a6fb92da04397
--- /dev/null
+++ b/tensorflow/core/ir/importexport/tests/graphdef_to_mlir/invalid_handle_data.pbtxt
@@ -0,0 +1,92 @@
+# RUN: not tfg-translate -graphdef-to-mlir %s 2>&1 | FileCheck %s
+
+# CHECK: INVALID_ARGUMENT: Invalid dtype for handle_data
+
+library {
+  function {
+    signature {
+      name: "XTimesTwo"
+      input_arg {
+        name: "x"
+        type_attr: "T"
+      }
+      output_arg {
+        name: "y"
+        type_attr: "T"
+        # This empty handle_data is invalid.
+        handle_data {
+        }
+      }
+      attr {
+        name: "T"
+        type: "type"
+        allowed_values {
+          list {
+            type: DT_FLOAT
+            type: DT_DOUBLE
+            type: DT_INT32
+            type: DT_INT64
+          }
+        }
+      }
+    }
+    node_def {
+      name: "two"
+      op: "Const"
+      attr {
+        key: "dtype"
+        value {
+          type: DT_INT64
+        }
+      }
+      attr {
+        key: "value"
+        value {
+          tensor {
+            dtype: DT_INT64
+            tensor_shape {
+            }
+            int64_val: 2
+          }
+        }
+      }
+    }
+    node_def {
+      name: "scale"
+      op: "Cast"
+      input: "two:output:0"
+      attr {
+        key: "DstT"
+        value {
+          placeholder: "T"
+        }
+      }
+      attr {
+        key: "SrcT"
+        value {
+          type: DT_INT64
+        }
+      }
+    }
+    node_def {
+      name: "y"
+      op: "Mul"
+      input: "x"
+      input: "scale:y:0"
+      attr {
+        key: "T"
+        value {
+          placeholder: "T"
+        }
+      }
+    }
+    ret {
+      key: "y"
+      value: "y:z:0"
+    }
+  }
+}
+versions {
+  producer: 762
+}
+
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 178c8a6f1af823..2e868c65b52e0e 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -9,7 +9,6 @@ load(
     "if_not_windows",
     "if_oss",
     "tf_cc_binary",
-    "tf_cc_shared_object",
     "tf_cc_test",
     "tf_cc_tests",
     "tf_copts",
@@ -68,6 +67,9 @@ load(
     "if_rocm",
 )
 
+# buildifier: disable=same-origin-load
+load("//tensorflow:tensorflow.bzl", "tf_cc_shared_library")
+
 # Description:
 # Op kernel implementations for TensorFlow.
 #
@@ -1948,10 +1950,11 @@ tf_cc_test(
     ],
 )
 
-tf_cc_test(
+tf_cuda_cc_test(
     name = "fake_quant_ops_test",
     size = "small",
     srcs = ["fake_quant_ops_test.cc"],
+    tags = tf_cuda_tests_tags(),
     deps = [
         ":fake_quant_ops",
         ":ops_testutil",
@@ -2439,6 +2442,9 @@ tf_kernel_library(
 tf_cc_test(
     name = "while_op_test",
     srcs = ["while_op_test.cc"],
+    tags = [
+        "no_windows",
+    ],  # TODO(b/208697533): Re-enable after fixing.
     deps = [
         ":control_flow_ops",
         "//tensorflow/c/experimental/stream_executor",
@@ -3203,6 +3209,7 @@ tf_kernel_library(
 )
 
 SAVE_RESTORE_DEPS = [
+    ":checkpoint_callback_manager",
     ":save_restore_tensor",
     "//tensorflow/core:framework",
     "//tensorflow/core:lib",
@@ -3224,6 +3231,35 @@ tf_kernel_library(
     deps = SAVE_RESTORE_DEPS,
 )
 
+tf_kernel_library(
+    name = "checkpoint_callback_manager",
+    srcs = [
+        "checkpoint_callback_manager.cc",
+    ],
+    hdrs = [
+        "checkpoint_callback_manager.h",
+    ],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core/platform:regexp",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_tests(
+    name = "checkpoint_callback_manager_test",
+    size = "small",
+    srcs = ["checkpoint_callback_manager_test.cc"],
+    deps = [
+        ":checkpoint_callback_manager",
+        ":io",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_kernel_library(
     name = "save_restore_v2_ops",
     prefix = "save_restore_v2_ops",
@@ -6232,6 +6268,7 @@ filegroup(
         "bincount_op.h",
         "broadcast_to_op.h",
         "bucketize_op.h",
+        "checkpoint_callback_manager.h",
         "concat_lib.h",
         "control_flow_ops.h",
         "conv_2d.h",
@@ -6482,6 +6519,7 @@ filegroup(
         "bincount_op.cc",
         "broadcast_to_op.cc",
         "bucketize_op.cc",
+        "checkpoint_callback_manager.cc",
         "ctc_decoder_ops.cc",
         "decode_padded_raw_op.cc",
         "depthtospace_op.cc",
@@ -7629,8 +7667,57 @@ cc_library(
 )
 
 # Shared object that links all the kernels TF needs.
-tf_cc_shared_object(
+tf_cc_shared_library(
     name = "libtfkernel_all_kernels.so",
+    static_deps = [
+        # copybara:comment_begin(oss only)
+        "@bazel_tools//:__subpackages__",
+        "@boringssl//:__subpackages__",
+        "@com_github_cares_cares//:__subpackages__",
+        "@com_github_googlecloudplatform_tensorflow_gcp_tools//:__subpackages__",
+        "@com_github_grpc_grpc//:__subpackages__",
+        "@com_google_absl//:__subpackages__",
+        "@com_google_googleapis//:__subpackages__",
+        "@com_google_protobuf//:__subpackages__",
+        "@com_googlesource_code_re2//:__subpackages__",
+        "@compute_library//:__subpackages__",
+        "@curl//:__subpackages__",
+        "@double_conversion//:__subpackages__",
+        "@eigen_archive//:__subpackages__",
+        "@farmhash_archive//:__subpackages__",
+        "@farmhash_gpu_archive//:__subpackages__",
+        "@fft2d//:__subpackages__",
+        "@gemmlowp//:__subpackages__",
+        "@gif//:__subpackages__",
+        "@highwayhash//:__subpackages__",
+        "@hwloc//:__subpackages__",
+        "@icu//:__subpackages__",
+        "@jsoncpp_git//:__subpackages__",
+        "@libjpeg_turbo//:__subpackages__",
+        "@libxsmm_archive//:__subpackages__",
+        "@llvm_openmp//:__subpackages__",
+        "@llvm-project//:__subpackages__",
+        "@llvm_terminfo//:__subpackages__",
+        "@llvm_zlib//:__subpackages__",
+        "@lmdb//:__subpackages__",
+        "@local_config_cuda//:__subpackages__",
+        "@local_config_git//:__subpackages__",
+        "@local_config_nccl//:__subpackages__",
+        "@local_config_rocm//:__subpackages__",
+        "@local_config_tensorrt//:__subpackages__",
+        "@local_execution_config_platform//:__subpackages__",
+        "@mkl_dnn_acl_compatible//:__subpackages__",
+        "@mkl_dnn_v1//:__subpackages__",
+        "@nsync//:__subpackages__",
+        "@org_sqlite//:__subpackages__",
+        "@platforms//:__subpackages__",
+        "@png//:__subpackages__",
+        "@snappy//:__subpackages__",
+        "//:__subpackages__",
+        "@upb//:__subpackages__",
+        "@zlib//:__subpackages__",
+        # copybara:comment_end
+    ],
     visibility = ["//visibility:public"],
     deps = [
         ":kernel_platform_strings",
diff --git a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
index 831f6f2f7802dd..2f25ff81554a39 100644
--- a/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/prediction_ops.cc
@@ -37,7 +37,7 @@ limitations under the License.
 namespace tensorflow {
 
 static void ConvertVectorsToMatrices(
-    const OpInputList bucketized_features_list,
+    OpKernelContext* const context, const OpInputList bucketized_features_list,
     std::vector<tensorflow::TTypes<int32>::ConstMatrix>& bucketized_features) {
   for (const Tensor& tensor : bucketized_features_list) {
     if (tensor.dims() == 1) {
@@ -45,6 +45,10 @@ static void ConvertVectorsToMatrices(
       bucketized_features.emplace_back(
           TTypes<int32>::ConstMatrix(v.data(), v.size(), 1));
     } else {
+      OP_REQUIRES(context, TensorShapeUtils::IsMatrix(tensor.shape()),
+                  errors::Internal("Cannot use tensor as matrix, expected "
+                                   "vector or matrix, received shape ",
+                                   tensor.shape().DebugString()));
       bucketized_features.emplace_back(tensor.matrix<int32>());
     }
   }
@@ -58,6 +62,9 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
  public:
   explicit BoostedTreesTrainingPredictOp(OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr("num_bucketized_features",
                                              &num_bucketized_features_));
     OP_REQUIRES_OK(context,
@@ -76,17 +83,26 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
                                                 &bucketized_features_list));
     std::vector<tensorflow::TTypes<int32>::ConstMatrix> bucketized_features;
     bucketized_features.reserve(bucketized_features_list.size());
-    ConvertVectorsToMatrices(bucketized_features_list, bucketized_features);
+    ConvertVectorsToMatrices(context, bucketized_features_list,
+                             bucketized_features);
     const int batch_size = bucketized_features[0].dimension(0);
 
     const Tensor* cached_tree_ids_t;
     OP_REQUIRES_OK(context,
                    context->input("cached_tree_ids", &cached_tree_ids_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(cached_tree_ids_t->shape()),
+                errors::InvalidArgument(
+                    "cached_tree_ids must be a vector, received shape ",
+                    cached_tree_ids_t->shape().DebugString()));
     const auto cached_tree_ids = cached_tree_ids_t->vec<int32>();
 
     const Tensor* cached_node_ids_t;
     OP_REQUIRES_OK(context,
                    context->input("cached_node_ids", &cached_node_ids_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsVector(cached_node_ids_t->shape()),
+                errors::InvalidArgument(
+                    "cached_node_ids must be a vector, received shape ",
+                    cached_node_ids_t->shape().DebugString()));
     const auto cached_node_ids = cached_node_ids_t->vec<int32>();
 
     // Allocate outputs.
@@ -118,9 +134,9 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
       output_partial_logits.setZero();
     } else {
       output_tree_ids.setConstant(latest_tree);
-      auto do_work = [&resource, &bucketized_features, &cached_tree_ids,
-                      &cached_node_ids, &output_partial_logits,
-                      &output_node_ids, latest_tree,
+      auto do_work = [&context, &resource, &bucketized_features,
+                      &cached_tree_ids, &cached_node_ids,
+                      &output_partial_logits, &output_node_ids, latest_tree,
                       this](int64_t start, int64_t end) {
         for (int32_t i = start; i < end; ++i) {
           int32_t tree_id = cached_tree_ids(i);
@@ -138,7 +154,11 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
             // node's value. The following logic handles both of these cases.
             const auto& node_logits = resource->node_value(tree_id, node_id);
             if (!node_logits.empty()) {
-              DCHECK_EQ(node_logits.size(), logits_dimension_);
+              OP_REQUIRES(
+                  context, node_logits.size() == logits_dimension_,
+                  errors::Internal(
+                      "Expected node_logits.size() == logits_dimension_, got ",
+                      node_logits.size(), " vs ", logits_dimension_));
               for (int32_t j = 0; j < logits_dimension_; ++j) {
                 partial_tree_logits[j] -= node_logits[j];
               }
@@ -151,7 +171,11 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
           while (true) {
             if (resource->is_leaf(tree_id, node_id)) {
               const auto& leaf_logits = resource->node_value(tree_id, node_id);
-              DCHECK_EQ(leaf_logits.size(), logits_dimension_);
+              OP_REQUIRES(
+                  context, leaf_logits.size() == logits_dimension_,
+                  errors::Internal(
+                      "Expected leaf_logits.size() == logits_dimension_, got ",
+                      leaf_logits.size(), " vs ", logits_dimension_));
               // Tree is done
               const float tree_weight = resource->GetTreeWeight(tree_id);
               for (int32_t j = 0; j < logits_dimension_; ++j) {
@@ -201,6 +225,9 @@ class BoostedTreesPredictOp : public OpKernel {
  public:
   explicit BoostedTreesPredictOp(OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr("num_bucketized_features",
                                              &num_bucketized_features_));
     OP_REQUIRES_OK(context,
@@ -219,7 +246,8 @@ class BoostedTreesPredictOp : public OpKernel {
                                                 &bucketized_features_list));
     std::vector<tensorflow::TTypes<int32>::ConstMatrix> bucketized_features;
     bucketized_features.reserve(bucketized_features_list.size());
-    ConvertVectorsToMatrices(bucketized_features_list, bucketized_features);
+    ConvertVectorsToMatrices(context, bucketized_features_list,
+                             bucketized_features);
     const int batch_size = bucketized_features[0].dimension(0);
 
     // Allocate outputs.
@@ -236,8 +264,8 @@ class BoostedTreesPredictOp : public OpKernel {
     }
 
     const int32_t last_tree = resource->num_trees() - 1;
-    auto do_work = [&resource, &bucketized_features, &output_logits, last_tree,
-                    this](int64_t start, int64_t end) {
+    auto do_work = [&context, &resource, &bucketized_features, &output_logits,
+                    last_tree, this](int64_t start, int64_t end) {
       for (int32_t i = start; i < end; ++i) {
         std::vector<float> tree_logits(logits_dimension_, 0.0);
         int32_t tree_id = 0;
@@ -246,7 +274,11 @@ class BoostedTreesPredictOp : public OpKernel {
           if (resource->is_leaf(tree_id, node_id)) {
             const float tree_weight = resource->GetTreeWeight(tree_id);
             const auto& leaf_logits = resource->node_value(tree_id, node_id);
-            DCHECK_EQ(leaf_logits.size(), logits_dimension_);
+            OP_REQUIRES(
+                context, leaf_logits.size() == logits_dimension_,
+                errors::Internal(
+                    "Expected leaf_logits.size() == logits_dimension_, got ",
+                    leaf_logits.size(), " vs ", logits_dimension_));
             for (int32_t j = 0; j < logits_dimension_; ++j) {
               tree_logits[j] += tree_weight * leaf_logits[j];
             }
@@ -298,6 +330,9 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
   explicit BoostedTreesExampleDebugOutputsOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr("num_bucketized_features",
                                              &num_bucketized_features_));
     OP_REQUIRES_OK(context,
@@ -319,7 +354,8 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
                                                 &bucketized_features_list));
     std::vector<tensorflow::TTypes<int32>::ConstMatrix> bucketized_features;
     bucketized_features.reserve(bucketized_features_list.size());
-    ConvertVectorsToMatrices(bucketized_features_list, bucketized_features);
+    ConvertVectorsToMatrices(context, bucketized_features_list,
+                             bucketized_features);
     const int batch_size = bucketized_features[0].dimension(0);
 
     // We need to get the feature ids used for splitting and the logits after
@@ -339,14 +375,16 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
     // features used to split and the associated logits at each point along the
     // path. Note: feature_ids has one less value than logits_path because the
     // first value of each logit path will be the bias.
-    auto do_work = [&resource, &bucketized_features, &output_debug_info,
-                    last_tree](int64_t start, int64_t end) {
+    auto do_work = [&context, &resource, &bucketized_features,
+                    &output_debug_info, last_tree](int64_t start, int64_t end) {
       for (int32_t i = start; i < end; ++i) {
         // Proto to store debug outputs, per example.
         boosted_trees::DebugOutput example_debug_info;
         // Initial bias prediction. E.g., prediction based off training mean.
         const auto& tree_logits = resource->node_value(0, 0);
-        DCHECK_EQ(tree_logits.size(), 1);
+        OP_REQUIRES(context, tree_logits.size() == 1,
+                    errors::Internal("Expected tree_logits.size() == 1, got ",
+                                     tree_logits.size()));
         float tree_logit = resource->GetTreeWeight(0) * tree_logits[0];
         example_debug_info.add_logits_path(tree_logit);
         int32_t node_id = 0;
@@ -372,7 +410,10 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
             node_id =
                 resource->next_node(tree_id, node_id, i, bucketized_features);
             const auto& tree_logits = resource->node_value(tree_id, node_id);
-            DCHECK_EQ(tree_logits.size(), 1);
+            OP_REQUIRES(
+                context, tree_logits.size() == 1,
+                errors::Internal("Expected tree_logits.size() == 1, got ",
+                                 tree_logits.size()));
             tree_logit = resource->GetTreeWeight(tree_id) * tree_logits[0];
             // Output logit incorporates sum of leaf logits from prior trees.
             example_debug_info.add_logits_path(tree_logit + past_trees_logit);
diff --git a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc
index acf926f684d024..f14ec5ee2c552c 100644
--- a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc
@@ -98,6 +98,9 @@ class BoostedTreesCreateQuantileStreamResourceOp : public OpKernel {
   explicit BoostedTreesCreateQuantileStreamResourceOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr(kMaxElementsName, &max_elements_));
   }
 
@@ -108,6 +111,10 @@ class BoostedTreesCreateQuantileStreamResourceOp : public OpKernel {
     // disallowed.
     const Tensor* epsilon_t;
     OP_REQUIRES_OK(context, context->input(kEpsilonName, &epsilon_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(epsilon_t->shape()),
+                errors::InvalidArgument(
+                    "epsilon must be a scalar, got a tensor of shape ",
+                    epsilon_t->shape().DebugString()));
     float epsilon = epsilon_t->scalar<float>()();
     OP_REQUIRES(
         context, epsilon > 0,
@@ -115,6 +122,10 @@ class BoostedTreesCreateQuantileStreamResourceOp : public OpKernel {
 
     const Tensor* num_streams_t;
     OP_REQUIRES_OK(context, context->input(kNumStreamsName, &num_streams_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_streams_t->shape()),
+                errors::InvalidArgument(
+                    "num_streams must be a scalar, got a tensor of shape ",
+                    num_streams_t->shape().DebugString()));
     int64_t num_streams = num_streams_t->scalar<int64_t>()();
     OP_REQUIRES(context, num_streams >= 0,
                 errors::InvalidArgument(
@@ -143,6 +154,9 @@ class BoostedTreesMakeQuantileSummariesOp : public OpKernel {
   explicit BoostedTreesMakeQuantileSummariesOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr(kNumFeaturesName, &num_features_));
   }
 
@@ -156,7 +170,8 @@ class BoostedTreesMakeQuantileSummariesOp : public OpKernel {
     const Tensor* example_weights_t;
     OP_REQUIRES_OK(context,
                    context->input(kExampleWeightsName, &example_weights_t));
-    DCHECK(float_features_list.size() > 0) << "Got empty feature list";
+    OP_REQUIRES(context, float_features_list.size() > 0,
+                errors::Internal("Got empty feature list"));
     auto example_weights = example_weights_t->flat<float>();
     const int64_t weight_size = example_weights.size();
     const int64_t batch_size = float_features_list[0].flat<float>().size();
@@ -166,6 +181,10 @@ class BoostedTreesMakeQuantileSummariesOp : public OpKernel {
             "Weights should be a single value or same size as features.")));
     const Tensor* epsilon_t;
     OP_REQUIRES_OK(context, context->input(kEpsilonName, &epsilon_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(epsilon_t->shape()),
+                errors::InvalidArgument(
+                    "epsilon must be a scalar, got a tensor of shape ",
+                    epsilon_t->shape().DebugString()));
     float epsilon = epsilon_t->scalar<float>()();
 
     OpOutputList summaries_output_list;
@@ -224,6 +243,9 @@ class BoostedTreesFlushQuantileSummariesOp : public OpKernel {
   explicit BoostedTreesFlushQuantileSummariesOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr(kNumFeaturesName, &num_features_));
   }
 
@@ -284,7 +306,11 @@ class BoostedTreesQuantileStreamResourceAddSummariesOp : public OpKernel {
  public:
   explicit BoostedTreesQuantileStreamResourceAddSummariesOp(
       OpKernelConstruction* const context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
+  }
 
   void Compute(OpKernelContext* context) override {
     ResourceHandle handle;
@@ -299,8 +325,11 @@ class BoostedTreesQuantileStreamResourceAddSummariesOp : public OpKernel {
     OpInputList summaries_list;
     OP_REQUIRES_OK(context,
                    context->input_list(kSummariesName, &summaries_list));
-    int32_t num_streams = stream_resource->num_streams();
-    CHECK_EQ(static_cast<int>(num_streams), summaries_list.size());
+    auto num_streams = stream_resource->num_streams();
+    OP_REQUIRES(
+        context, num_streams == summaries_list.size(),
+        errors::Internal("Expected num_streams == summaries_list.size(), got ",
+                         num_streams, " vs ", summaries_list.size()));
 
     auto do_quantile_add_summary = [&](const int64_t begin, const int64_t end) {
       // Iterating all features.
@@ -315,7 +344,10 @@ class BoostedTreesQuantileStreamResourceAddSummariesOp : public OpKernel {
         const auto summary_values = summaries.matrix<float>();
         const auto& tensor_shape = summaries.shape();
         const int64_t entries_size = tensor_shape.dim_size(0);
-        CHECK_EQ(tensor_shape.dim_size(1), 4);
+        OP_REQUIRES(
+            context, tensor_shape.dim_size(1) == 4,
+            errors::Internal("Expected tensor_shape.dim_size(1) == 4, got ",
+                             tensor_shape.dim_size(1)));
         std::vector<QuantileSummaryEntry> summary_entries;
         summary_entries.reserve(entries_size);
         for (int64_t i = 0; i < entries_size; i++) {
@@ -348,6 +380,9 @@ class BoostedTreesQuantileStreamResourceDeserializeOp : public OpKernel {
   explicit BoostedTreesQuantileStreamResourceDeserializeOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr(kNumStreamsName, &num_features_));
   }
 
@@ -367,6 +402,12 @@ class BoostedTreesQuantileStreamResourceDeserializeOp : public OpKernel {
       // Iterating over all streams.
       for (int64_t stream_idx = begin; stream_idx < end; stream_idx++) {
         const Tensor& bucket_boundaries_t = bucket_boundaries_list[stream_idx];
+        OP_REQUIRES(
+            context, TensorShapeUtils::IsVector(bucket_boundaries_t.shape()),
+            errors::InvalidArgument("bucket boundaries for each stream must be "
+                                    "a vector, received shape ",
+                                    bucket_boundaries_t.shape().DebugString(),
+                                    " for stream ", stream_idx));
         const auto& bucket_boundaries = bucket_boundaries_t.vec<float>();
         std::vector<float> result;
         result.reserve(bucket_boundaries.size());
@@ -398,6 +439,9 @@ class BoostedTreesQuantileStreamResourceFlushOp : public OpKernel {
   explicit BoostedTreesQuantileStreamResourceFlushOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context,
                    context->GetAttr(kGenerateQuantiles, &generate_quantiles_));
   }
@@ -414,6 +458,10 @@ class BoostedTreesQuantileStreamResourceFlushOp : public OpKernel {
 
     const Tensor* num_buckets_t;
     OP_REQUIRES_OK(context, context->input(kNumBucketsName, &num_buckets_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(num_buckets_t->shape()),
+                errors::InvalidArgument(
+                    "num_buckets must be a scalar, got a tensor of shape ",
+                    num_buckets_t->shape().DebugString()));
     const int64_t num_buckets = num_buckets_t->scalar<int64_t>()();
     const int64_t num_streams = stream_resource->num_streams();
 
@@ -454,6 +502,9 @@ class BoostedTreesQuantileStreamResourceGetBucketBoundariesOp
   explicit BoostedTreesQuantileStreamResourceGetBucketBoundariesOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr(kNumFeaturesName, &num_features_));
   }
 
@@ -468,7 +519,9 @@ class BoostedTreesQuantileStreamResourceGetBucketBoundariesOp
     mutex_lock l(*stream_resource->mutex());
 
     const int64_t num_streams = stream_resource->num_streams();
-    CHECK_EQ(num_features_, num_streams);
+    OP_REQUIRES(context, num_streams == num_features_,
+                errors::Internal("Expected num_streams == num_features_, got ",
+                                 num_streams, " vs ", num_features_));
     OpOutputList bucket_boundaries_list;
     OP_REQUIRES_OK(context, context->output_list(kBucketBoundariesName,
                                                  &bucket_boundaries_list));
@@ -512,6 +565,9 @@ class BoostedTreesBucketizeOp : public OpKernel {
  public:
   explicit BoostedTreesBucketizeOp(OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr(kNumFeaturesName, &num_features_));
   }
 
diff --git a/tensorflow/core/kernels/boosted_trees/resource_ops.cc b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
index 435c7d2880e2f5..f9e1a9a01ddcd3 100644
--- a/tensorflow/core/kernels/boosted_trees/resource_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/resource_ops.cc
@@ -36,18 +36,32 @@ REGISTER_KERNEL_BUILDER(
 class BoostedTreesCreateEnsembleOp : public OpKernel {
  public:
   explicit BoostedTreesCreateEnsembleOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
+  }
 
   void Compute(OpKernelContext* context) override {
     // Get the stamp token.
     const Tensor* stamp_token_t;
     OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(stamp_token_t->shape()),
+                errors::InvalidArgument(
+                    "stamp_token must be a scalar, got a tensor of shape ",
+                    stamp_token_t->shape().DebugString()));
     int64_t stamp_token = stamp_token_t->scalar<int64_t>()();
 
     // Get the tree ensemble proto.
     const Tensor* tree_ensemble_serialized_t;
     OP_REQUIRES_OK(context, context->input("tree_ensemble_serialized",
                                            &tree_ensemble_serialized_t));
+    OP_REQUIRES(
+        context,
+        TensorShapeUtils::IsScalar(tree_ensemble_serialized_t->shape()),
+        errors::InvalidArgument(
+            "tree_ensemble_serialized must be a scalar, got a tensor of shape ",
+            tree_ensemble_serialized_t->shape().DebugString()));
     std::unique_ptr<BoostedTreesEnsembleResource> result(
         new BoostedTreesEnsembleResource());
     if (!result->InitFromSerialized(
@@ -76,7 +90,11 @@ REGISTER_KERNEL_BUILDER(Name("BoostedTreesCreateEnsemble").Device(DEVICE_CPU),
 class BoostedTreesGetEnsembleStatesOp : public OpKernel {
  public:
   explicit BoostedTreesGetEnsembleStatesOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
+  }
 
   void Compute(OpKernelContext* context) override {
     // Looks up the resource.
@@ -139,7 +157,11 @@ REGISTER_KERNEL_BUILDER(
 class BoostedTreesSerializeEnsembleOp : public OpKernel {
  public:
   explicit BoostedTreesSerializeEnsembleOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
+  }
 
   void Compute(OpKernelContext* context) override {
     core::RefCountPtr<BoostedTreesEnsembleResource> tree_ensemble_resource;
@@ -166,7 +188,11 @@ REGISTER_KERNEL_BUILDER(
 class BoostedTreesDeserializeEnsembleOp : public OpKernel {
  public:
   explicit BoostedTreesDeserializeEnsembleOp(OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
+  }
 
   void Compute(OpKernelContext* context) override {
     core::RefCountPtr<BoostedTreesEnsembleResource> tree_ensemble_resource;
@@ -177,12 +203,22 @@ class BoostedTreesDeserializeEnsembleOp : public OpKernel {
     // Get the stamp token.
     const Tensor* stamp_token_t;
     OP_REQUIRES_OK(context, context->input("stamp_token", &stamp_token_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(stamp_token_t->shape()),
+                errors::InvalidArgument(
+                    "stamp_token must be a scalar, got a tensor of shape ",
+                    stamp_token_t->shape().DebugString()));
     int64_t stamp_token = stamp_token_t->scalar<int64_t>()();
 
     // Get the tree ensemble proto.
     const Tensor* tree_ensemble_serialized_t;
     OP_REQUIRES_OK(context, context->input("tree_ensemble_serialized",
                                            &tree_ensemble_serialized_t));
+    OP_REQUIRES(
+        context,
+        TensorShapeUtils::IsScalar(tree_ensemble_serialized_t->shape()),
+        errors::InvalidArgument(
+            "tree_ensemble_serialized must be a scalar, got a tensor of shape ",
+            tree_ensemble_serialized_t->shape().DebugString()));
     // Deallocate all the previous objects on the resource.
     tree_ensemble_resource->Reset();
     OP_REQUIRES(
diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 4583d4554c062a..103e4b201587d8 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -45,6 +45,9 @@ class BoostedTreesCalculateBestGainsPerFeatureOp : public OpKernel {
   explicit BoostedTreesCalculateBestGainsPerFeatureOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
   }
@@ -267,6 +270,9 @@ class BoostedTreesCalculateBestFeatureSplitOp : public OpKernel {
   explicit BoostedTreesCalculateBestFeatureSplitOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr("logits_dimension", &logits_dim_));
     OP_REQUIRES_OK(context, context->GetAttr("split_type", &split_type_));
   }
@@ -622,6 +628,9 @@ class BoostedTreesCalculateBestFeatureSplitV2 : public OpKernel {
   explicit BoostedTreesCalculateBestFeatureSplitV2(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr("logits_dimension", &logits_dim_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
   }
@@ -1076,6 +1085,9 @@ class BoostedTreesSparseCalculateBestFeatureSplitOp : public OpKernel {
   explicit BoostedTreesSparseCalculateBestFeatureSplitOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     // TODO(crawles): Using logits_dim_ for multi-class split.
     OP_REQUIRES_OK(context, context->GetAttr("logits_dimension", &logits_dim_));
     // TODO(tanzheny): Using this for equality split.
@@ -1424,6 +1436,9 @@ class BoostedTreesMakeStatsSummaryOp : public OpKernel {
  public:
   explicit BoostedTreesMakeStatsSummaryOp(OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
     OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
     OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
@@ -1521,6 +1536,9 @@ class BoostedTreesAggregateStatsOp : public OpKernel {
  public:
   explicit BoostedTreesAggregateStatsOp(OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
     OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
   }
@@ -1758,6 +1776,9 @@ class BoostedTreesSparseAggregateStatsOp : public OpKernel {
   explicit BoostedTreesSparseAggregateStatsOp(
       OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
     OP_REQUIRES_OK(context, context->GetAttr("num_buckets", &num_buckets_));
   }
diff --git a/tensorflow/core/kernels/boosted_trees/training_ops.cc b/tensorflow/core/kernels/boosted_trees/training_ops.cc
index 3115d1056c3965..8a10741ae5f707 100644
--- a/tensorflow/core/kernels/boosted_trees/training_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/training_ops.cc
@@ -35,6 +35,9 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
  public:
   explicit BoostedTreesUpdateEnsembleOp(OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr("num_features", &num_features_));
 
     int32_t pruning_index;
@@ -68,14 +71,26 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
 
     const Tensor* feature_ids_t;
     OP_REQUIRES_OK(context, context->input("feature_ids", &feature_ids_t));
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsVector(feature_ids_t->shape()),
+        errors::InvalidArgument("feature_ids must be a vector, received shape ",
+                                feature_ids_t->shape().DebugString()));
     const auto feature_ids = feature_ids_t->vec<int32>();
 
     const Tensor* max_depth_t;
     OP_REQUIRES_OK(context, context->input("max_depth", &max_depth_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(max_depth_t->shape()),
+                errors::InvalidArgument(
+                    "max_depth must be a scalar, got a tensor of shape ",
+                    max_depth_t->shape().DebugString()));
     const auto max_depth = max_depth_t->scalar<int32>()();
 
     const Tensor* learning_rate_t;
     OP_REQUIRES_OK(context, context->input("learning_rate", &learning_rate_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(learning_rate_t->shape()),
+                errors::InvalidArgument(
+                    "learning_rate must be a scalar, got a tensor of shape ",
+                    learning_rate_t->shape().DebugString()));
     const auto learning_rate = learning_rate_t->scalar<float>()();
     // Op does not support multi-class, the V2 op below does however.
     int32_t logits_dimension = 1;
@@ -176,11 +191,50 @@ class BoostedTreesUpdateEnsembleOp : public OpKernel {
       std::map<int32, boosted_trees::SplitCandidate>* best_split_per_node) {
     // Find best split per node going through every feature candidate.
     for (int64_t feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
+      OP_REQUIRES(
+          context,
+          TensorShapeUtils::IsVector(node_ids_list[feature_idx].shape()),
+          errors::InvalidArgument(
+              "Each node_id in node_ids_list must be a vector, received shape ",
+              node_ids_list[feature_idx].shape().DebugString(), " at index ",
+              feature_idx));
       const auto& node_ids = node_ids_list[feature_idx].vec<int32>();
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsVector(gains_list[feature_idx].shape()),
+          errors::InvalidArgument(
+              "Each gain in gains_list must be a vector, received shape ",
+              gains_list[feature_idx].shape().DebugString(), " at index ",
+              feature_idx));
       const auto& gains = gains_list[feature_idx].vec<float>();
+      OP_REQUIRES(
+          context,
+          TensorShapeUtils::IsVector(thresholds_list[feature_idx].shape()),
+          errors::InvalidArgument(
+              "Each threshold in thresholds_list must be a vector, received "
+              "shape ",
+              thresholds_list[feature_idx].shape().DebugString(), " at index ",
+              feature_idx));
       const auto& thresholds = thresholds_list[feature_idx].vec<int32>();
+      OP_REQUIRES(
+          context,
+          TensorShapeUtils::IsMatrix(
+              left_node_contribs_list[feature_idx].shape()),
+          errors::InvalidArgument(
+              "Each left_node_contribs in left_node_contribs_list must be a "
+              "matrix, received shape ",
+              left_node_contribs_list[feature_idx].shape().DebugString(),
+              " at index ", feature_idx));
       const auto& left_node_contribs =
           left_node_contribs_list[feature_idx].matrix<float>();
+      OP_REQUIRES(
+          context,
+          TensorShapeUtils::IsMatrix(
+              right_node_contribs_list[feature_idx].shape()),
+          errors::InvalidArgument(
+              "Each right_node_contribs in right_node_contribs_list must be a "
+              "matrix, received shape ",
+              right_node_contribs_list[feature_idx].shape().DebugString(),
+              " at index ", feature_idx));
       const auto& right_node_contribs =
           right_node_contribs_list[feature_idx].matrix<float>();
 
@@ -234,6 +288,9 @@ class BoostedTreesUpdateEnsembleV2Op : public OpKernel {
  public:
   explicit BoostedTreesUpdateEnsembleV2Op(OpKernelConstruction* const context)
       : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
     OP_REQUIRES_OK(context, context->GetAttr("logits_dimension", &logits_dim_));
     OP_REQUIRES_OK(context, context->GetAttr("num_groups", &num_groups_));
   }
@@ -274,14 +331,26 @@ class BoostedTreesUpdateEnsembleV2Op : public OpKernel {
 
     const Tensor* max_depth_t;
     OP_REQUIRES_OK(context, context->input("max_depth", &max_depth_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(max_depth_t->shape()),
+                errors::InvalidArgument(
+                    "max_depth must be a scalar, got a tensor of shape ",
+                    max_depth_t->shape().DebugString()));
     const auto max_depth = max_depth_t->scalar<int32>()();
 
     const Tensor* learning_rate_t;
     OP_REQUIRES_OK(context, context->input("learning_rate", &learning_rate_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(learning_rate_t->shape()),
+                errors::InvalidArgument(
+                    "learning_rate must be a scalar, got a tensor of shape ",
+                    learning_rate_t->shape().DebugString()));
     const auto learning_rate = learning_rate_t->scalar<float>()();
 
     const Tensor* pruning_mode_t;
     OP_REQUIRES_OK(context, context->input("pruning_mode", &pruning_mode_t));
+    OP_REQUIRES(context, TensorShapeUtils::IsScalar(pruning_mode_t->shape()),
+                errors::InvalidArgument(
+                    "pruning_mode must be a scalar, got a tensor of shape ",
+                    pruning_mode_t->shape().DebugString()));
     const auto pruning_mode =
         static_cast<PruningMode>(pruning_mode_t->scalar<int32>()());
     // Find best splits for each active node.
@@ -327,7 +396,7 @@ class BoostedTreesUpdateEnsembleV2Op : public OpKernel {
       boosted_trees::SplitTypeWithDefault split_type_with_default;
       bool parsed = boosted_trees::SplitTypeWithDefault_Parse(
           split_type, &split_type_with_default);
-      DCHECK(parsed);
+      OP_REQUIRES(context, parsed, errors::Internal("Parse failed"));
       if (split_type_with_default == boosted_trees::EQUALITY_DEFAULT_RIGHT) {
         // Add equality split to the node.
         ensemble_resource->AddCategoricalSplitNode(current_tree, split_entry,
@@ -396,15 +465,75 @@ class BoostedTreesUpdateEnsembleV2Op : public OpKernel {
       std::map<int32, boosted_trees::SplitCandidate>* best_split_per_node) {
     // Find best split per node going through every feature candidate.
     for (int64_t group_idx = 0; group_idx < num_groups_; ++group_idx) {
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsVector(node_ids_list[group_idx].shape()),
+          errors::InvalidArgument(
+              "Each node_id in node_ids_list must be a vector, received shape ",
+              node_ids_list[group_idx].shape().DebugString(), " at index ",
+              group_idx));
       const auto& node_ids = node_ids_list[group_idx].vec<int32>();
+      OP_REQUIRES(
+          context, TensorShapeUtils::IsVector(gains_list[group_idx].shape()),
+          errors::InvalidArgument(
+              "Each gain in gains_list must be a vector, received shape ",
+              gains_list[group_idx].shape().DebugString(), " at index ",
+              group_idx));
       const auto& gains = gains_list[group_idx].vec<float>();
+      OP_REQUIRES(
+          context,
+          TensorShapeUtils::IsVector(feature_ids_list[group_idx].shape()),
+          errors::InvalidArgument(
+              "Each feature_id in feature_ids_lists must be a vector, received "
+              "shape ",
+              feature_ids_list[group_idx].shape().DebugString(), " at index ",
+              group_idx));
       const auto& feature_ids = feature_ids_list[group_idx].vec<int32>();
+      OP_REQUIRES(
+          context,
+          TensorShapeUtils::IsVector(thresholds_list[group_idx].shape()),
+          errors::InvalidArgument(
+              "Each threshold in thresholds_list must be a vector, received "
+              "shape ",
+              thresholds_list[group_idx].shape().DebugString(), " at index ",
+              group_idx));
       const auto& thresholds = thresholds_list[group_idx].vec<int32>();
+      OP_REQUIRES(
+          context,
+          TensorShapeUtils::IsVector(dimension_ids_list[group_idx].shape()),
+          errors::InvalidArgument(
+              "Each dimension_id in dimension_ids_list must be a vector, "
+              "received shape ",
+              dimension_ids_list[group_idx].shape().DebugString(), " at index ",
+              group_idx));
       const auto& dimension_ids = dimension_ids_list[group_idx].vec<int32>();
+      OP_REQUIRES(context,
+                  TensorShapeUtils::IsMatrix(
+                      left_node_contribs_list[group_idx].shape()),
+                  errors::InvalidArgument(
+                      "Each left_node_contribs in right_node_contribs_list "
+                      "must be a matrix, received shape ",
+                      left_node_contribs_list[group_idx].shape().DebugString(),
+                      " at index ", group_idx));
       const auto& left_node_contribs =
           left_node_contribs_list[group_idx].matrix<float>();
+      OP_REQUIRES(context,
+                  TensorShapeUtils::IsMatrix(
+                      right_node_contribs_list[group_idx].shape()),
+                  errors::InvalidArgument(
+                      "Each right_node_contribs in right_node_contribs_list "
+                      "must be a matrix, received shape ",
+                      right_node_contribs_list[group_idx].shape().DebugString(),
+                      " at index ", group_idx));
       const auto& right_node_contribs =
           right_node_contribs_list[group_idx].matrix<float>();
+      OP_REQUIRES(
+          context,
+          TensorShapeUtils::IsVector(split_types_list[group_idx].shape()),
+          errors::InvalidArgument(
+              "Each split_type in split_types_list must be a vector, received "
+              "shape ",
+              split_types_list[group_idx].shape().DebugString(), " at index ",
+              group_idx));
       const auto& split_types = split_types_list[group_idx].vec<tstring>();
 
       for (size_t candidate_idx = 0; candidate_idx < node_ids.size();
@@ -457,7 +586,11 @@ REGISTER_KERNEL_BUILDER(Name("BoostedTreesUpdateEnsembleV2").Device(DEVICE_CPU),
 class BoostedTreesCenterBiasOp : public OpKernel {
  public:
   explicit BoostedTreesCenterBiasOp(OpKernelConstruction* const context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    VLOG(1) << "Boosted Trees kernels in TF are deprecated. Please use "
+            << "TensorFlow Decision Forests instead "
+            << "(https://github.com/tensorflow/decision-forests).\n";
+  }
 
   void Compute(OpKernelContext* const context) override {
     // Get decision tree ensemble.
@@ -479,9 +612,17 @@ class BoostedTreesCenterBiasOp : public OpKernel {
     // Get the regularization options.
     const Tensor* l1_t;
     OP_REQUIRES_OK(context, context->input("l1", &l1_t));
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(l1_t->shape()),
+        errors::InvalidArgument("l1 must be a scalar, got a tensor of shape ",
+                                l1_t->shape().DebugString()));
     const auto l1 = l1_t->scalar<float>()();
     const Tensor* l2_t;
     OP_REQUIRES_OK(context, context->input("l2", &l2_t));
+    OP_REQUIRES(
+        context, TensorShapeUtils::IsScalar(l2_t->shape()),
+        errors::InvalidArgument("l2 must be a scalar, got a tensor of shape ",
+                                l2_t->shape().DebugString()));
     const auto l2 = l2_t->scalar<float>()();
 
     // For now, assume 1-dimensional weight on leaves.
@@ -489,7 +630,8 @@ class BoostedTreesCenterBiasOp : public OpKernel {
     float unused_gain;
 
     // TODO(crawles): Support multiclass.
-    DCHECK_EQ(logits_dim, 1);
+    OP_REQUIRES(context, logits_dim == 1,
+                errors::Internal("Expected logits_dim == 1, got ", logits_dim));
     Eigen::VectorXf gradients_mean(1);
     Eigen::VectorXf hessians_mean(1);
     gradients_mean[0] = mean_gradients_t->flat<float>()(0);
@@ -506,7 +648,9 @@ class BoostedTreesCenterBiasOp : public OpKernel {
       current_bias = logits;
     } else {
       const auto& current_biases = ensemble_resource->node_value(0, 0);
-      DCHECK_EQ(current_biases.size(), 1);
+      OP_REQUIRES(context, current_biases.size() == 1,
+                  errors::Internal("Expected current_biases.size() == 1, got ",
+                                   current_biases.size()));
       current_bias = current_biases[0];
       continue_centering =
           std::abs(logits / current_bias) > kMinDeltaForCenterBias;
diff --git a/tensorflow/core/kernels/broadcast_to_op.cc b/tensorflow/core/kernels/broadcast_to_op.cc
index db6f1871f471ef..df6e7226ac5677 100644
--- a/tensorflow/core/kernels/broadcast_to_op.cc
+++ b/tensorflow/core/kernels/broadcast_to_op.cc
@@ -63,10 +63,6 @@ class BroadcastToOp : public OpKernel {
 
     Tensor* output_tensor = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
-    // Handle empty case.
-    if (output_shape.num_elements() == 0) {
-      return;
-    }
 
     // Handle broadcast from Scalar.
     const Device& device = ctx->eigen_device<Device>();
@@ -76,6 +72,7 @@ class BroadcastToOp : public OpKernel {
       return;
     }
 
+    // Check whether the broadcast is valid.
     BCast bcast(BCast::FromShape(input_shape), BCast::FromShape(output_shape),
                 /*fewer_dims_optimization=*/true);
     OP_REQUIRES(ctx, bcast.IsValid(),
@@ -87,6 +84,11 @@ class BroadcastToOp : public OpKernel {
                                         input_shape, " to tensor of shape ",
                                         output_shape));
 
+    // Handle empty case.
+    if (output_shape.num_elements() == 0) {
+      return;
+    }
+
     functor::BroadcastTo<Device, T>()(device, ctx, *output_tensor, output_shape,
                                       input_tensor, input_shape, bcast);
   }
diff --git a/tensorflow/core/kernels/checkpoint_callback_manager.cc b/tensorflow/core/kernels/checkpoint_callback_manager.cc
new file mode 100644
index 00000000000000..0e1d4d0c4581a8
--- /dev/null
+++ b/tensorflow/core/kernels/checkpoint_callback_manager.cc
@@ -0,0 +1,184 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/checkpoint_callback_manager.h"
+
+#include <string>
+#include <utility>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/platform/path.h"
+#include "tensorflow/core/platform/regexp.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/platform/statusor.h"
+#include "tensorflow/core/platform/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace checkpoint {
+
+const absl::string_view kCheckpointCallbackManagerResourceName =
+    "checkpoint_callback_manager";
+
+namespace {
+
+const absl::string_view kCheckpointFileRegex = "^part-[0-9]*-of-[0-9]*$";
+const absl::string_view kCheckpointTempDirRegex = "-[0-9]*_temp$";
+const absl::string_view kCheckpointDirRegex = "-[0-9]*$";
+const absl::string_view kCheckpointTempDirSuffix = "_temp";
+
+}  // namespace
+
+//  Examples:
+//    "/foo/bar/checkpoint-1_temp/part-00000-of-00001" -->
+//        ("checkpoint-1", "/foo/bar");
+//    "/foo/bar/checkpoint-2/part-00000-of-00001" -->
+//        ("checkpoint-2", "/foo/bar");
+//    "/foo/bar/checkpoint-3" --> ("checkpoint-3", "/foo/bar");
+//    "/foo/bar"              --> NotFound error
+StatusOr<std::pair<std::string, std::string>>
+CheckpointCallbackManager::GetCheckpointIdAndPathFromPrefix(
+    absl::string_view prefix) {
+  for (absl::string_view path = prefix;; path = io::Dirname(path)) {
+    absl::string_view basename = io::Basename(path);
+
+    // Failed to find checkpoint_id
+    if (basename.empty()) break;
+
+    // Skip known checkpoint file: e.g., part-00000-of-00001
+    if (RE2::PartialMatch(basename, kCheckpointFileRegex)) continue;
+
+    // With _temp suffix: e.g., checkpoint-1_temp
+    if (RE2::PartialMatch(basename, kCheckpointTempDirRegex)) {
+      // Trim suffix, "_temp".
+      return std::make_pair(
+          std::string(basename.substr(
+              0, basename.length() - kCheckpointTempDirSuffix.length())),
+          std::string(io::Dirname(path)));
+    }
+
+    // Without _temp suffix: e.g., checkpoint-1
+    if (RE2::PartialMatch(basename, kCheckpointDirRegex)) {
+      return std::make_pair(std::string(basename),
+                            std::string(io::Dirname(path)));
+    }
+  }
+  return errors::NotFound(
+      absl::StrCat("Failed to find a checkpoint id. prefix = ", prefix));
+}
+
+Status CheckpointCallbackManager::RegisterSaveCallback(
+    absl::string_view file_extension, SaveCallback callback) {
+  return save_callbacks_.try_emplace(file_extension, std::move(callback)).second
+             ? Status::OK()
+             : errors::AlreadyExists("A callback already exists.");
+}
+
+bool CheckpointCallbackManager::DoesSaveCallbackExist(
+    absl::string_view file_extension) const {
+  return save_callbacks_.contains(file_extension);
+}
+
+Status CheckpointCallbackManager::RegisterRestoreCallback(
+    absl::string_view file_extension, RestoreCallback callback) {
+  return restore_callbacks_.try_emplace(file_extension, std::move(callback))
+                 .second
+             ? Status::OK()
+             : errors::AlreadyExists("A callback already exists.");
+}
+
+bool CheckpointCallbackManager::DoesRestoreCallbackExist(
+    absl::string_view file_extension) const {
+  return restore_callbacks_.contains(file_extension);
+}
+
+void CheckpointCallbackManager::Save(absl::string_view prefix) {
+  StatusOr<std::pair<std::string, std::string>> id_and_dir =
+      GetCheckpointIdAndPathFromPrefix(prefix);
+  if (!id_and_dir.ok()) {
+    LOG(WARNING) << id_and_dir.status();
+    return;
+  }
+
+  for (const auto& name_and_callback : save_callbacks_) {
+    const std::string file_path = io::JoinPath(
+        id_and_dir->second,
+        absl::StrCat(id_and_dir->first, ".", name_and_callback.first));
+
+    // If the file already exists, we are done.
+    if (Env::Default()->FileExists(file_path).ok()) {
+      continue;
+    }
+
+    LOG(INFO) << "Calling a save callback: file_extension = "
+              << name_and_callback.first
+              << ", checkpoint_id = " << id_and_dir->first;
+    // The callback should return a string to store.
+    StatusOr<std::string> save_content =
+        name_and_callback.second(id_and_dir->first);
+    if (!save_content.ok()) {
+      LOG(WARNING) << save_content.status();
+      continue;
+    }
+
+    Status write_status =
+        WriteStringToFile(Env::Default(), file_path, *save_content);
+    if (!write_status.ok()) {
+      LOG(WARNING) << write_status;
+    } else {
+      LOG(INFO) << "A CheckpointCallbackManager has been written to "
+                << file_path;
+    }
+  }
+}
+
+void CheckpointCallbackManager::Restore(absl::string_view prefix) {
+  StatusOr<std::pair<std::string, std::string>> id_and_dir =
+      GetCheckpointIdAndPathFromPrefix(prefix);
+  if (!id_and_dir.ok()) {
+    LOG(WARNING) << id_and_dir.status();
+    return;
+  }
+
+  for (const auto& name_and_callback : restore_callbacks_) {
+    const std::string file_path = io::JoinPath(
+        id_and_dir->second,
+        absl::StrCat(id_and_dir->first, ".", name_and_callback.first));
+    if (!Env::Default()->FileExists(file_path).ok()) {
+      continue;
+    }
+    std::string payload;
+    Status read_status = ReadFileToString(Env::Default(), file_path, &payload);
+    if (!read_status.ok()) {
+      LOG(WARNING) << "Failed to read: " << read_status;
+      continue;
+    }
+
+    LOG(INFO) << "Calling a restore callback: file_extension = "
+              << name_and_callback.first
+              << ", checkpoint_id = " << id_and_dir->first;
+    Status callback_status =
+        name_and_callback.second(id_and_dir->first, payload);
+    if (!callback_status.ok()) {
+      LOG(WARNING) << callback_status;
+    }
+  }
+}
+
+}  // namespace checkpoint
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/checkpoint_callback_manager.h b/tensorflow/core/kernels/checkpoint_callback_manager.h
new file mode 100644
index 00000000000000..5d0669effc986c
--- /dev/null
+++ b/tensorflow/core/kernels/checkpoint_callback_manager.h
@@ -0,0 +1,98 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0(the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_CHECKPOINT_CALLBACK_MANAGER_H_
+#define TENSORFLOW_CORE_KERNELS_CHECKPOINT_CALLBACK_MANAGER_H_
+
+#include <functional>
+#include <string>
+#include <utility>
+
+#include "absl/base/attributes.h"
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/resource_base.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/platform/statusor.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace checkpoint {
+
+ABSL_CONST_INIT extern const absl::string_view
+    kCheckpointCallbackManagerResourceName;
+
+// StatusOr<std::string> save_callback(absl::string_view checkpoint_id);
+using SaveCallback = std::function<StatusOr<std::string>(absl::string_view)>;
+
+// Status restore_callback(absl::string_view checkpoint_id,
+//                         absl::string_view content_from_checkpoint);
+using RestoreCallback =
+    std::function<Status(absl::string_view, absl::string_view)>;
+
+// A class to save and restore additional information for checkpointing.
+class CheckpointCallbackManager : public ResourceBase {
+ public:
+  CheckpointCallbackManager() = default;
+
+  // Not copyable or movable
+  CheckpointCallbackManager(const CheckpointCallbackManager&) = delete;
+  CheckpointCallbackManager& operator=(const CheckpointCallbackManager&) =
+      delete;
+
+  std::string DebugString() const override {
+    return "CheckpointCallbackManager";
+  }
+
+  // Infers a checkpoint id and directory from a prefix
+  // passed to SaveV2 / RestoreV2 Ops
+  static StatusOr<std::pair<std::string, std::string>>
+  GetCheckpointIdAndPathFromPrefix(absl::string_view prefix);
+
+  // Register a save callback.
+  // The passed callback will be triggered with an identified checkpoint id.
+  // The callback should return a string content needs to be stored
+  // as a part of a checkpoint, and then the content is stored as a file
+  // with the registered the file_extension.
+  Status RegisterSaveCallback(absl::string_view file_extension,
+                              SaveCallback callback);
+
+  // Checks if a registered save callback exists for an extension.
+  bool DoesSaveCallbackExist(absl::string_view file_extension) const;
+
+  // Register a restore callback.
+  // The passed file_extension is used to generate a file name together with
+  // an identified checkpoint_id. If the file exists, the registered callback
+  // is triggered with the content of the file.
+  Status RegisterRestoreCallback(absl::string_view file_extension,
+                                 RestoreCallback callback);
+
+  // Checks if a registered restore callback exists for an extension.
+  bool DoesRestoreCallbackExist(absl::string_view file_extension) const;
+
+  // Should be triggered from SaveV2()::Compute().
+  void Save(absl::string_view prefix);
+
+  // Should be triggered from RestoreV2()::Compute().
+  void Restore(absl::string_view prefix);
+
+ private:
+  absl::flat_hash_map<std::string, SaveCallback> save_callbacks_;
+  absl::flat_hash_map<std::string, RestoreCallback> restore_callbacks_;
+};
+
+}  // namespace checkpoint
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_CHECKPOINT_CALLBACK_MANAGER_H_
diff --git a/tensorflow/core/kernels/checkpoint_callback_manager_test.cc b/tensorflow/core/kernels/checkpoint_callback_manager_test.cc
new file mode 100644
index 00000000000000..4076dcd59e3c4d
--- /dev/null
+++ b/tensorflow/core/kernels/checkpoint_callback_manager_test.cc
@@ -0,0 +1,282 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0(the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/kernels/checkpoint_callback_manager.h"
+
+#include <string>
+#include <utility>
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/core/framework/resource_handle.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/path.h"
+#include "tensorflow/core/platform/status.h"
+#include "tensorflow/core/platform/statusor.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/managed_stack_trace.h"
+
+namespace tensorflow {
+namespace checkpoint {
+namespace {
+
+class CheckpointCallbackManagerTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    checkpoint_callback_manager_ = new CheckpointCallbackManager();
+    handle_ = ResourceHandle::MakeRefCountingHandle(
+        checkpoint_callback_manager_, "cpu", {}, {});
+  }
+
+  CheckpointCallbackManager* checkpoint_callback_manager_;
+  ResourceHandle handle_;
+};
+
+TEST_F(CheckpointCallbackManagerTest,
+       GetCheckpointIdAndPathFromPrefixWithTempDir) {
+  StatusOr<std::pair<std::string, std::string>> pair =
+      CheckpointCallbackManager::GetCheckpointIdAndPathFromPrefix(
+          "/foo/bar/model.ckpt-5_temp/part-00000-of-00001");
+  TF_ASSERT_OK(pair.status());
+  EXPECT_EQ(pair->first, "model.ckpt-5");
+  EXPECT_EQ(pair->second, "/foo/bar");
+}
+
+TEST_F(CheckpointCallbackManagerTest,
+       GetCheckpointIdAndPathFromPrefixWithPartFile) {
+  StatusOr<std::pair<std::string, std::string>> pair =
+      CheckpointCallbackManager::GetCheckpointIdAndPathFromPrefix(
+          "/foo/bar/model.ckpt-5/part-00000-of-00001");
+  TF_ASSERT_OK(pair.status());
+  EXPECT_EQ(pair->first, "model.ckpt-5");
+  EXPECT_EQ(pair->second, "/foo/bar");
+}
+
+TEST_F(CheckpointCallbackManagerTest,
+       GetCheckpointIdAndPathFromPrefixWithoutPartFile) {
+  StatusOr<std::pair<std::string, std::string>> pair =
+      CheckpointCallbackManager::GetCheckpointIdAndPathFromPrefix(
+          "/foo/bar/model.ckpt-5");
+  TF_ASSERT_OK(pair.status());
+  EXPECT_EQ(pair->first, "model.ckpt-5");
+  EXPECT_EQ(pair->second, "/foo/bar");
+}
+
+TEST_F(CheckpointCallbackManagerTest,
+       GetCheckpointIdAndPathFromPrefixUnrecognized) {
+  EXPECT_FALSE(
+      CheckpointCallbackManager::GetCheckpointIdAndPathFromPrefix("/foo/bar")
+          .ok());
+}
+
+TEST_F(CheckpointCallbackManagerTest, RegisterSaveCallbackTwice) {
+  SaveCallback first_callback = [](absl::string_view checkpoint_id) {
+    return std::string("MockString");
+  };
+
+  SaveCallback second_callback = [](absl::string_view checkpoint_id) {
+    return std::string("MockString");
+  };
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterSaveCallback(
+      "foo", std::move(first_callback)));
+
+  EXPECT_FALSE(checkpoint_callback_manager_
+                   ->RegisterSaveCallback("foo", std::move(second_callback))
+                   .ok());
+}
+
+TEST_F(CheckpointCallbackManagerTest, RegisterRestoreCallbackTwice) {
+  RestoreCallback first_callback = [](absl::string_view checkpoint_id,
+                                      absl::string_view str) {
+    return Status::OK();
+  };
+  RestoreCallback second_callback = [](absl::string_view checkpoint_id,
+                                       absl::string_view str) {
+    return Status::OK();
+  };
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterRestoreCallback(
+      "foo", std::move(first_callback)));
+
+  EXPECT_FALSE(checkpoint_callback_manager_
+                   ->RegisterRestoreCallback("foo", std::move(second_callback))
+                   .ok());
+}
+
+TEST_F(CheckpointCallbackManagerTest, DoesSaveCallbackExist) {
+  SaveCallback first_callback = [](absl::string_view checkpoint_id) {
+    return std::string("MockString");
+  };
+
+  SaveCallback second_callback = [](absl::string_view checkpoint_id) {
+    return std::string("MockString");
+  };
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterSaveCallback(
+      "foo", std::move(first_callback)));
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterSaveCallback(
+      "bar", std::move(second_callback)));
+
+  EXPECT_TRUE(checkpoint_callback_manager_->DoesSaveCallbackExist("foo"));
+  EXPECT_TRUE(checkpoint_callback_manager_->DoesSaveCallbackExist("bar"));
+  EXPECT_FALSE(
+      checkpoint_callback_manager_->DoesSaveCallbackExist("not_exist"));
+}
+
+TEST_F(CheckpointCallbackManagerTest, DoesRestoreCallbackExist) {
+  RestoreCallback first_callback = [](absl::string_view checkpoint_id,
+                                      absl::string_view str) {
+    return Status::OK();
+  };
+  RestoreCallback second_callback = [](absl::string_view checkpoint_id,
+                                       absl::string_view str) {
+    return Status::OK();
+  };
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterRestoreCallback(
+      "foo", std::move(first_callback)));
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterRestoreCallback(
+      "bar", std::move(second_callback)));
+
+  EXPECT_TRUE(checkpoint_callback_manager_->DoesRestoreCallbackExist("foo"));
+  EXPECT_TRUE(checkpoint_callback_manager_->DoesRestoreCallbackExist("bar"));
+  EXPECT_FALSE(
+      checkpoint_callback_manager_->DoesRestoreCallbackExist("not_exist"));
+}
+
+TEST_F(CheckpointCallbackManagerTest, SaveTwoCallbacks) {
+  SaveCallback save_callback1 = [](absl::string_view checkpoint_id) {
+    return absl::StrCat("MockContent1::", checkpoint_id);
+  };
+  SaveCallback save_callback2 = [](absl::string_view checkpoint_id) {
+    return absl::StrCat("MockContent2::", checkpoint_id);
+  };
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterSaveCallback(
+      "foo", std::move(save_callback1)));
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterSaveCallback(
+      "bar", std::move(save_callback2)));
+
+  checkpoint_callback_manager_->Save(io::JoinPath(
+      testing::TmpDir(), "model.ckpt-123_temp/part-00000-of-00001"));
+  std::string file_content1;
+  TF_EXPECT_OK(ReadFileToString(
+      Env::Default(), io::JoinPath(testing::TmpDir(), "model.ckpt-123.foo"),
+      &file_content1));
+  EXPECT_EQ(file_content1, "MockContent1::model.ckpt-123");
+
+  std::string file_content2;
+  TF_EXPECT_OK(ReadFileToString(
+      Env::Default(), io::JoinPath(testing::TmpDir(), "model.ckpt-123.bar"),
+      &file_content2));
+  EXPECT_EQ(file_content2, "MockContent2::model.ckpt-123");
+}
+
+TEST_F(CheckpointCallbackManagerTest, SaveMultipleTimes) {
+  SaveCallback save_callback = [](absl::string_view checkpoint_id) {
+    return absl::StrCat("MockContent::", checkpoint_id);
+  };
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterSaveCallback(
+      "foo", std::move(save_callback)));
+
+  checkpoint_callback_manager_->Save(io::JoinPath(
+      testing::TmpDir(), "model.ckpt-100_temp/part-00000-of-00001"));
+
+  checkpoint_callback_manager_->Save(io::JoinPath(
+      testing::TmpDir(), "model.ckpt-100_temp/part-00000-of-00001"));
+
+  checkpoint_callback_manager_->Save(io::JoinPath(
+      testing::TmpDir(), "model.ckpt-200_temp/part-00000-of-00001"));
+
+  std::string file_content;
+  TF_EXPECT_OK(ReadFileToString(
+      Env::Default(), io::JoinPath(testing::TmpDir(), "model.ckpt-100.foo"),
+      &file_content));
+  EXPECT_EQ(file_content, "MockContent::model.ckpt-100");
+
+  TF_EXPECT_OK(ReadFileToString(
+      Env::Default(), io::JoinPath(testing::TmpDir(), "model.ckpt-200.foo"),
+      &file_content));
+  EXPECT_EQ(file_content, "MockContent::model.ckpt-200");
+}
+
+TEST_F(CheckpointCallbackManagerTest, Restore) {
+  int callback_call_count = 0;
+  RestoreCallback restore_callback = [&callback_call_count](
+                                         absl::string_view checkpoint_id,
+                                         absl::string_view str) {
+    EXPECT_EQ(checkpoint_id, "model.ckpt-100");
+    EXPECT_EQ(str, "Apple");
+    ++callback_call_count;
+    return Status::OK();
+  };
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterRestoreCallback(
+      "foo", std::move(restore_callback)));
+
+  TF_EXPECT_OK(WriteStringToFile(
+      Env::Default(), io::JoinPath(testing::TmpDir(), "model.ckpt-100.foo"),
+      "Apple"));
+
+  EXPECT_EQ(callback_call_count, 0);
+  checkpoint_callback_manager_->Restore(
+      io::JoinPath(testing::TmpDir(), "model.ckpt-100"));
+  EXPECT_EQ(callback_call_count, 1);
+
+  checkpoint_callback_manager_->Restore(
+      io::JoinPath(testing::TmpDir(), "model.ckpt-100"));
+  EXPECT_EQ(callback_call_count, 2);
+}
+
+TEST_F(CheckpointCallbackManagerTest, SaveAndRestore) {
+  SaveCallback save_callback = [](absl::string_view checkpoint_id) {
+    return std::string("Apple");
+  };
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterSaveCallback(
+      "foo", std::move(save_callback)));
+
+  int restore_callback_count = 0;
+  RestoreCallback restore_callback = [&restore_callback_count](
+                                         absl::string_view checkpoint_id,
+                                         absl::string_view str) {
+    EXPECT_EQ(checkpoint_id, "model.ckpt-500");
+    EXPECT_EQ(str, "Apple");
+    ++restore_callback_count;
+    return Status::OK();
+  };
+
+  TF_ASSERT_OK(checkpoint_callback_manager_->RegisterRestoreCallback(
+      "foo", std::move(restore_callback)));
+
+  checkpoint_callback_manager_->Save(io::JoinPath(
+      testing::TmpDir(), "model.ckpt-500_temp/part-00000-of-00001"));
+
+  EXPECT_EQ(restore_callback_count, 0);
+  checkpoint_callback_manager_->Restore(
+      io::JoinPath(testing::TmpDir(), "model.ckpt-500"));
+  EXPECT_EQ(restore_callback_count, 1);
+}
+
+}  // namespace
+}  // namespace checkpoint
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index 859caab6061469..330337f79bac3c 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -240,8 +240,8 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/data:file_utils",
         "//tensorflow/core/data:name_utils",
+        "//tensorflow/core/data:utils",
     ],
 )
 
@@ -420,6 +420,7 @@ tf_kernel_library(
         "//tensorflow/core/data:root_dataset",
         "//tensorflow/core/data:serialization_utils",
         "//tensorflow/core/data:unbounded_thread_pool",
+        "//tensorflow/core/data:utils",
         "//tensorflow/core/kernels:ops_util",
         "//tensorflow/core/profiler/lib:traceme",
         "//tensorflow/core/profiler/lib:traceme_encode",
@@ -1218,8 +1219,8 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/data:file_utils",
         "//tensorflow/core/data:name_utils",
+        "//tensorflow/core/data:utils",
     ],
 )
 
@@ -1252,8 +1253,8 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core/data:file_utils",
         "//tensorflow/core/data:name_utils",
+        "//tensorflow/core/data:utils",
     ],
 )
 
@@ -1364,7 +1365,7 @@ filegroup(
     srcs = [
         "//tensorflow/core/data:captured_function.h",
         "//tensorflow/core/data:dataset_utils.h",
-        "//tensorflow/core/data:file_utils.h",
+        "//tensorflow/core/data:utils.h",
         "//tensorflow/core/data:name_utils.h",
         "//tensorflow/core/data:rewrite_utils.h",
         "//tensorflow/core/data:root_dataset.h",
@@ -1389,7 +1390,7 @@ filegroup(
         "//tensorflow/core/data:captured_function.cc",
         "//tensorflow/core/data:dataset_utils.cc",
         "//tensorflow/core/data:name_utils.cc",
-        "//tensorflow/core/data:file_utils.cc",
+        "//tensorflow/core/data:utils.cc",
         "//tensorflow/core/data:rewrite_utils.cc",
         "//tensorflow/core/data:root_dataset.cc",
         "//tensorflow/core/data:serialization_utils.cc",
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 234ed499b71cf9..46122d8c5f06b8 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -117,6 +117,14 @@ class BatchDatasetOp::Dataset : public DatasetBase {
     return n / batch_size_ + (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
   }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    int64_t n = input_->Cardinality(options);
+    if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+      return n;
+    }
+    return n / batch_size_ + (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+  }
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override {
     inputs->push_back(input_);
     return Status::OK();
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
index 32b008c003a331..7df213feabb844 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -97,6 +97,21 @@ class ConcatenateDatasetOp::Dataset : public DatasetBase {
     return input_cardinality_ + to_concatenate_cardinality_;
   }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    int64_t input_cardinality = input_->Cardinality(options);
+    int64_t to_concatenate_cardinality = to_concatenate_->Cardinality(options);
+
+    if (input_cardinality == kInfiniteCardinality ||
+        to_concatenate_cardinality == kInfiniteCardinality) {
+      return kInfiniteCardinality;
+    }
+    if (input_cardinality == kUnknownCardinality ||
+        to_concatenate_cardinality == kUnknownCardinality) {
+      return kUnknownCardinality;
+    }
+    return input_cardinality + to_concatenate_cardinality;
+  }
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override {
     inputs->push_back(input_);
     inputs->push_back(to_concatenate_);
diff --git a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc
index e5ad7b007ed5c7..4478e4d885cbac 100644
--- a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc
@@ -111,6 +111,41 @@ bool IsColocatedTask(const TaskInfo& task) {
     return absl::AsciiStrToUpper(worker_tag) == kColocatedWorkerTag;
   });
 }
+
+StatusOr<DataServiceMetadata> GetDataServiceMetadata(const int64_t dataset_id,
+                                                     const tstring& address,
+                                                     const tstring& protocol) {
+  DataServiceDispatcherClient client(address, protocol);
+  DataServiceMetadata metadata;
+  absl::Time deadline =
+      absl::FromUnixMicros(EnvTime::NowMicros()) + kGetMetadataRetryTimeout;
+
+  Status status = grpc_util::Retry(
+      [&]() { return client.GetDataServiceMetadata(dataset_id, metadata); },
+      absl::Substitute("Get data service metadata for dataset $0, "
+                       "with dispatcher at $1.",
+                       dataset_id, std::string(address)),
+      absl::ToUnixMicros(deadline));
+  if (errors::IsNotFound(status)) {
+    return errors::NotFound(
+        "Dataset id ", dataset_id,
+        " not found. It must be registered with `register_dataset` before "
+        "calling `from_dataset_id`.");
+  }
+  return metadata;
+}
+
+StatusOr<DataServiceMetadata::Compression> GetValidatedCompression(
+    int64_t dataset_id, const DataServiceMetadata& metadata) {
+  if (metadata.compression() == DataServiceMetadata::COMPRESSION_UNSPECIFIED) {
+    return errors::Internal(absl::Substitute(
+        "Got invalid compression $0 for dataset $1. A proper compression "
+        "should be registered in `register_dataset`.",
+        DataServiceMetadata::Compression_Name(metadata.compression()),
+        dataset_id));
+  }
+  return metadata.compression();
+}
 }  // namespace
 
 // Dataset for reading data from the tf.data service non-deterministically.
@@ -128,6 +163,7 @@ class DataServiceDatasetOp::Dataset : public DatasetBase {
           absl::optional<int64_t> num_consumers,
           int64_t max_outstanding_requests, int64_t task_refresh_interval_ms,
           const TargetWorkers target_workers,
+          const DataServiceMetadata& metadata,
           IterationCounter* iteration_counter, bool owns_resource,
           ResourceHandle iteration_counter_handle,
           std::unique_ptr<CapturedFunction> captured_uncompress_func,
@@ -147,6 +183,7 @@ class DataServiceDatasetOp::Dataset : public DatasetBase {
         max_outstanding_requests_(max_outstanding_requests),
         task_refresh_interval_ms_(task_refresh_interval_ms),
         target_workers_(target_workers),
+        metadata_(metadata),
         iteration_counter_(iteration_counter),
         owns_resource_(owns_resource),
         iteration_counter_handle_(iteration_counter_handle),
@@ -190,6 +227,25 @@ class DataServiceDatasetOp::Dataset : public DatasetBase {
       // Coordinated reads require the dataset to be infinite.
       return kInfiniteCardinality;
     }
+
+    if (metadata_.cardinality() == 0) {
+      return 0;
+    }
+
+    if (metadata_.cardinality() == kInfiniteCardinality) {
+      // Sharding may cause an infinite dataset to be empty. For example, in
+      // `range(10).batch(10, drop_remainder=True).repeat()`, inserting `shard`
+      // before `batch` will cause the dataset to be empty.
+      // This case is rare, and there is significant performance hit for dynamic
+      // sharding if it reports unknown cardinality, so it is reasonable to
+      // report infinite cardinality. For DATA sharding, it is ok to report
+      // infinite cardinality since it inserts `shard` after `repeat`.
+      if (processing_mode_.sharding_policy() == ProcessingModeDef::OFF ||
+          processing_mode_.sharding_policy() == ProcessingModeDef::DYNAMIC ||
+          processing_mode_.sharding_policy() == ProcessingModeDef::DATA) {
+        return kInfiniteCardinality;
+      }
+    }
     return kUnknownCardinality;
   }
 
@@ -1312,6 +1368,7 @@ class DataServiceDatasetOp::Dataset : public DatasetBase {
   const int64_t max_outstanding_requests_;
   const int64_t task_refresh_interval_ms_;
   const TargetWorkers target_workers_;
+  const DataServiceMetadata metadata_;
   IterationCounter* const iteration_counter_;  // Owned
   const bool owns_resource_;
   const ResourceHandle iteration_counter_handle_;
@@ -1468,12 +1525,21 @@ void DataServiceDatasetOp::MakeDataset(OpKernelContext* ctx,
       errors::InvalidArgument(kMaxOutstandingRequests, " must be positive or ",
                               model::kAutotune));
 
-  StatusOr<bool> should_uncompress =
-      ShouldUncompress(dataset_id, address, protocol);
-  OP_REQUIRES_OK(ctx, should_uncompress.status());
+  StatusOr<DataServiceMetadata> metadata =
+      GetDataServiceMetadata(dataset_id, address, protocol);
+  OP_REQUIRES_OK(ctx, metadata.status());
+
+  bool should_uncompress = op_version_ >= 3 && uncompress_;
+  if (should_uncompress) {
+    StatusOr<DataServiceMetadata::Compression> compression =
+        GetValidatedCompression(dataset_id, *metadata);
+    OP_REQUIRES_OK(ctx, compression.status());
+    should_uncompress =
+        should_uncompress && (*compression == DataServiceMetadata::SNAPPY);
+  }
   DataTypeVector data_service_output_types = output_types_;
   std::vector<PartialTensorShape> data_service_output_shapes = output_shapes_;
-  if (*should_uncompress) {
+  if (should_uncompress) {
     data_service_output_types = {DT_VARIANT};
     data_service_output_shapes = {TensorShape({})};
   }
@@ -1489,10 +1555,10 @@ void DataServiceDatasetOp::MakeDataset(OpKernelContext* ctx,
       ctx, op_version_, dataset_id, processing_mode, address, protocol,
       data_transfer_protocol_, job_name, consumer_index, num_consumers,
       max_outstanding_requests, task_refresh_interval_hint_ms_, target_workers_,
-      iteration_counter, owns_resource, iteration_counter_handle,
+      *metadata, iteration_counter, owns_resource, iteration_counter_handle,
       std::move(captured_uncompress_func), data_service_output_types,
       data_service_output_shapes);
-  if (*should_uncompress) {
+  if (should_uncompress) {
     VLOG(2) << "Inserting a ParallelMap dataset to uncompress tf.data service "
             << "dataset " << dataset_id << ".";
     captured_uncompress_func.reset();
@@ -1508,44 +1574,6 @@ void DataServiceDatasetOp::MakeDataset(OpKernelContext* ctx,
   *output = dataset;
 }
 
-StatusOr<bool> DataServiceDatasetOp::ShouldUncompress(
-    int64_t dataset_id, const tstring& address, const tstring& protocol) const {
-  if (op_version_ < 3 || !uncompress_) {
-    return false;
-  }
-
-  DataServiceDispatcherClient client(address, protocol);
-  DataServiceMetadata data_service_metadata;
-  absl::Time deadline =
-      absl::FromUnixMicros(EnvTime::NowMicros()) + kGetMetadataRetryTimeout;
-
-  Status status = grpc_util::Retry(
-      [&]() {
-        return client.GetDataServiceMetadata(dataset_id, data_service_metadata);
-      },
-      absl::Substitute("Get data service metadata for dataset $0, "
-                       "with dispatcher at $1.",
-                       dataset_id, std::string(address)),
-      absl::ToUnixMicros(deadline));
-  if (errors::IsNotFound(status)) {
-    return errors::NotFound(
-        "Dataset id ", dataset_id,
-        " not found. It must be registered `register_dataset` before calling "
-        "`from_dataset_id`.");
-  }
-
-  if (data_service_metadata.compression() ==
-      DataServiceMetadata::COMPRESSION_UNSPECIFIED) {
-    return errors::Internal(absl::Substitute(
-        "Got invalid compression $0 for dataset $1. A proper compression "
-        "should be registered in `register_dataset`.",
-        DataServiceMetadata::Compression_Name(
-            data_service_metadata.compression()),
-        dataset_id));
-  }
-  return data_service_metadata.compression() == DataServiceMetadata::SNAPPY;
-}
-
 REGISTER_KERNEL_BUILDER(Name(kDataServiceDatasetV1).Device(DEVICE_CPU),
                         DataServiceDatasetOp);
 REGISTER_KERNEL_BUILDER(Name(kDataServiceDatasetV2).Device(DEVICE_CPU),
diff --git a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.h b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.h
index aee7b237cf229a..30ec2949c4a298 100644
--- a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.h
+++ b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.h
@@ -86,12 +86,6 @@ class DataServiceDatasetOp : public DatasetOpKernel {
   void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override;
 
  private:
-  // Returns if the dataset should be uncompressed. This method sends an RPC to
-  // get the metadata from the dispatcher. If any error happens, it returns a
-  // non-OK status.
-  StatusOr<bool> ShouldUncompress(int64_t dataset_id, const tstring& address,
-                                  const tstring& protocol) const;
-
   class Dataset;
   int op_version_;
   int64_t task_refresh_interval_hint_ms_;
diff --git a/tensorflow/core/kernels/data/experimental/random_access_ops.cc b/tensorflow/core/kernels/data/experimental/random_access_ops.cc
index d0a45a4845be47..59575c99b12e63 100644
--- a/tensorflow/core/kernels/data/experimental/random_access_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/random_access_ops.cc
@@ -34,7 +34,9 @@ Status GetElementAtIndexOp::DoCompute(OpKernelContext* ctx) {
   DatasetBase* finalized_dataset;
   TF_ASSIGN_OR_RETURN(finalized_dataset, GetFinalizedDataset(ctx, dataset));
 
-  int64_t cardinality = finalized_dataset->Cardinality();
+  CardinalityOptions options;
+  options.set_compute_level(CardinalityOptions::CARDINALITY_COMPUTE_MODERATE);
+  int64_t cardinality = finalized_dataset->Cardinality(options);
   if (cardinality == kInfiniteCardinality ||
       cardinality == kUnknownCardinality) {
     return tensorflow::errors::FailedPrecondition(
diff --git a/tensorflow/core/kernels/data/fixed_length_record_dataset_op.cc b/tensorflow/core/kernels/data/fixed_length_record_dataset_op.cc
index 1706d7a416ac76..93ca81fc537f32 100644
--- a/tensorflow/core/kernels/data/fixed_length_record_dataset_op.cc
+++ b/tensorflow/core/kernels/data/fixed_length_record_dataset_op.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/fixed_length_record_dataset_op.h"
 
-#include "tensorflow/core/data/file_utils.h"
 #include "tensorflow/core/data/name_utils.h"
+#include "tensorflow/core/data/utils.h"
 #include "tensorflow/core/framework/metrics.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -188,7 +188,7 @@ class FixedLengthRecordDatasetOp::Dataset : public DatasetBase {
               dataset()->record_bytes_, " bytes).");
         }
         TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
-            file_utils::TranslateFileName(next_filename), &file_));
+            TranslateFileName(next_filename), &file_));
         input_buffer_ = absl::make_unique<io::InputBuffer>(
             file_.get(), dataset()->buffer_size_);
         TF_RETURN_IF_ERROR(input_buffer_->SkipNBytes(dataset()->header_bytes_));
@@ -233,7 +233,7 @@ class FixedLengthRecordDatasetOp::Dataset : public DatasetBase {
             ctx->env()->GetFileSize(current_filename, &file_size));
         file_pos_limit_ = file_size - dataset()->footer_bytes_;
         TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
-            file_utils::TranslateFileName(current_filename), &file_));
+            TranslateFileName(current_filename), &file_));
         input_buffer_ = absl::make_unique<io::InputBuffer>(
             file_.get(), dataset()->buffer_size_);
         TF_RETURN_IF_ERROR(input_buffer_->Seek(current_pos));
@@ -352,8 +352,7 @@ class FixedLengthRecordDatasetOp::Dataset : public DatasetBase {
           }
         }
         TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
-            file_utils::TranslateFileName(
-                dataset()->filenames_[current_file_index_]),
+            TranslateFileName(dataset()->filenames_[current_file_index_]),
             &file_));
         if (!dataset()->compression_type_.empty()) {
           const io::ZlibCompressionOptions zlib_options =
@@ -417,8 +416,7 @@ class FixedLengthRecordDatasetOp::Dataset : public DatasetBase {
       file_.reset();
       if (current_pos >= 0) {  // There was an active buffered_input_stream_.
         TF_RETURN_IF_ERROR(ctx->env()->NewRandomAccessFile(
-            file_utils::TranslateFileName(
-                dataset()->filenames_[current_file_index_]),
+            TranslateFileName(dataset()->filenames_[current_file_index_]),
             &file_));
         const io::ZlibCompressionOptions zlib_options =
             dataset()->compression_type_ == kZLIB
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index de7b3b8453ecbd..fa9d81a2a70011 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/data/dataset_utils.h"
 #include "tensorflow/core/data/root_dataset.h"
 #include "tensorflow/core/data/serialization_utils.h"
+#include "tensorflow/core/data/utils.h"
 #include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/metrics.h"
@@ -154,8 +155,8 @@ Status IteratorResource::GetNext(OpKernelContext* ctx,
                                    out_tensors, end_of_sequence);
   if (collect_metrics_) {
     const uint64 end_time_us = ctx->env()->NowMicros();
-    metrics::RecordTFDataGetNextDuration(safe_sub(end_time_us, start_time_us));
-    metrics::RecordTFDataBytesFetched(GetTotalBytes(*out_tensors));
+    AddLatencySample(safe_sub(end_time_us, start_time_us));
+    IncrementThroughput(GetTotalBytes(*out_tensors));
     mutex_lock l(mu_);
     metrics::RecordTFDataIteratorLifetime(
         safe_sub(end_time_us, get_next_end_time_us_));
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index f3078d12b8816c..b88e57fff5d850 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -81,6 +81,14 @@ class MapDatasetOp::Dataset : public DatasetBase {
     }
   }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    if (preserve_cardinality_) {
+      return input_->Cardinality(options);
+    } else {
+      return kUnknownCardinality;
+    }
+  }
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override {
     inputs->push_back(input_);
     return Status::OK();
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index 2c8da8f9fe4208..b59ddc14d6160e 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -133,6 +133,14 @@ class ParallelMapDatasetOp::Dataset : public DatasetBase {
     }
   }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    if (preserve_cardinality_) {
+      return input_->Cardinality(options);
+    } else {
+      return kUnknownCardinality;
+    }
+  }
+
   Status Get(OpKernelContext* ctx, int64 index,
              std::vector<Tensor>* out_tensors) const override {
     TF_RETURN_IF_ERROR(CheckRandomAccessCompatible(index));
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 2b5099a36a22ad..33a408cd368168 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -95,6 +95,10 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
 
   int64_t CardinalityInternal() const override { return input_->Cardinality(); }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    return input_->Cardinality(options);
+  }
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override {
     inputs->push_back(input_);
     return Status::OK();
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index 7f2f452d80a390..cd4867048b9be0 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -194,6 +194,18 @@ class RangeDatasetOp::Dataset : public DatasetBase {
     }
   }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    // If start_ == stop_ or if the sign of stop_ - start_ and step do not agree
+    // (or are zero), return zero.
+    if (sgn(stop_ - start_) * sgn(step_) <= 0) {
+      return 0;
+    } else if (step_ > 0) {
+      return std::max(int64_t{0}, (stop_ - start_ - 1) / step_ + 1);
+    } else {
+      return std::max(int64_t{0}, (start_ - stop_ - 1) / -step_ + 1);
+    }
+  }
+
   Status MakeSplitProviders(std::vector<std::unique_ptr<SplitProvider>>*
                                 split_providers) const override {
     split_providers->push_back(
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
index 50c69847c895fa..5e922f280b0b51 100644
--- a/tensorflow/core/kernels/data/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -91,6 +91,23 @@ class RepeatDatasetOp::Dataset : public DatasetBase {
     return count_ * n;
   }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    int64_t n = input_->Cardinality(options);
+    if (count_ < 0) {
+      if (n == 0) {
+        return 0;
+      }
+      return kInfiniteCardinality;
+    }
+    if (count_ == 0) {
+      return 0;
+    }
+    if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+      return n;
+    }
+    return count_ * n;
+  }
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override {
     inputs->push_back(input_);
     return Status::OK();
diff --git a/tensorflow/core/kernels/data/shard_dataset_op.cc b/tensorflow/core/kernels/data/shard_dataset_op.cc
index d92e44149a8b30..a6f4a6ac6d8035 100644
--- a/tensorflow/core/kernels/data/shard_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shard_dataset_op.cc
@@ -86,6 +86,14 @@ class ShardDatasetOp::Dataset : public DatasetBase {
     return n / num_shards_ + (index_ < n % num_shards_ ? 1 : 0);
   }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    int64_t n = input_->Cardinality(options);
+    if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+      return n;
+    }
+    return n / num_shards_ + (index_ < n % num_shards_ ? 1 : 0);
+  }
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override {
     inputs->push_back(input_);
     return Status::OK();
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index df56aec8a98d5b..03afb45fe80ef4 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -119,6 +119,16 @@ class ShuffleDatasetOpBase::ShuffleDatasetBase : public DatasetBase {
     }
   }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    if (count_ == -1 || input_->Cardinality(options) == kInfiniteCardinality) {
+      return kInfiniteCardinality;
+    } else if (input_->Cardinality(options) == kUnknownCardinality) {
+      return kUnknownCardinality;
+    } else {
+      return input_->Cardinality(options) * count_;
+    }
+  }
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override {
     inputs->push_back(input_);
     return Status::OK();
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index dde0be958eaccc..2f43ee8188ac3b 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -75,6 +75,14 @@ class SkipDatasetOp::Dataset : public DatasetBase {
     return count_ < 0 ? 0 : std::max(int64_t{0}, n - count_);
   }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    int64_t n = input_->Cardinality(options);
+    if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+      return n;
+    }
+    return count_ < 0 ? 0 : std::max(int64_t{0}, n - count_);
+  }
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override {
     inputs->push_back(input_);
     return Status::OK();
diff --git a/tensorflow/core/kernels/data/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
index 7bf676c6f97a6f..e27d0946e8ed4c 100644
--- a/tensorflow/core/kernels/data/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -71,6 +71,19 @@ int64_t TakeDataset::CardinalityInternal() const {
   } else if (count_ == kInfiniteCardinality) {
     return n;
   }
+  return std::min(n, count_);
+}
+
+int64_t TakeDataset::CardinalityInternal(CardinalityOptions options) const {
+  int64_t n = input_->Cardinality(options);
+  if (n == kUnknownCardinality) {
+    return kUnknownCardinality;
+  }
+  if (n == kInfiniteCardinality) {
+    return count_;
+  } else if (count_ == kInfiniteCardinality) {
+    return n;
+  }
 
   return std::min(n, count_);
 }
diff --git a/tensorflow/core/kernels/data/take_dataset_op.h b/tensorflow/core/kernels/data/take_dataset_op.h
index a75e0861c92747..8c894c0ba13a35 100644
--- a/tensorflow/core/kernels/data/take_dataset_op.h
+++ b/tensorflow/core/kernels/data/take_dataset_op.h
@@ -40,6 +40,8 @@ class TakeDataset : public DatasetBase {
 
   int64_t CardinalityInternal() const override;
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override;
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override;
 
   Status Get(OpKernelContext* ctx, int64 index,
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index 690895c8f8f870..0f7b3d27d62537 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -74,6 +74,10 @@ class TensorDatasetOp::Dataset : public DatasetBase {
 
   int64_t CardinalityInternal() const override { return 1LL; }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    return 1LL;
+  }
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override {
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 3c8330cb081d77..7d0cbe44820186 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -85,6 +85,10 @@ class TensorSliceDatasetOp::Dataset : public DatasetBase {
     return tensors_[0].dim_size(0);
   }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    return tensors_[0].dim_size(0);
+  }
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override {
     return Status::OK();
   }
diff --git a/tensorflow/core/kernels/data/text_line_dataset_op.cc b/tensorflow/core/kernels/data/text_line_dataset_op.cc
index d2f65a79f324cf..14e8fba119c249 100644
--- a/tensorflow/core/kernels/data/text_line_dataset_op.cc
+++ b/tensorflow/core/kernels/data/text_line_dataset_op.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/text_line_dataset_op.h"
 
-#include "tensorflow/core/data/file_utils.h"
 #include "tensorflow/core/data/name_utils.h"
+#include "tensorflow/core/data/utils.h"
 #include "tensorflow/core/framework/metrics.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -191,8 +191,7 @@ class TextLineDatasetOp::Dataset : public DatasetBase {
 
       // Actually move on to next file.
       TF_RETURN_IF_ERROR(env->NewRandomAccessFile(
-          file_utils::TranslateFileName(
-              dataset()->filenames_[current_file_index_]),
+          TranslateFileName(dataset()->filenames_[current_file_index_]),
           &file_));
       input_stream_ =
           absl::make_unique<io::RandomAccessInputStream>(file_.get(), false);
diff --git a/tensorflow/core/kernels/data/tf_record_dataset_op.cc b/tensorflow/core/kernels/data/tf_record_dataset_op.cc
index a71f21299317bd..0ee168f227b18e 100644
--- a/tensorflow/core/kernels/data/tf_record_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tf_record_dataset_op.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/tf_record_dataset_op.h"
 
-#include "tensorflow/core/data/file_utils.h"
 #include "tensorflow/core/data/name_utils.h"
+#include "tensorflow/core/data/utils.h"
 #include "tensorflow/core/framework/metrics.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -248,8 +248,7 @@ class TFRecordDatasetOp::Dataset : public DatasetBase {
 
       // Actually move on to next file.
       TF_RETURN_IF_ERROR(env->NewRandomAccessFile(
-          file_utils::TranslateFileName(
-              dataset()->filenames_[current_file_index_]),
+          TranslateFileName(dataset()->filenames_[current_file_index_]),
           &file_));
       reader_ = absl::make_unique<io::SequentialRecordReader>(
           file_.get(), dataset()->options_);
diff --git a/tensorflow/core/kernels/data/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
index fce8cc908bcc1c..35ccdcc9b55871 100644
--- a/tensorflow/core/kernels/data/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -100,6 +100,21 @@ class ZipDatasetOp::Dataset : public DatasetBase {
     return result;
   }
 
+  int64_t CardinalityInternal(CardinalityOptions options) const override {
+    int64_t result = kInfiniteCardinality;
+    for (const auto& input : inputs_) {
+      int64_t n = input->Cardinality(options);
+      if (n == kUnknownCardinality) {
+        return kUnknownCardinality;
+      }
+      if (n != kInfiniteCardinality &&
+          (result == kInfiniteCardinality || n < result)) {
+        result = n;
+      }
+    }
+    return result;
+  }
+
   Status InputDatasets(std::vector<const DatasetBase*>* inputs) const override {
     for (const auto& input : inputs_) {
       inputs->push_back(input);
diff --git a/tensorflow/core/kernels/fake_quant_ops_functor.h b/tensorflow/core/kernels/fake_quant_ops_functor.h
index 045a96ac1e0e37..e6efae10553940 100644
--- a/tensorflow/core/kernels/fake_quant_ops_functor.h
+++ b/tensorflow/core/kernels/fake_quant_ops_functor.h
@@ -87,13 +87,15 @@ struct FakeQuantWithMinMaxArgsFunctor {
     float nudged_min, nudged_max, nudged_scale;
     Nudge(min, max, quant_min, quant_max, &nudged_min, &nudged_max,
           &nudged_scale);
+
     const float inv_nudged_scale = 1.0f / nudged_scale;
+    const float quant_zero = floor(-nudged_min * inv_nudged_scale + 0.5f);
 
     auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min);
     auto clamped_shifted = clamped - nudged_min;
     outputs.device(d) =
-        (clamped_shifted * inv_nudged_scale + 0.5f).floor() * nudged_scale +
-        nudged_min;
+        (clamped_shifted * inv_nudged_scale - quant_zero + 0.5f).floor() *
+        nudged_scale;
   }
 };
 
@@ -138,13 +140,17 @@ struct FakeQuantWithMinMaxVarsFunctor {
     float nudged_min, nudged_max, nudged_scale;
     Nudge(min_val, max_val, quant_min, quant_max, &nudged_min, &nudged_max,
           &nudged_scale);
+
+    const float inv_nudged_scale = 1.0f / nudged_scale;
+    const float quant_zero = floor(-nudged_min * inv_nudged_scale + 0.5f);
     const auto nudged_scale_repl = inputs.constant(nudged_scale);
+    // const auto inv_nudged_scale_repl = inputs.constant(inv_nudged_scale);
 
     const auto clamped = inputs.cwiseMin(nudged_max).cwiseMax(nudged_min);
     const auto clamped_shifted = clamped - nudged_min;
-    outputs.device(d) = (clamped_shifted / nudged_scale_repl + 0.5f).floor() *
-                            nudged_scale_repl +
-                        nudged_min;
+    outputs.device(d) =
+        (clamped_shifted / nudged_scale_repl - quant_zero + 0.5f).floor() *
+        nudged_scale_repl;
   }
 };
 
@@ -212,13 +218,17 @@ struct FakeQuantWithMinMaxVarsPerChannelFunctor {
       float nudged_min, nudged_max, nudged_scale;
       Nudge(min_val, max_val, quant_min, quant_max, &nudged_min, &nudged_max,
             &nudged_scale);
+
+      const float inv_nudged_scale = 1.0f / nudged_scale;
+      const float quant_zero = floor(-nudged_min * inv_nudged_scale + 0.5f);
+
       const auto clamped =
           inputs.chip<1>(i).cwiseMin(nudged_max).cwiseMax(nudged_min);
       const auto clamped_shifted = clamped - nudged_min;
 
       outputs.chip<1>(i).device(d) =
-          (clamped_shifted / nudged_scale + 0.5f).floor() * nudged_scale +
-          nudged_min;
+          (clamped_shifted * inv_nudged_scale - quant_zero + 0.5f).floor() *
+          nudged_scale;
     }
   }
 };
diff --git a/tensorflow/core/kernels/fake_quant_ops_test.cc b/tensorflow/core/kernels/fake_quant_ops_test.cc
index 5f62bc37ea58c0..9f832ce9915189 100644
--- a/tensorflow/core/kernels/fake_quant_ops_test.cc
+++ b/tensorflow/core/kernels/fake_quant_ops_test.cc
@@ -54,7 +54,16 @@ class QuantOpsTest : public OpsTestBase {
                                       const bool narrow_range, const float min,
                                       const float max, const TensorShape& shape,
                                       const gtl::ArraySlice<float> data,
-                                      gtl::ArraySlice<float> expected_data) {
+                                      gtl::ArraySlice<float> expected_data,
+                                      const double atol = -1.0,
+                                      const double rtol = -1.0,
+                                      const DeviceType device = DEVICE_CPU) {
+    if (device == DEVICE_GPU) {
+      SetDevice(device,
+                std::unique_ptr<tensorflow::Device>(DeviceFactory::NewDevice(
+                    "GPU", {}, "/job:a/replica:0/task:0")));
+    }
+
     TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxArgs")
                      .Input(FakeInput(DT_FLOAT))  // inputs
                      .Attr("min", min)
@@ -70,16 +79,26 @@ class QuantOpsTest : public OpsTestBase {
     TF_ASSERT_OK(RunOpKernel());
 
     Tensor* output = GetOutput(0);
+    TF_EXPECT_OK(device_->Sync());
     Tensor expected(allocator(), DT_FLOAT, shape);
     FillValues<float>(&expected, expected_data);
-    ExpectClose(expected, *output);
+    ExpectClose(expected, *output, atol, rtol);
   }
 
   void RunTestFakeQuantWithMinMaxVars(const int num_bits,
                                       const bool narrow_range, const float min,
                                       const float max, const TensorShape& shape,
                                       const gtl::ArraySlice<float> data,
-                                      gtl::ArraySlice<float> expected_data) {
+                                      gtl::ArraySlice<float> expected_data,
+                                      const double atol = -1.0,
+                                      const double rtol = -1.0,
+                                      const DeviceType device = DEVICE_CPU) {
+    if (device == DEVICE_GPU) {
+      SetDevice(device,
+                std::unique_ptr<tensorflow::Device>(DeviceFactory::NewDevice(
+                    "GPU", {}, "/job:a/replica:0/task:0")));
+    }
+
     TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVars")
                      .Input(FakeInput(DT_FLOAT))  // inputs
                      .Input(FakeInput(DT_FLOAT))  // min
@@ -101,14 +120,22 @@ class QuantOpsTest : public OpsTestBase {
     Tensor* output = GetOutput(0);
     Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3}));
     FillValues<float>(&expected, expected_data);
-    ExpectClose(expected, *output);
+    ExpectClose(expected, *output, atol, rtol);
   }
 
   void RunTestFakeQuantWithMinMaxVarsPerChannel(
       const int num_bits, const bool narrow_range,
       const TensorShape& minmax_shape, const gtl::ArraySlice<float> min,
       const gtl::ArraySlice<float> max, const TensorShape& shape,
-      const gtl::ArraySlice<float> data, gtl::ArraySlice<float> expected_data) {
+      const gtl::ArraySlice<float> data, gtl::ArraySlice<float> expected_data,
+      const double atol = -1.0, const double rtol = -1.0,
+      const DeviceType device = DEVICE_CPU) {
+    if (device == DEVICE_GPU) {
+      SetDevice(device,
+                std::unique_ptr<tensorflow::Device>(DeviceFactory::NewDevice(
+                    "GPU", {}, "/job:a/replica:0/task:0")));
+    }
+
     TF_EXPECT_OK(NodeDefBuilder("op", "FakeQuantWithMinMaxVarsPerChannel")
                      .Input(FakeInput(DT_FLOAT))  // inputs
                      .Input(FakeInput(DT_FLOAT))  // min
@@ -130,10 +157,54 @@ class QuantOpsTest : public OpsTestBase {
     Tensor* output = GetOutput(0);
     Tensor expected(allocator(), DT_FLOAT, shape);
     FillValues<float>(&expected, expected_data);
-    ExpectClose(expected, *output);
+    ExpectClose(expected, *output, atol, rtol);
   }
 };
 
+TEST_F(QuantOpsTest, WithArgsSymmetricRangeZeroInput_RegularRange) {
+  // Original quantization range: [-10, 10], scale: 20/255.
+  // Original zero point: 127.5, nudged zero point 128.0.
+  // Expected quantized values: 0.0.
+  RunTestFakeQuantWithMinMaxArgs(8, false, -10.0f, 10.0f, TensorShape({2, 3}),
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, 0.0,
+                                 0.0);
+}
+
+#if GOOGLE_CUDA
+TEST_F(QuantOpsTest, WithArgsSymmetricRangeZeroInput_RegularRange_Gpu) {
+  // Original quantization range: [-10, 10], scale: 20/255.
+  // Original zero point: 127.5, nudged zero point 128.0.
+  // Expected quantized values: 0.0.
+  RunTestFakeQuantWithMinMaxArgs(8, false, -10.0f, 10.0f, TensorShape({2, 3}),
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, 0.0, 0.0,
+                                 DEVICE_GPU);
+}
+#endif
+
+TEST_F(QuantOpsTest, WithArgsSymmetricRangeZeroInput_NarrowRange) {
+  // Original quantization range: [-10, 10], scale: 20/254.
+  // Original zero point: 128., no nudging necessary.
+  // Expected quantized values: 0.0.
+  RunTestFakeQuantWithMinMaxArgs(8, true, -10.0f, 10.0f, TensorShape({2, 3}),
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, 0.0,
+                                 0.0);
+}
+
+#if GOOGLE_CUDA
+TEST_F(QuantOpsTest, WithArgsSymmetricRangeZeroInput_NarrowRange_Gpu) {
+  // Original quantization range: [-10, 10], scale: 20/254.
+  // Original zero point: 128., no nudging necessary.
+  // Expected quantized values: 0.0.
+  RunTestFakeQuantWithMinMaxArgs(8, true, -10.0f, 10.0f, TensorShape({2, 3}),
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, 0.0, 0.0,
+                                 DEVICE_GPU);
+}
+#endif
+
 TEST_F(QuantOpsTest, WithArgsNoNudging_RegularRange) {
   // Original quantization range: [-10 + 0 / 4, -10 + 255 / 4], scale: 1/4.
   // Original zero point: 40, no nudging necessary.
@@ -481,6 +552,50 @@ TEST_F(QuantOpsTest, WithVars_ZeroMinAndMax) {
                                  {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f});
 }
 
+TEST_F(QuantOpsTest, WithVarsSymmetricRangeZeroInput_RegularRange) {
+  // Original quantization range: [-10, 10], scale: 20/255.
+  // Original zero point: 127.5, nudged zero point 128.
+  // Expected quantized values: 0.
+  RunTestFakeQuantWithMinMaxVars(8, false, -10.0f, 10.0f, TensorShape({2, 3}),
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, 0.0,
+                                 0.0);
+}
+
+#if GOOGLE_CUDA
+TEST_F(QuantOpsTest, WithVarsSymmetricRangeZeroInput_RegularRange_Gpu) {
+  // Original quantization range: [-10, 10], scale: 20/255.
+  // Original zero point: 127.5, nudged zero point 128.
+  // Expected quantized values: 0.
+  RunTestFakeQuantWithMinMaxVars(8, false, -10.0f, 10.0f, TensorShape({2, 3}),
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, 0.0, 0.0,
+                                 DEVICE_GPU);
+}
+#endif
+
+TEST_F(QuantOpsTest, WithVarsSymmetricRangeZeroInput_NarrowRange) {
+  // Original quantization range: [-10, 10], scale: 20/254.
+  // Original zero point: 128., no nudging necessary.
+  // Expected quantized values: 0.
+  RunTestFakeQuantWithMinMaxVars(8, true, -10.0f, 10.0f, TensorShape({2, 3}),
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, 0.0,
+                                 0.0);
+}
+
+#if GOOGLE_CUDA
+TEST_F(QuantOpsTest, WithVarsSymmetricRangeZeroInput_NarrowRange_Gpu) {
+  // Original quantization range: [-10, 10], scale: 20/254.
+  // Original zero point: 128., no nudging necessary.
+  // Expected quantized values: 0.
+  RunTestFakeQuantWithMinMaxVars(8, true, -10.0f, 10.0f, TensorShape({2, 3}),
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+                                 {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, 0.0, 0.0,
+                                 DEVICE_GPU);
+}
+#endif
+
 TEST_F(QuantOpsTest, WithVarsNoNudging_RegularRange) {
   // Original quantization range: [-10 + 0 / 4, -10 + 255 / 4], scale: 1/4.
   // Original zero point: 40, no nudging necessary.
@@ -868,6 +983,52 @@ TEST_F(QuantOpsTest, WithVarsPerChannel_ZeroMinAndMax) {
       {0.0f, 0.0f, 0.0f, 0.0f});
 }
 
+TEST_F(QuantOpsTest, WithVarsPerChannelSymmetricRangeZeroInput_RegularRange) {
+  // Original quantization range: [-10, 10], scale: 20/255.
+  // Original zero point: 127.5, nudged zero point 128.0.
+  // Expected quantized values: 0.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, false, TensorShape({4}), {-10.0f, -10.0f, -10.0f, -10.0f},
+      {10.0f, 10.0f, 10.0f, 10.0f}, TensorShape({4}), {0.0f, 0.0f, 0.0f, 0.0f},
+      {0.0f, 0.0f, 0.0f, 0.0f}, 0.0, 0.0);
+}
+
+#if GOOGLE_CUDA
+TEST_F(QuantOpsTest,
+       WithVarsPerChannelSymmetricRangeZeroInput_RegularRange_Gpu) {
+  // Original quantization range: [-10, 10], scale: 20/255.
+  // Original zero point: 127.5, nudged zero point 128.0.
+  // Expected quantized values: 0.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, false, TensorShape({4}), {-10.0f, -10.0f, -10.0f, -10.0f},
+      {10.0f, 10.0f, 10.0f, 10.0f}, TensorShape({4}), {0.0f, 0.0f, 0.0f, 0.0f},
+      {0.0f, 0.0f, 0.0f, 0.0f}, 0.0, 0.0, DEVICE_GPU);
+}
+#endif
+
+TEST_F(QuantOpsTest, WithVarsPerChannelSymmetricRangeZeroInput_NarrowRange) {
+  // Original quantization range: [-10, 10], scale: 20/254.
+  // Original zero point: 128.0, no nudging necessary.
+  // Expected quantized values: 0.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, true, TensorShape({4}), {-10.0f, -10.0f, -10.0f, -10.0f},
+      {10.0f, 10.0f, 10.0f, 10.0f}, TensorShape({4}), {0.0f, 0.0f, 0.0f, 0.0f},
+      {0.0f, 0.0f, 0.0f, 0.0f}, 0.0, 0.0);
+}
+
+#if GOOGLE_CUDA
+TEST_F(QuantOpsTest,
+       WithVarsPerChannelSymmetricRangeZeroInput_NarrowRange_Gpu) {
+  // Original quantization range: [-10, 10], scale: 20/254.
+  // Original zero point: 128.0, no nudging necessary.
+  // Expected quantized values: 0.
+  RunTestFakeQuantWithMinMaxVarsPerChannel(
+      8, true, TensorShape({4}), {-10.0f, -10.0f, -10.0f, -10.0f},
+      {10.0f, 10.0f, 10.0f, 10.0f}, TensorShape({4}), {0.0f, 0.0f, 0.0f, 0.0f},
+      {0.0f, 0.0f, 0.0f, 0.0f}, 0.0, 0.0, DEVICE_GPU);
+}
+#endif
+
 TEST_F(QuantOpsTest, WithVarsPerChannelDim1NudgedDown_RegularRange) {
   // Original quantization ranges: [-0.4 / 4 + 0 / 4, -0.4 / 4 + 255 / 4].
   // Scale: 1/4,  original zero point: 0.4, nudged to 0.
diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc
index afc72287edadc1..b3e65aeaee22f8 100644
--- a/tensorflow/core/kernels/fractional_avg_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc
@@ -311,15 +311,26 @@ class FractionalAvgPoolGradOp : public OpKernel {
     for (int64_t b = 0; b < out_batch; ++b) {
       for (int64_t r = 0; r < out_rows; ++r) {
         const int64_t in_row_start = row_seq_tensor_flat(r);
+
         int64_t in_row_end = overlapping_ ? row_seq_tensor_flat(r + 1)
                                           : row_seq_tensor_flat(r + 1) - 1;
         in_row_end = std::min(in_row_end, in_max_row_index);
+        OP_REQUIRES(context, in_row_start >= 0 && in_row_end >= 0,
+                    errors::InvalidArgument(
+                        "Row sequence tensor values must not be negative, got ",
+                        row_seq_tensor_flat));
+
         for (int64_t c = 0; c < out_cols; ++c) {
           const int64_t in_col_start = col_seq_tensor_flat(c);
           int64_t in_col_end = overlapping_ ? col_seq_tensor_flat(c + 1)
                                             : col_seq_tensor_flat(c + 1) - 1;
           in_col_end = std::min(in_col_end, in_max_col_index);
 
+          OP_REQUIRES(
+              context, in_col_start >= 0 && in_col_end >= 0,
+              errors::InvalidArgument(
+                  "Column sequence tensor values must not be negative, got ",
+                  col_seq_tensor_flat));
           const int64_t num_elements_in_pooling_cell =
               (in_row_end - in_row_start + 1) * (in_col_end - in_col_start + 1);
           const int64_t out_index = (b * out_rows + r) * out_cols + c;
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 881d9a3a7354d8..1b391480e02626 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -333,23 +333,24 @@ class CaseOp : public AsyncOpKernel {
 
 // TODO(drpng): remove this.
 REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_CPU), IfOp);
-REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_GPU).HostMemory("cond"),
+REGISTER_KERNEL_BUILDER(Name("_If").Device(DEVICE_DEFAULT).HostMemory("cond"),
                         IfOp);
 
 REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_CPU), IfOp);
-REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
+REGISTER_KERNEL_BUILDER(Name("If").Device(DEVICE_DEFAULT).HostMemory("cond"),
+                        IfOp);
 
 REGISTER_KERNEL_BUILDER(Name("Case").Device(DEVICE_CPU), CaseOp);
 REGISTER_KERNEL_BUILDER(
-    Name("Case").Device(DEVICE_GPU).HostMemory("branch_index"), CaseOp);
+    Name("Case").Device(DEVICE_DEFAULT).HostMemory("branch_index"), CaseOp);
 REGISTER_KERNEL_BUILDER(Name("StatelessCase").Device(DEVICE_CPU), CaseOp);
 REGISTER_KERNEL_BUILDER(
-    Name("StatelessCase").Device(DEVICE_GPU).HostMemory("branch_index"),
+    Name("StatelessCase").Device(DEVICE_DEFAULT).HostMemory("branch_index"),
     CaseOp);
 
 REGISTER_KERNEL_BUILDER(Name("StatelessIf").Device(DEVICE_CPU), IfOp);
 REGISTER_KERNEL_BUILDER(
-    Name("StatelessIf").Device(DEVICE_GPU).HostMemory("cond"), IfOp);
+    Name("StatelessIf").Device(DEVICE_DEFAULT).HostMemory("cond"), IfOp);
 
 class WhileOp : public AsyncOpKernel {
  public:
@@ -691,13 +692,13 @@ class WhileOp : public AsyncOpKernel {
 };
 // TODO(drpng): remove these.
 REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_CPU), WhileOp);
-REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_GPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("_While").Device(DEVICE_DEFAULT), WhileOp);
 
 REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_CPU), WhileOp);
-REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_GPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("While").Device(DEVICE_DEFAULT), WhileOp);
 
 REGISTER_KERNEL_BUILDER(Name("StatelessWhile").Device(DEVICE_CPU), WhileOp);
-REGISTER_KERNEL_BUILDER(Name("StatelessWhile").Device(DEVICE_GPU), WhileOp);
+REGISTER_KERNEL_BUILDER(Name("StatelessWhile").Device(DEVICE_DEFAULT), WhileOp);
 
 class ToBoolOp : public OpKernel {
  public:
@@ -848,7 +849,7 @@ class ForOp : public AsyncOpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("For").Device(DEVICE_CPU), ForOp);
 REGISTER_KERNEL_BUILDER(Name("For")
-                            .Device(DEVICE_GPU)
+                            .Device(DEVICE_DEFAULT)
                             .HostMemory("start")
                             .HostMemory("limit")
                             .HostMemory("delta"),
@@ -890,7 +891,7 @@ class FakeParamOp : public OpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_CPU), FakeParamOp);
-REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_GPU), FakeParamOp);
+REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_DEFAULT), FakeParamOp);
 
 // DeviceIndexOP returns the current device index.
 class DeviceIndexOp : public OpKernel {
@@ -921,7 +922,8 @@ class DeviceIndexOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("DeviceIndex").Device(DEVICE_CPU), DeviceIndexOp);
 REGISTER_KERNEL_BUILDER(
-    Name("DeviceIndex").Device(DEVICE_GPU).HostMemory("index"), DeviceIndexOp);
+    Name("DeviceIndex").Device(DEVICE_DEFAULT).HostMemory("index"),
+    DeviceIndexOp);
 
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/linalg/qr_op_impl.h b/tensorflow/core/kernels/linalg/qr_op_impl.h
index 5a334457338e02..cc78341f900d37 100644
--- a/tensorflow/core/kernels/linalg/qr_op_impl.h
+++ b/tensorflow/core/kernels/linalg/qr_op_impl.h
@@ -237,10 +237,9 @@ class QrOpGpu : public AsyncOpKernel {
       for (int batch = 0; batch < batch_size; ++batch) {
         OP_REQUIRES_OK_ASYNC(
             context,
-            solver->Geam(transa, transb, n,
-                         full_matrices_ ? m : min_size, &alpha,
-                         &input_transposed_reshaped(batch, 0, 0), m, &beta,
-                         dummy, n, &r_reshaped(batch, 0, 0), n),
+            solver->Geam(transa, transb, n, full_matrices_ ? m : min_size,
+                         &alpha, &input_transposed_reshaped(batch, 0, 0), m,
+                         &beta, dummy, n, &r_reshaped(batch, 0, 0), n),
             done);
       }
     }
diff --git a/tensorflow/core/kernels/map_stage_op.cc b/tensorflow/core/kernels/map_stage_op.cc
index aee53ae78d3af8..463e2a8ced5c2d 100644
--- a/tensorflow/core/kernels/map_stage_op.cc
+++ b/tensorflow/core/kernels/map_stage_op.cc
@@ -536,6 +536,11 @@ class MapStageOp : public OpKernel {
     OP_REQUIRES(ctx, key_tensor->NumElements() > 0,
                 errors::InvalidArgument("key must not be empty"));
 
+    OP_REQUIRES(ctx, key_tensor->NumElements() == 1,
+                errors::InvalidArgument(
+                    "key must be an int64 scalar, got tensor with shape: ",
+                    key_tensor->shape()));
+
     // Create copy for insertion into Staging Area
     Tensor key(*key_tensor);
 
diff --git a/tensorflow/core/kernels/quantized_pooling_ops.cc b/tensorflow/core/kernels/quantized_pooling_ops.cc
index 17df3c223676aa..b512369b3c4dd9 100644
--- a/tensorflow/core/kernels/quantized_pooling_ops.cc
+++ b/tensorflow/core/kernels/quantized_pooling_ops.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 // See docs in ../ops/nn_ops.cc.
 
+#include "tensorflow/core/framework/op_requires.h"
+#include "tensorflow/core/platform/errors.h"
 #define EIGEN_USE_THREADS
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
@@ -117,6 +119,18 @@ class QuantizedMaxPoolingOp : public MaxPoolingOp<Device, T> {
       : MaxPoolingOp<Device, T>(context) {}
 
   void Compute(OpKernelContext* context) override {
+    auto min_input_tensor = context->input(1);
+    auto max_input_tensor = context->input(2);
+    OP_REQUIRES(
+        context, min_input_tensor.NumElements() == 1,
+        errors::InvalidArgument(
+            "min_input must be a scalar float value, got tensor with shape ",
+            min_input_tensor.shape()));
+    OP_REQUIRES(
+        context, max_input_tensor.NumElements() == 1,
+        errors::InvalidArgument(
+            "max_input must be a scalar float value, got tensor with shape ",
+            max_input_tensor.shape()));
     const float min_input = context->input(1).flat<float>()(0);
     const float max_input = context->input(2).flat<float>()(0);
     MaxPoolingOp<Device, T>::Compute(context);
diff --git a/tensorflow/core/kernels/save_restore_v2_ops.cc b/tensorflow/core/kernels/save_restore_v2_ops.cc
index 809a26030b850e..ef193701b0dcbd 100644
--- a/tensorflow/core/kernels/save_restore_v2_ops.cc
+++ b/tensorflow/core/kernels/save_restore_v2_ops.cc
@@ -20,9 +20,11 @@ limitations under the License.
 
 #include "tensorflow/core/framework/bounds_check.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/checkpoint_callback_manager.h"
 #include "tensorflow/core/kernels/save_restore_tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -157,6 +159,19 @@ class SaveV2 : public OpKernel {
     }
     OP_REQUIRES_OK(context, writer.Finish());
     VLOG(1) << "Done BundleWriter, prefix_string: " << prefix_string;
+
+    ResourceMgr* resource_manager = context->resource_manager();
+    if (resource_manager != nullptr) {
+      // Trigger callbacks if CheckpointCallbackManager exists.
+      checkpoint::CheckpointCallbackManager* checkpoint_callback_manager;
+      Status status = resource_manager->Lookup(
+          context->resource_manager()->default_container(),
+          std::string(checkpoint::kCheckpointCallbackManagerResourceName),
+          &checkpoint_callback_manager);
+      if (status.ok()) {
+        checkpoint_callback_manager->Save(prefix_string);
+      }
+    }
   }
 };
 REGISTER_KERNEL_BUILDER(Name("SaveV2").Device(DEVICE_CPU), SaveV2);
@@ -205,6 +220,19 @@ class RestoreV2 : public OpKernel {
     // If found, invokes the V2 reader.
     OP_REQUIRES_OK(context, RestoreTensorsV2(context, prefix, tensor_names,
                                              shape_and_slices, dtypes_));
+
+    ResourceMgr* resource_manager = context->resource_manager();
+    if (resource_manager != nullptr) {
+      // Trigger callbacks if CheckpointCallbackManager exists.
+      checkpoint::CheckpointCallbackManager* checkpoint_callback_manager;
+      Status status = resource_manager->Lookup(
+          context->resource_manager()->default_container(),
+          std::string(checkpoint::kCheckpointCallbackManagerResourceName),
+          &checkpoint_callback_manager);
+      if (status.ok()) {
+        checkpoint_callback_manager->Restore(prefix_string);
+      }
+    }
   }
 
  private:
diff --git a/tensorflow/core/kernels/string_ngrams_op.cc b/tensorflow/core/kernels/string_ngrams_op.cc
index 6acf846f95812f..07904fa04019ff 100644
--- a/tensorflow/core/kernels/string_ngrams_op.cc
+++ b/tensorflow/core/kernels/string_ngrams_op.cc
@@ -152,6 +152,16 @@ class StringNGramsOp : public tensorflow::OpKernel {
         // We don't have to worry about dynamic padding sizes here: if padding
         // was dynamic, every sequence would have had sufficient padding to
         // generate at least one ngram.
+
+        // If reached here, pad_width should be > 0, pad_width_ = -1,
+        // which indicates max(ngram_widths) - 1 cannot be used here since
+        // ngram_width is not known.
+        OP_REQUIRES(
+            context, pad_width_ >= 0,
+            errors::InvalidArgument("Pad width should be >= 0 when "
+                                    "preserve_short_sequences is True and "
+                                    "ngram_widths are not provided, got ",
+                                    pad_width_));
         int ngram_width = data_length + 2 * pad_width_;
         auto output_start = &ngrams_data[output_start_idx];
         int num_ngrams = 1;
diff --git a/tensorflow/core/kernels/unravel_index_op.cc b/tensorflow/core/kernels/unravel_index_op.cc
index cff04387d60247..b8524e87598ceb 100644
--- a/tensorflow/core/kernels/unravel_index_op.cc
+++ b/tensorflow/core/kernels/unravel_index_op.cc
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <cstdint>
+
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/platform/types.h"
 #define EIGEN_USE_THREADS
 
 #include "tensorflow/core/framework/op_kernel.h"
@@ -35,7 +39,8 @@ typedef Eigen::ThreadPoolDevice CPUDevice;
 template <typename Tidx>
 class UnravelIndexOp : public OpKernel {
  public:
-  explicit UnravelIndexOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  explicit UnravelIndexOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx), dtidx_(DataTypeToEnum<Tidx>::v()) {}
 
   void Compute(OpKernelContext* ctx) override {
     const Tensor& indices_tensor = ctx->input(0);
@@ -54,12 +59,31 @@ class UnravelIndexOp : public OpKernel {
 
     auto dims = dims_tensor.vec<Tidx>();
     // Make sure dims does not contain a zero
+    double prod = 1;
+    uint64_t limit;
+    if (dtidx_ == DataType::DT_INT64) {
+      limit = kint64max;
+    } else {
+      limit = kint32max;
+    }
+
     for (int i = 0; i < dims.size(); i++) {
       OP_REQUIRES(
           ctx, dims(i) != 0,
           errors::InvalidArgument("Input dims cannot contain a dim of zero, "
                                   "but dims contains zero at index ",
                                   i));
+      OP_REQUIRES(ctx, dims(i) > 0,
+                  errors::InvalidArgument(
+                      "Input dims cannot be negative. Got dim = ", dims(i),
+                      " at index ", i));
+      // Check interger overflow
+      OP_REQUIRES(
+          ctx, prod <= limit / dims(i),
+          errors::InvalidArgument("Input dims product is causing integer "
+                                  "overflow: (",
+                                  dims, ")"));
+      prod = (prod * dims(i));
     }
 
     // Check to make sure indices is not out of boundary
@@ -132,6 +156,7 @@ class UnravelIndexOp : public OpKernel {
                strides_shifted.reshape(reshape).broadcast(bcast);
     }
   }
+  const DataType dtidx_;
 };
 
 #define REGISTER_KERNEL(type)                                               \
diff --git a/tensorflow/core/kernels/while_op_test.cc b/tensorflow/core/kernels/while_op_test.cc
index e242411e3fe122..61d0dbac3d03c8 100644
--- a/tensorflow/core/kernels/while_op_test.cc
+++ b/tensorflow/core/kernels/while_op_test.cc
@@ -47,6 +47,55 @@ class WhileOpTest : public OpsTestBase {
   SP_TimerFns timer_fns_;
 };
 
+FunctionDef LessThanOrEqualToNWithCast(int64_t N) {
+  typedef FunctionDefHelper FDH;
+  const Tensor kN = test::AsScalar<int64_t>(N);
+  return FDH::Define(
+      // Name
+      "LessThanOrEqualToNWithCast",
+      // Args
+      {"x: T"},
+      // Return values
+      {"z: bool"},
+      // Attr def
+      {"T: {float, double, int32, int64}"},
+      // Nodes
+      {
+          {{"N"}, "Const", {}, {{"value", kN}, {"dtype", DT_INT64}}},
+          {{"y"}, "_HostCast", {"N"}, {{"SrcT", DT_INT64}, {"DstT", DT_INT32}}},
+          {{"x_cst"}, "_HostCast", {"x"}, {{"SrcT", "$T"}, {"DstT", DT_INT32}}},
+          {{"z"}, "LessEqual", {"x_cst", "y"}, {{"T", DT_INT32}}},
+      });
+}
+
+FunctionDef XTimesTwoWithCast() {
+  typedef FunctionDefHelper FDH;
+  const Tensor kTwo = test::AsScalar<int64_t>(2);
+  return FDH::Define(
+      // Name
+      "XTimesTwoWithCast",
+      // Args
+      {"x: T"},
+      // Return values
+      {"y: T"},
+      // Attr def
+      {"T: {float, double, int32, int64}"},
+      // Nodes
+      {
+          {{"two"}, "Const", {}, {{"value", kTwo}, {"dtype", DT_INT64}}},
+          {{"two_cst"},
+           "_HostCast",
+           {"two"},
+           {{"SrcT", DT_INT64}, {"DstT", DT_INT32}}},
+          {{"x_cst"}, "_HostCast", {"x"}, {{"SrcT", "$T"}, {"DstT", DT_INT32}}},
+          {{"y_cast"}, "Mul", {"x_cst", "two_cst"}, {{"T", DT_INT32}}},
+          {{"y"},
+           "_HostCast",
+           {"y_cast"},
+           {{"SrcT", DT_INT32}, {"DstT", "$T"}}},
+      });
+}
+
 TEST_F(WhileOpTest, WhileOpCPUBuildWithPluggableDevice) {
   const std::string platform_name = "MY_TEST";
   const std::string platform_type = "FAKE";
@@ -71,7 +120,11 @@ TEST_F(WhileOpTest, WhileOpCPUBuildWithPluggableDevice) {
   };
 
   se_.host_memory_allocate = [](const SP_Device* const device, uint64_t size) {
+#if EIGEN_MAX_ALIGN_BYTES == 0
     return malloc(size);
+#else
+    return tensorflow::port::AlignedMalloc(size, EIGEN_MAX_ALIGN_BYTES);
+#endif
   };
   se_.host_memory_deallocate = [](const SP_Device* const device, void* mem) {
     free(mem);
@@ -80,7 +133,11 @@ TEST_F(WhileOpTest, WhileOpCPUBuildWithPluggableDevice) {
   se_.allocate = [](const SP_Device* const device, uint64_t size,
                     int64_t memory_space, SP_DeviceMemoryBase* const mem) {
     mem->struct_size = SP_DEVICE_MEMORY_BASE_STRUCT_SIZE;
+#if EIGEN_MAX_ALIGN_BYTES == 0
     mem->opaque = malloc(size);
+#else
+    mem->opaque = tensorflow::port::AlignedMalloc(size, EIGEN_MAX_ALIGN_BYTES);
+#endif
     mem->size = size;
   };
   se_.deallocate = [](const SP_Device* const device,
@@ -120,8 +177,8 @@ TEST_F(WhileOpTest, WhileOpCPUBuildWithPluggableDevice) {
   OpsTestBase::SetDevice(platform_type.c_str(), std::move(plug_device));
   std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
   Scope root = Scope::NewRootScope().ExitOnError();
-  FunctionDef x_times_two = test::function::XTimesTwo();
-  FunctionDef less_than_or_eq = test::function::LessThanOrEqualToN(8);
+  FunctionDef x_times_two = XTimesTwoWithCast();
+  FunctionDef less_than_or_eq = LessThanOrEqualToNWithCast(8);
 
   FunctionDefLibrary f_lib_proto;
   *f_lib_proto.add_function() = x_times_two;
@@ -130,10 +187,10 @@ TEST_F(WhileOpTest, WhileOpCPUBuildWithPluggableDevice) {
   TF_ASSERT_OK(root.graph()->AddFunctionLibrary(f_lib_proto));
   auto a = ops::Placeholder(root.WithOpName("A"), DT_FLOAT);
   AttrValue cond_func;
-  cond_func.mutable_func()->set_name("LessThanOrEqualToN");
+  cond_func.mutable_func()->set_name("LessThanOrEqualToNWithCast");
   (*cond_func.mutable_func()->mutable_attr())["T"].set_type(DT_FLOAT);
   AttrValue body_func;
-  body_func.mutable_func()->set_name("XTimesTwo");
+  body_func.mutable_func()->set_name("XTimesTwoWithCast");
   (*body_func.mutable_func()->mutable_attr())["T"].set_type(DT_FLOAT);
 
   std::vector<NodeBuilder::NodeOut> inputs({NodeBuilder::NodeOut(a.node())});
diff --git a/tensorflow/core/ops/compat/ops_history_v2/AssertCardinalityDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/AssertCardinalityDataset.pbtxt
index f58a5ec8f4b08f..edf77a307ece00 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/AssertCardinalityDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/AssertCardinalityDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "AssertCardinalityDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "cardinality"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/AssertNextDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/AssertNextDataset.pbtxt
index aa063501094c5a..f1ebd27d543b14 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/AssertNextDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/AssertNextDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "AssertNextDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "transformations"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/AutoShardDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/AutoShardDataset.pbtxt
index cc5ca177cc0a6f..465b757c8e967b 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/AutoShardDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/AutoShardDataset.pbtxt
@@ -133,3 +133,68 @@ op {
     }
   }
 }
+op {
+  name: "AutoShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_workers"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "auto_shard_policy"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "num_replicas"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/BatchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/BatchDataset.pbtxt
index 3446b180fd47ab..39467ae1bb6d33 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/BatchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/BatchDataset.pbtxt
@@ -128,3 +128,57 @@ op {
     }
   }
 }
+op {
+  name: "BatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/BatchDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/BatchDatasetV2.pbtxt
index a60150f88b4a68..a3dc3afed0f53a 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/BatchDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/BatchDatasetV2.pbtxt
@@ -164,3 +164,68 @@ op {
     }
   }
 }
+op {
+  name: "BatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "parallel_copy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/BytesProducedStatsDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/BytesProducedStatsDataset.pbtxt
index 6ce75973e4174a..7cbfbbd146c83f 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/BytesProducedStatsDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/BytesProducedStatsDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "BytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/CSVDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/CSVDataset.pbtxt
index dd5f8f84bf8051..56e1a03ab62995 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/CSVDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/CSVDataset.pbtxt
@@ -135,3 +135,88 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CSVDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "header"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "field_delim"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "use_quote_delim"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "na_value"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "select_cols"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "output_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/CSVDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/CSVDatasetV2.pbtxt
index f4003fb938e0bb..c540e8407e8480 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/CSVDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/CSVDatasetV2.pbtxt
@@ -143,3 +143,92 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CSVDatasetV2"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "header"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "field_delim"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "use_quote_delim"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "na_value"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "select_cols"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "output_types"
+  }
+  input_arg {
+    name: "exclude_cols"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/CacheDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/CacheDataset.pbtxt
index 7387a6416490c3..8b8ec246c5be80 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/CacheDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/CacheDataset.pbtxt
@@ -128,3 +128,57 @@ op {
     }
   }
 }
+op {
+  name: "CacheDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/CacheDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/CacheDatasetV2.pbtxt
index 4175dd6b7ef718..43fe482cba6e0a 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/CacheDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/CacheDatasetV2.pbtxt
@@ -115,3 +115,62 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "CacheDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "cache"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ChooseFastestBranchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ChooseFastestBranchDataset.pbtxt
index 13882c336d58f0..4850496a3954b3 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ChooseFastestBranchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ChooseFastestBranchDataset.pbtxt
@@ -121,3 +121,81 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ChooseFastestBranchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "ratio_numerator"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "ratio_denominator"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "num_elements_per_branch"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "branches"
+    type: "list(func)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "other_arguments_lengths"
+    type: "list(int)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ChooseFastestDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ChooseFastestDataset.pbtxt
index 9fe14441e94413..476e834edde27e 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ChooseFastestDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ChooseFastestDataset.pbtxt
@@ -73,3 +73,57 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ChooseFastestDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "num_experiments"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ConcatenateDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ConcatenateDataset.pbtxt
index 0e86bf98b38964..3bdf420d07b14b 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ConcatenateDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ConcatenateDataset.pbtxt
@@ -128,3 +128,57 @@ op {
     }
   }
 }
+op {
+  name: "ConcatenateDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "another_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/DataServiceDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/DataServiceDataset.pbtxt
index b89430aa0f9277..c301511e2c9747 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/DataServiceDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/DataServiceDataset.pbtxt
@@ -260,3 +260,92 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "DataServiceDataset"
+  input_arg {
+    name: "dataset_id"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "processing_mode"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "protocol"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "job_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "max_outstanding_requests"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "iteration_counter"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "task_refresh_interval_hint_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "data_transfer_protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "target_workers"
+    type: "string"
+    default_value {
+      s: "AUTO"
+    }
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/DataServiceDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/DataServiceDatasetV2.pbtxt
index f366839ccf641e..bc4d2598e1f98f 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/DataServiceDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/DataServiceDatasetV2.pbtxt
@@ -292,3 +292,100 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "DataServiceDatasetV2"
+  input_arg {
+    name: "dataset_id"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "processing_mode"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "protocol"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "job_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "consumer_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_consumers"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "max_outstanding_requests"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "iteration_counter"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "task_refresh_interval_hint_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "data_transfer_protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "target_workers"
+    type: "string"
+    default_value {
+      s: "AUTO"
+    }
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/DataServiceDatasetV3.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/DataServiceDatasetV3.pbtxt
index d953ec2620c304..465e1d818f3a3c 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/DataServiceDatasetV3.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/DataServiceDatasetV3.pbtxt
@@ -93,3 +93,111 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "DataServiceDatasetV3"
+  input_arg {
+    name: "dataset_id"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "processing_mode"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "address"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "protocol"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "job_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "consumer_index"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_consumers"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "max_outstanding_requests"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "iteration_counter"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "task_refresh_interval_hint_ms"
+    type: "int"
+    default_value {
+      i: -1
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "data_transfer_protocol"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "target_workers"
+    type: "string"
+    default_value {
+      s: "AUTO"
+    }
+  }
+  attr {
+    name: "uncompress"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "uncompress_fn"
+    type: "func"
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/DenseToSparseBatchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/DenseToSparseBatchDataset.pbtxt
index 217f624e10569a..cb972cca8aa42f 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/DenseToSparseBatchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/DenseToSparseBatchDataset.pbtxt
@@ -67,3 +67,54 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "DenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/DirectedInterleaveDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/DirectedInterleaveDataset.pbtxt
index b98aae0e3a263b..61a9462fac3b59 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/DirectedInterleaveDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/DirectedInterleaveDataset.pbtxt
@@ -121,3 +121,64 @@ op {
     }
   }
 }
+op {
+  name: "DirectedInterleaveDataset"
+  input_arg {
+    name: "selector_input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "data_input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "stop_on_empty_dataset"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/EmptyTensorList.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/EmptyTensorList.pbtxt
index 840c88a576f681..d15fa1ad47048c 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/EmptyTensorList.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/EmptyTensorList.pbtxt
@@ -63,3 +63,42 @@ op {
     }
   }
 }
+op {
+  name: "EmptyTensorList"
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "max_num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_ARRAY
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
+      }
+    }
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalAssertNextDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalAssertNextDataset.pbtxt
index 0eb484e2cdfbf5..937afb4e0ae48f 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalAssertNextDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalAssertNextDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalAssertNextDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "transformations"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalAutoShardDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalAutoShardDataset.pbtxt
index af86d20b56a82b..d8d8d7ffc96e4e 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalAutoShardDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalAutoShardDataset.pbtxt
@@ -112,3 +112,61 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalAutoShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_workers"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "auto_shard_policy"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalBytesProducedStatsDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalBytesProducedStatsDataset.pbtxt
index c791dd69ece9d8..10555bb3ebfbf5 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalBytesProducedStatsDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalBytesProducedStatsDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalBytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalCSVDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalCSVDataset.pbtxt
index 6f7ca07d564dc4..a618d55fcb6289 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalCSVDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalCSVDataset.pbtxt
@@ -135,3 +135,88 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalCSVDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "header"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "field_delim"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "use_quote_delim"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "na_value"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "select_cols"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "record_defaults"
+    type_list_attr: "output_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalChooseFastestDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalChooseFastestDataset.pbtxt
index cb08fa28496b14..2c04d58db4c0a4 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalChooseFastestDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalChooseFastestDataset.pbtxt
@@ -116,3 +116,57 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalChooseFastestDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 2
+  }
+  attr {
+    name: "num_experiments"
+    type: "int"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalDenseToSparseBatchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalDenseToSparseBatchDataset.pbtxt
index 62ed3e72b3099b..c322ef95777609 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalDenseToSparseBatchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalDenseToSparseBatchDataset.pbtxt
@@ -99,3 +99,54 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalDenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalDirectedInterleaveDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalDirectedInterleaveDataset.pbtxt
index f372be23eb80f8..2a877497ff29fd 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalDirectedInterleaveDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalDirectedInterleaveDataset.pbtxt
@@ -73,3 +73,57 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalDirectedInterleaveDataset"
+  input_arg {
+    name: "selector_input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "data_input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalGroupByReducerDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalGroupByReducerDataset.pbtxt
index 030ae7adca6e34..5e2fd15a22c908 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalGroupByReducerDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalGroupByReducerDataset.pbtxt
@@ -157,3 +157,99 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalGroupByReducerDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "init_func_other_arguments"
+    type_list_attr: "Tinit_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "finalize_func_other_arguments"
+    type_list_attr: "Tfinalize_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "init_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tinit_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalGroupByWindowDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalGroupByWindowDataset.pbtxt
index 151d0fe46f4a63..35f9c3c1a9547d 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalGroupByWindowDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalGroupByWindowDataset.pbtxt
@@ -192,3 +192,85 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalGroupByWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalIgnoreErrorsDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalIgnoreErrorsDataset.pbtxt
index 2cba2f1172c6e1..8fc4e7d2c8adee 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalIgnoreErrorsDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalIgnoreErrorsDataset.pbtxt
@@ -88,3 +88,53 @@ op {
     }
   }
 }
+op {
+  name: "ExperimentalIgnoreErrorsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "log_warning"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalLMDBDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalLMDBDataset.pbtxt
index 55e0d4382c645a..e8b8694a947d95 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalLMDBDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalLMDBDataset.pbtxt
@@ -53,3 +53,47 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalLMDBDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalLatencyStatsDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalLatencyStatsDataset.pbtxt
index 1f6b5fb76e0855..29ba38a500c2b7 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalLatencyStatsDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalLatencyStatsDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalLatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMapAndBatchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMapAndBatchDataset.pbtxt
index dad4a1d6079320..7799116408c237 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMapAndBatchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMapAndBatchDataset.pbtxt
@@ -163,3 +163,78 @@ op {
     }
   }
 }
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMapDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMapDataset.pbtxt
index e231ebcf9cbf47..401331b716073d 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMapDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMapDataset.pbtxt
@@ -148,3 +148,73 @@ op {
     }
   }
 }
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMaxIntraOpParallelismDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMaxIntraOpParallelismDataset.pbtxt
index 84eeb2a9de59ba..109f3906b31852 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMaxIntraOpParallelismDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalMaxIntraOpParallelismDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalMaxIntraOpParallelismDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "max_intra_op_parallelism"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalNonSerializableDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalNonSerializableDataset.pbtxt
index 78bb8e0a821bbb..b0c45ac19941ce 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalNonSerializableDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalNonSerializableDataset.pbtxt
@@ -51,3 +51,46 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalNonSerializableDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalParallelInterleaveDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalParallelInterleaveDataset.pbtxt
index 04b89f3f24c60d..a90031c9dad06a 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalParallelInterleaveDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalParallelInterleaveDataset.pbtxt
@@ -117,3 +117,79 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalParallelInterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sloppy"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalParseExampleDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalParseExampleDataset.pbtxt
index 066e2bdc1f67b4..44701f67286318 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalParseExampleDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalParseExampleDataset.pbtxt
@@ -229,3 +229,100 @@ op {
     }
   }
 }
+op {
+  name: "ExperimentalParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalPrivateThreadPoolDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalPrivateThreadPoolDataset.pbtxt
index c82b9328fe9848..3098c30ba69d11 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalPrivateThreadPoolDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalPrivateThreadPoolDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalPrivateThreadPoolDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_threads"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalRandomDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalRandomDataset.pbtxt
index d62f39d8aed792..c3276a46df5b41 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalRandomDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalRandomDataset.pbtxt
@@ -61,3 +61,51 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalRandomDataset"
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalRebatchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalRebatchDataset.pbtxt
index 1061f4b84679a6..2c024741713d48 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalRebatchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalRebatchDataset.pbtxt
@@ -100,3 +100,57 @@ op {
     }
   }
 }
+op {
+  name: "ExperimentalRebatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_replicas"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_fallback"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalScanDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalScanDataset.pbtxt
index 95701ab1959b1c..39d42061ef58d6 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalScanDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalScanDataset.pbtxt
@@ -157,3 +157,76 @@ op {
     }
   }
 }
+op {
+  name: "ExperimentalScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSetStatsAggregatorDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSetStatsAggregatorDataset.pbtxt
index d58a1502039d64..291597bf11b8dc 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSetStatsAggregatorDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSetStatsAggregatorDataset.pbtxt
@@ -77,3 +77,59 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalSetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "counter_prefix"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSleepDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSleepDataset.pbtxt
index d49fdcb7025bf9..806d764d9cdf12 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSleepDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSleepDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalSleepDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "sleep_microseconds"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSlidingWindowDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSlidingWindowDataset.pbtxt
index 0021b45e514300..ab18f4e214c578 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSlidingWindowDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSlidingWindowDataset.pbtxt
@@ -75,3 +75,58 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalSlidingWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_shift"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSqlDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSqlDataset.pbtxt
index f522da3f71e080..f56ce488df0aba 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSqlDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalSqlDataset.pbtxt
@@ -69,3 +69,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalSqlDataset"
+  input_arg {
+    name: "driver_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data_source_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "query"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalTakeWhileDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalTakeWhileDataset.pbtxt
index d998468c5dd4aa..7c9b4f86adbbe4 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalTakeWhileDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalTakeWhileDataset.pbtxt
@@ -77,3 +77,59 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalTakeWhileDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalThreadPoolDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalThreadPoolDataset.pbtxt
index e3e8630979e1a2..da23c415fd24f0 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalThreadPoolDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalThreadPoolDataset.pbtxt
@@ -61,3 +61,51 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalThreadPoolDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "thread_pool"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalUnbatchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalUnbatchDataset.pbtxt
index 6491480e18f04c..83f3a39f5e9244 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalUnbatchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalUnbatchDataset.pbtxt
@@ -51,3 +51,46 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalUnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalUniqueDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalUniqueDataset.pbtxt
index 47b3841b3ff377..95668c930d7269 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ExperimentalUniqueDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ExperimentalUniqueDataset.pbtxt
@@ -51,3 +51,46 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalUniqueDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/FilterByLastComponentDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/FilterByLastComponentDataset.pbtxt
index 30dd30b59cbf9a..cf9bbc586524a9 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/FilterByLastComponentDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/FilterByLastComponentDataset.pbtxt
@@ -51,3 +51,46 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "FilterByLastComponentDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/FilterDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/FilterDataset.pbtxt
index b96af3d9b14db4..aad48d7aed4f62 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/FilterDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/FilterDataset.pbtxt
@@ -164,3 +164,66 @@ op {
     }
   }
 }
+op {
+  name: "FilterDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/FinalizeDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/FinalizeDataset.pbtxt
index 1b6a6d4c5e66f6..38e49288d662e1 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/FinalizeDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/FinalizeDataset.pbtxt
@@ -65,3 +65,53 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "FinalizeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "has_captured_ref"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/FlatMapDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/FlatMapDataset.pbtxt
index 9a7f6c4cd074fd..dcf1a7ae71c41f 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/FlatMapDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/FlatMapDataset.pbtxt
@@ -164,3 +164,66 @@ op {
     }
   }
 }
+op {
+  name: "FlatMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/GeneratorDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/GeneratorDataset.pbtxt
index 352fbab82f61f0..9f8da9c542648e 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/GeneratorDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/GeneratorDataset.pbtxt
@@ -196,3 +196,89 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "GeneratorDataset"
+  input_arg {
+    name: "init_func_other_args"
+    type_list_attr: "Tinit_func_args"
+  }
+  input_arg {
+    name: "next_func_other_args"
+    type_list_attr: "Tnext_func_args"
+  }
+  input_arg {
+    name: "finalize_func_other_args"
+    type_list_attr: "Tfinalize_func_args"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "init_func"
+    type: "func"
+  }
+  attr {
+    name: "next_func"
+    type: "func"
+  }
+  attr {
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tinit_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tnext_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/GetElementAtIndex.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/GetElementAtIndex.pbtxt
index 01782e5642601e..ae3da79559b6c1 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/GetElementAtIndex.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/GetElementAtIndex.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "GetElementAtIndex"
+  input_arg {
+    name: "dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "components"
+    type_list_attr: "output_types"
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/GroupByReducerDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/GroupByReducerDataset.pbtxt
index f8e81d104de692..320e628f8aabba 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/GroupByReducerDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/GroupByReducerDataset.pbtxt
@@ -157,3 +157,99 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "GroupByReducerDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "init_func_other_arguments"
+    type_list_attr: "Tinit_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "finalize_func_other_arguments"
+    type_list_attr: "Tfinalize_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "init_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tinit_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/GroupByWindowDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/GroupByWindowDataset.pbtxt
index ed4c8c0a2867e9..0de0de53d0e7d3 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/GroupByWindowDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/GroupByWindowDataset.pbtxt
@@ -205,3 +205,92 @@ op {
     }
   }
 }
+op {
+  name: "GroupByWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/IgnoreErrorsDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/IgnoreErrorsDataset.pbtxt
index 3bc87f93417eec..32af3bb466528e 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/IgnoreErrorsDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/IgnoreErrorsDataset.pbtxt
@@ -88,3 +88,53 @@ op {
     }
   }
 }
+op {
+  name: "IgnoreErrorsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "log_warning"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/InterleaveDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/InterleaveDataset.pbtxt
index f498f74c7d9ef0..124a84cf82fc55 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/InterleaveDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/InterleaveDataset.pbtxt
@@ -196,3 +196,74 @@ op {
     }
   }
 }
+op {
+  name: "InterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/LMDBDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/LMDBDataset.pbtxt
index fc87359a1e4aea..9ba1bd98191f8e 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/LMDBDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/LMDBDataset.pbtxt
@@ -53,3 +53,47 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "LMDBDataset"
+  input_arg {
+    name: "filenames"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/LatencyStatsDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/LatencyStatsDataset.pbtxt
index 5dec09cc8436df..546bcdcbd1233e 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/LatencyStatsDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/LatencyStatsDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "LatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/LegacyParallelInterleaveDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/LegacyParallelInterleaveDatasetV2.pbtxt
index 583259db8b006e..49f6a5574721c8 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/LegacyParallelInterleaveDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/LegacyParallelInterleaveDatasetV2.pbtxt
@@ -196,3 +196,89 @@ op {
     }
   }
 }
+op {
+  name: "LegacyParallelInterleaveDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "deterministic"
+    type: "string"
+    default_value {
+      s: "default"
+    }
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/LoadDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/LoadDataset.pbtxt
index bbba9bb66ffa08..c46f54ef3c53a1 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/LoadDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/LoadDataset.pbtxt
@@ -93,3 +93,67 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "LoadDataset"
+  input_arg {
+    name: "path"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "reader_func_other_args"
+    type_list_attr: "Treader_func_args"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "compression"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "reader_func"
+    type: "func"
+  }
+  attr {
+    name: "Treader_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/MapAndBatchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/MapAndBatchDataset.pbtxt
index 42602c1b88fe9f..8e7b6a32493801 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/MapAndBatchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/MapAndBatchDataset.pbtxt
@@ -184,3 +184,85 @@ op {
     }
   }
 }
+op {
+  name: "MapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/MapDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/MapDataset.pbtxt
index a729521ea35ec8..b01b535e48d6fd 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/MapDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/MapDataset.pbtxt
@@ -285,3 +285,80 @@ op {
     }
   }
 }
+op {
+  name: "MapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/MatrixTriangularSolve.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/MatrixTriangularSolve.pbtxt
index 1755e7d38691a9..915e582d00a0e1 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/MatrixTriangularSolve.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/MatrixTriangularSolve.pbtxt
@@ -120,3 +120,46 @@ op {
     }
   }
 }
+op {
+  name: "MatrixTriangularSolve"
+  input_arg {
+    name: "matrix"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rhs"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "lower"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "adjoint"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_HALF
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/MaxIntraOpParallelismDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/MaxIntraOpParallelismDataset.pbtxt
index 1ccfc50d6a8391..85547917acc6e3 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/MaxIntraOpParallelismDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/MaxIntraOpParallelismDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "MaxIntraOpParallelismDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "max_intra_op_parallelism"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ModelDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ModelDataset.pbtxt
index 5c5f31a963db7e..14ac940f6a7764 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ModelDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ModelDataset.pbtxt
@@ -197,3 +197,67 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ModelDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "algorithm"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "cpu_budget"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "ram_budget"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/NonSerializableDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/NonSerializableDataset.pbtxt
index 709a9df83aa29d..5fbd4bdb656c57 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/NonSerializableDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/NonSerializableDataset.pbtxt
@@ -51,3 +51,46 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "NonSerializableDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/OptimizeDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/OptimizeDataset.pbtxt
index 4d71533e246b4c..5b40b213eb5dc8 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/OptimizeDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/OptimizeDataset.pbtxt
@@ -102,3 +102,58 @@ op {
     }
   }
 }
+op {
+  name: "OptimizeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "optimizations"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "optimization_configs"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/OptimizeDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/OptimizeDatasetV2.pbtxt
index c9cd45ecea8554..7e1537ebe57a3a 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/OptimizeDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/OptimizeDatasetV2.pbtxt
@@ -91,3 +91,66 @@ op {
     }
   }
 }
+op {
+  name: "OptimizeDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "optimizations_enabled"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "optimizations_disabled"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "optimizations_default"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "optimization_configs"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/OptionsDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/OptionsDataset.pbtxt
index 25c465473d5098..fc63e5ee2951cf 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/OptionsDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/OptionsDataset.pbtxt
@@ -100,3 +100,57 @@ op {
     }
   }
 }
+op {
+  name: "OptionsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "serialized_options"
+    type: "string"
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/PaddedBatchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/PaddedBatchDataset.pbtxt
index 46aa965f652e41..a118fc102f10a2 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/PaddedBatchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/PaddedBatchDataset.pbtxt
@@ -188,3 +188,72 @@ op {
     }
   }
 }
+op {
+  name: "PaddedBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "Toutput_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "Toutput_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/PaddedBatchDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/PaddedBatchDatasetV2.pbtxt
index 267d81245c0969..4ae5a66624a65d 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/PaddedBatchDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/PaddedBatchDatasetV2.pbtxt
@@ -224,3 +224,83 @@ op {
     }
   }
 }
+op {
+  name: "PaddedBatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "Toutput_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "Toutput_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "parallel_copy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ParallelBatchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ParallelBatchDataset.pbtxt
index d39b216361a188..5c160cae2ddad7 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ParallelBatchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ParallelBatchDataset.pbtxt
@@ -243,3 +243,79 @@ op {
     }
   }
 }
+op {
+  name: "ParallelBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "parallel_copy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "deterministic"
+    type: "string"
+    default_value {
+      s: "default"
+    }
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDataset.pbtxt
index ce2ec12a98a8bf..f278cb0efc6dfb 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDataset.pbtxt
@@ -187,3 +187,86 @@ op {
     }
   }
 }
+op {
+  name: "ParallelInterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sloppy"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV2.pbtxt
index 4fe7c4d7787dc3..110573b42ed39f 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV2.pbtxt
@@ -232,3 +232,85 @@ op {
     }
   }
 }
+op {
+  name: "ParallelInterleaveDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV3.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV3.pbtxt
index 4b64d34ed9aa0a..096460fb1efcdb 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV3.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV3.pbtxt
@@ -184,3 +184,85 @@ op {
     }
   }
 }
+op {
+  name: "ParallelInterleaveDatasetV3"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "deterministic"
+    type: "string"
+    default_value {
+      s: "default"
+    }
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV4.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV4.pbtxt
index 96f3d267d60522..94f9ae0f6eebd1 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV4.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ParallelInterleaveDatasetV4.pbtxt
@@ -208,3 +208,93 @@ op {
     }
   }
 }
+op {
+  name: "ParallelInterleaveDatasetV4"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "deterministic"
+    type: "string"
+    default_value {
+      s: "default"
+    }
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ParallelMapDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ParallelMapDataset.pbtxt
index 6dc65bb28b0b7f..991e4192983c3d 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ParallelMapDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ParallelMapDataset.pbtxt
@@ -384,3 +384,91 @@ op {
     }
   }
 }
+op {
+  name: "ParallelMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ParallelMapDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ParallelMapDatasetV2.pbtxt
index db687a05492d1f..55e73b740adefd 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ParallelMapDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ParallelMapDatasetV2.pbtxt
@@ -202,3 +202,91 @@ op {
     }
   }
 }
+op {
+  name: "ParallelMapDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "deterministic"
+    type: "string"
+    default_value {
+      s: "default"
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ParseExampleDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ParseExampleDataset.pbtxt
index 09db14ef5652ee..4dc9ac1efb6cd3 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ParseExampleDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ParseExampleDataset.pbtxt
@@ -239,3 +239,140 @@ op {
     }
   }
 }
+op {
+  name: "ParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "ragged_keys"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "ragged_value_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "ragged_split_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ParseExampleDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ParseExampleDatasetV2.pbtxt
index 4a5dd5cf9568d5..59632a160b121c 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ParseExampleDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ParseExampleDatasetV2.pbtxt
@@ -239,3 +239,140 @@ op {
     }
   }
 }
+op {
+  name: "ParseExampleDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "deterministic"
+    type: "string"
+    default_value {
+      s: "default"
+    }
+  }
+  attr {
+    name: "ragged_keys"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+  }
+  attr {
+    name: "ragged_value_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "ragged_split_types"
+    type: "list(type)"
+    default_value {
+      list {
+      }
+    }
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/PrefetchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/PrefetchDataset.pbtxt
index 1a50e4522c47a3..81953a0e75c0c9 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/PrefetchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/PrefetchDataset.pbtxt
@@ -259,3 +259,78 @@ op {
     }
   }
 }
+op {
+  name: "PrefetchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "slack_period"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "legacy_autotune"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "buffer_size_min"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/PrivateThreadPoolDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/PrivateThreadPoolDataset.pbtxt
index f5385ea6aa7466..c16c1eb164728d 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/PrivateThreadPoolDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/PrivateThreadPoolDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "PrivateThreadPoolDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_threads"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/RandomDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/RandomDataset.pbtxt
index b0a303654224e7..d646d19b2e5ef2 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/RandomDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/RandomDataset.pbtxt
@@ -103,3 +103,58 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "RandomDataset"
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/RangeDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/RangeDataset.pbtxt
index de685531968ed9..3c612009a90c39 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/RangeDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/RangeDataset.pbtxt
@@ -115,3 +115,62 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "RangeDataset"
+  input_arg {
+    name: "start"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "stop"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "step"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/RebatchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/RebatchDataset.pbtxt
index 5e4872f38a8c80..176f94a3329c2b 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/RebatchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/RebatchDataset.pbtxt
@@ -73,3 +73,57 @@ op {
     }
   }
 }
+op {
+  name: "RebatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_replicas"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_fallback"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/RebatchDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/RebatchDatasetV2.pbtxt
index 90eec7f4468378..fd853b65b2685c 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/RebatchDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/RebatchDatasetV2.pbtxt
@@ -67,3 +67,54 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "RebatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_sizes"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/RepeatDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/RepeatDataset.pbtxt
index fbc1e433f746d6..b7914feb4dfbc7 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/RepeatDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/RepeatDataset.pbtxt
@@ -128,3 +128,57 @@ op {
     }
   }
 }
+op {
+  name: "RepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/SamplingDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SamplingDataset.pbtxt
index 7d29c3954d8f46..160a9e9bb16588 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SamplingDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SamplingDataset.pbtxt
@@ -75,3 +75,58 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "SamplingDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "rate"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ScanDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ScanDataset.pbtxt
index bc606cb4aa9ef3..25de8c51a7a388 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ScanDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ScanDataset.pbtxt
@@ -199,3 +199,90 @@ op {
     }
   }
 }
+op {
+  name: "ScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_default_device"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/SetStatsAggregatorDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SetStatsAggregatorDataset.pbtxt
index b74f676dc6a6a0..fa2dfd389adb8c 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SetStatsAggregatorDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SetStatsAggregatorDataset.pbtxt
@@ -77,3 +77,59 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "SetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "counter_prefix"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ShardDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ShardDataset.pbtxt
index dc8c17a72165c8..c23a4d3d2e3f98 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ShardDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ShardDataset.pbtxt
@@ -164,3 +164,68 @@ op {
     }
   }
 }
+op {
+  name: "ShardDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_shards"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "require_non_empty"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ShuffleAndRepeatDataset.pbtxt
index f8821b6bf33d41..195d66b2ab8956 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ShuffleAndRepeatDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ShuffleAndRepeatDataset.pbtxt
@@ -196,3 +196,76 @@ op {
     }
   }
 }
+op {
+  name: "ShuffleAndRepeatDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "reshuffle_each_iteration"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ShuffleAndRepeatDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ShuffleAndRepeatDatasetV2.pbtxt
index 1a0bf2fe122051..1d22404cf064e0 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ShuffleAndRepeatDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ShuffleAndRepeatDatasetV2.pbtxt
@@ -172,3 +172,81 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ShuffleAndRepeatDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed_generator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "reshuffle_each_iteration"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ShuffleDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ShuffleDataset.pbtxt
index c9b59106844e8d..35c0aa70c11696 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ShuffleDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ShuffleDataset.pbtxt
@@ -216,3 +216,72 @@ op {
     }
   }
 }
+op {
+  name: "ShuffleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "reshuffle_each_iteration"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ShuffleDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ShuffleDatasetV2.pbtxt
index 6096ae82145eef..9ec7fa282d6307 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ShuffleDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ShuffleDatasetV2.pbtxt
@@ -115,3 +115,62 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ShuffleDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed_generator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ShuffleDatasetV3.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ShuffleDatasetV3.pbtxt
index 13167b95ee0ef1..e037b818d4ffe1 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ShuffleDatasetV3.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ShuffleDatasetV3.pbtxt
@@ -160,3 +160,77 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ShuffleDatasetV3"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "buffer_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed_generator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "reshuffle_each_iteration"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/SkipDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SkipDataset.pbtxt
index b23a1e75c81230..07e0cf257f87ce 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SkipDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SkipDataset.pbtxt
@@ -128,3 +128,57 @@ op {
     }
   }
 }
+op {
+  name: "SkipDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/SleepDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SleepDataset.pbtxt
index 9559de9126a128..0a1d637995e146 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SleepDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SleepDataset.pbtxt
@@ -59,3 +59,50 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "SleepDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "sleep_microseconds"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/SlidingWindowDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SlidingWindowDataset.pbtxt
index 9c6f666c249e49..ab63899bf4fb51 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SlidingWindowDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SlidingWindowDataset.pbtxt
@@ -124,3 +124,65 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "SlidingWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_shift"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "drop_remainder"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/SnapshotDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SnapshotDataset.pbtxt
index 7060010b8483c2..6d9002761ae02f 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SnapshotDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SnapshotDataset.pbtxt
@@ -255,3 +255,148 @@ op {
     }
   }
 }
+op {
+  name: "SnapshotDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "path"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "compression"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "reader_path_prefix"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "writer_path_prefix"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shard_size_bytes"
+    type: "int"
+    default_value {
+      i: 10737418240
+    }
+  }
+  attr {
+    name: "pending_snapshot_expiry_seconds"
+    type: "int"
+    default_value {
+      i: 86400
+    }
+  }
+  attr {
+    name: "num_reader_threads"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "reader_buffer_size"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "num_writer_threads"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "writer_buffer_size"
+    type: "int"
+    default_value {
+      i: 1
+    }
+  }
+  attr {
+    name: "shuffle_on_read"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "seed"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "seed2"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "mode"
+    type: "string"
+    default_value {
+      s: "auto"
+    }
+  }
+  attr {
+    name: "snapshot_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetReader.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetReader.pbtxt
index 053f759b47c9a4..d59d8edf2fc492 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetReader.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetReader.pbtxt
@@ -81,3 +81,61 @@ op {
     type: "int"
   }
 }
+op {
+  name: "SnapshotDatasetReader"
+  input_arg {
+    name: "shard_dir"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "start_index"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "compression"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "version"
+    type: "int"
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt
index 90ad3322a3094b..c9e244ed0e9099 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt
@@ -417,3 +417,118 @@ op {
     }
   }
 }
+op {
+  name: "SnapshotDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "path"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "reader_func_other_args"
+    type_list_attr: "Treader_func_args"
+  }
+  input_arg {
+    name: "shard_func_other_args"
+    type_list_attr: "Tshard_func_args"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "compression"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "reader_prefix"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "writer_prefix"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "hash_valid"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "hash"
+    type: "int"
+    default_value {
+      i: 0
+    }
+  }
+  attr {
+    name: "reader_func"
+    type: "func"
+  }
+  attr {
+    name: "shard_func"
+    type: "func"
+  }
+  attr {
+    name: "Treader_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tshard_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/SnapshotNestedDatasetReader.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SnapshotNestedDatasetReader.pbtxt
index 1e312eb9ed258f..078460bcb23930 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SnapshotNestedDatasetReader.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SnapshotNestedDatasetReader.pbtxt
@@ -65,3 +65,53 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "SnapshotNestedDatasetReader"
+  input_arg {
+    name: "inputs"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseTensorSliceDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseTensorSliceDataset.pbtxt
index 505d79c4ad52ad..af26fd8c180a3f 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SparseTensorSliceDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SparseTensorSliceDataset.pbtxt
@@ -53,3 +53,37 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "SparseTensorSliceDataset"
+  input_arg {
+    name: "indices"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "values"
+    type_attr: "Tvalues"
+  }
+  input_arg {
+    name: "dense_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "Tvalues"
+        }
+      }
+    }
+  }
+  attr {
+    name: "Tvalues"
+    type: "type"
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/SqlDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SqlDataset.pbtxt
index 1f30742e7cbdaa..68af0ac17eff32 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SqlDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SqlDataset.pbtxt
@@ -69,3 +69,55 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "SqlDataset"
+  input_arg {
+    name: "driver_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data_source_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "query"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TakeDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TakeDataset.pbtxt
index 2450c265f81df8..8ced9c67054cc1 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TakeDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TakeDataset.pbtxt
@@ -128,3 +128,57 @@ op {
     }
   }
 }
+op {
+  name: "TakeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "count"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TakeWhileDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TakeWhileDataset.pbtxt
index 22a35167413e7b..bfde2664966ef2 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TakeWhileDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TakeWhileDataset.pbtxt
@@ -127,3 +127,66 @@ op {
     }
   }
 }
+op {
+  name: "TakeWhileDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "predicate"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorDataset.pbtxt
index 2b67de48b18847..9e71deef2c597c 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorDataset.pbtxt
@@ -91,3 +91,54 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorDataset"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "Toutput_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "Toutput_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorListConcatLists.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorListConcatLists.pbtxt
index 06a19bd7e19394..57dd05a90feb7d 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorListConcatLists.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorListConcatLists.pbtxt
@@ -43,3 +43,32 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListConcatLists"
+  input_arg {
+    name: "input_a"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "input_b"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_ARRAY
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
+      }
+    }
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorListFromTensor.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorListFromTensor.pbtxt
index 4c1286f3d687a2..c2be2938c8607e 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorListFromTensor.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorListFromTensor.pbtxt
@@ -63,3 +63,42 @@ op {
     }
   }
 }
+op {
+  name: "TensorListFromTensor"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_ARRAY
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
+      }
+    }
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorListPushBack.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorListPushBack.pbtxt
index 5a06e722d0e95b..8175cfe350dab0 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorListPushBack.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorListPushBack.pbtxt
@@ -43,3 +43,32 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListPushBack"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_ARRAY
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
+      }
+    }
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorListPushBackBatch.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorListPushBackBatch.pbtxt
index 4d37daf287b113..29b878e527a4d5 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorListPushBackBatch.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorListPushBackBatch.pbtxt
@@ -43,3 +43,32 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListPushBackBatch"
+  input_arg {
+    name: "input_handles"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handles"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_ARRAY
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
+      }
+    }
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorListReserve.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorListReserve.pbtxt
index 2aa3d2001daea5..98ade8cb4a9fd0 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorListReserve.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorListReserve.pbtxt
@@ -63,3 +63,42 @@ op {
     }
   }
 }
+op {
+  name: "TensorListReserve"
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_ARRAY
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
+      }
+    }
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorListScatter.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorListScatter.pbtxt
index 4b167e49cf2da1..daa2f4130ab06b 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorListScatter.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorListScatter.pbtxt
@@ -71,3 +71,46 @@ op {
     }
   }
 }
+op {
+  name: "TensorListScatter"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_ARRAY
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
+      }
+    }
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorListScatterIntoExistingList.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorListScatterIntoExistingList.pbtxt
index 144a44717b4667..4427bab8a358c2 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorListScatterIntoExistingList.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorListScatterIntoExistingList.pbtxt
@@ -51,3 +51,36 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListScatterIntoExistingList"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_ARRAY
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
+      }
+    }
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorListScatterV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorListScatterV2.pbtxt
index 3394ddc49bcaca..de588984614839 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorListScatterV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorListScatterV2.pbtxt
@@ -79,3 +79,50 @@ op {
     }
   }
 }
+op {
+  name: "TensorListScatterV2"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "indices"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "num_elements"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_ARRAY
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
+      }
+    }
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorListSetItem.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorListSetItem.pbtxt
index f664cefcdbcb56..9550fa378f6aac 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorListSetItem.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorListSetItem.pbtxt
@@ -51,3 +51,36 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListSetItem"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "index"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "item"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_ARRAY
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
+      }
+    }
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorListSplit.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorListSplit.pbtxt
index bce7e6c20cc8e6..ff83247addf89b 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorListSplit.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorListSplit.pbtxt
@@ -71,3 +71,46 @@ op {
     }
   }
 }
+op {
+  name: "TensorListSplit"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_ARRAY
+      args {
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
+      }
+    }
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/TensorSliceDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/TensorSliceDataset.pbtxt
index 1f53c02ada1928..905ed98cee3293 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/TensorSliceDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/TensorSliceDataset.pbtxt
@@ -136,3 +136,61 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorSliceDataset"
+  input_arg {
+    name: "components"
+    type_list_attr: "Toutput_types"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "Toutput_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "Toutput_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "is_files"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ThreadPoolDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ThreadPoolDataset.pbtxt
index 746a6fd0b3f792..8e185af579f2be 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ThreadPoolDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ThreadPoolDataset.pbtxt
@@ -61,3 +61,51 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ThreadPoolDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "thread_pool"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/UnbatchDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/UnbatchDataset.pbtxt
index 0513c0c53a9dea..84479c117206d7 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/UnbatchDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/UnbatchDataset.pbtxt
@@ -88,3 +88,53 @@ op {
     }
   }
 }
+op {
+  name: "UnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/UniqueDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/UniqueDataset.pbtxt
index 891a9131922ed2..281ba7c1bd0619 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/UniqueDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/UniqueDataset.pbtxt
@@ -88,3 +88,53 @@ op {
     }
   }
 }
+op {
+  name: "UniqueDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/WindowDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/WindowDataset.pbtxt
index 143f0e9b0ac027..43784faad6dc62 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/WindowDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/WindowDataset.pbtxt
@@ -136,3 +136,69 @@ op {
     }
   }
 }
+op {
+  name: "WindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "shift"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "stride"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/WindowOp.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/WindowOp.pbtxt
index 8aef888cdcaf39..336a13805eaf6a 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/WindowOp.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/WindowOp.pbtxt
@@ -63,3 +63,52 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WindowOp"
+  input_arg {
+    name: "inputs"
+    type_list_attr: "Tinputs"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tinputs"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
diff --git a/tensorflow/core/ops/compat/ops_history_v2/ZipDataset.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/ZipDataset.pbtxt
index 38ef7b790cea6e..5dd34535a666d3 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/ZipDataset.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/ZipDataset.pbtxt
@@ -140,3 +140,60 @@ op {
     }
   }
 }
+op {
+  name: "ZipDataset"
+  input_arg {
+    name: "input_datasets"
+    type: DT_VARIANT
+    number_attr: "N"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+    experimental_full_type {
+      type_id: TFT_DATASET
+      args {
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
+      }
+    }
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "metadata"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index ad3d3c2ce1ede8..a66c8d357c7327 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -43,7 +43,8 @@ REGISTER_OP("TensorDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "Toutput_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "Toutput_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 // TODO(mrry): Validate that the dim-0 slices of `components` have shapes
@@ -56,7 +57,8 @@ REGISTER_OP("TensorSliceDataset")
     .Attr("is_files: bool = false")
     .Attr("metadata: string = ''")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "Toutput_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "Toutput_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("SparseTensorSliceDataset")
@@ -66,7 +68,7 @@ REGISTER_OP("SparseTensorSliceDataset")
     .Output("handle: variant")
     .Attr("Tvalues: type")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "Tvalues"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_DATASET, "Tvalues"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("GeneratorDataset")
@@ -84,7 +86,8 @@ REGISTER_OP("GeneratorDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ZipDataset")
@@ -94,7 +97,8 @@ REGISTER_OP("ZipDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("N: int >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ConcatenateDataset")
@@ -104,7 +108,8 @@ REGISTER_OP("ConcatenateDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("RepeatDataset")
@@ -114,7 +119,8 @@ REGISTER_OP("RepeatDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle count_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
@@ -128,7 +134,8 @@ REGISTER_OP("TakeDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle count_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
@@ -142,7 +149,8 @@ REGISTER_OP("SkipDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle count_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &count_shape));
@@ -160,7 +168,8 @@ REGISTER_OP("MapDataset")
     .Attr("use_inter_op_parallelism: bool = true")
     .Attr("preserve_cardinality: bool = false")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParallelMapDataset")
@@ -176,7 +185,8 @@ REGISTER_OP("ParallelMapDataset")
     .Attr("sloppy: bool = false")
     .Attr("preserve_cardinality: bool = false")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParallelMapDatasetV2")
@@ -193,7 +203,8 @@ REGISTER_OP("ParallelMapDatasetV2")
     .Attr("deterministic: string = 'default'")
     .Attr("preserve_cardinality: bool = false")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("PrefetchDataset")
@@ -206,7 +217,8 @@ REGISTER_OP("PrefetchDataset")
     .Attr("legacy_autotune: bool = true")
     .Attr("buffer_size_min: int = 0")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // buffer_size should be a scalar.
@@ -223,7 +235,8 @@ REGISTER_OP("FlatMapDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("InterleaveDataset")
@@ -237,7 +250,8 @@ REGISTER_OP("InterleaveDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParallelInterleaveDatasetV2")
@@ -253,7 +267,8 @@ REGISTER_OP("ParallelInterleaveDatasetV2")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("sloppy: bool = false")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParallelInterleaveDatasetV3")
@@ -270,7 +285,8 @@ REGISTER_OP("ParallelInterleaveDatasetV3")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 // Like V3, but adds buffer_output_elements and prefetch_input_elements.
@@ -290,7 +306,8 @@ REGISTER_OP("ParallelInterleaveDatasetV4")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("FilterDataset")
@@ -302,7 +319,8 @@ REGISTER_OP("FilterDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 // This op is no longer supported.
@@ -311,7 +329,8 @@ REGISTER_OP("FilterByLastComponentDataset")
     .Output("output: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("WindowDataset")
@@ -324,7 +343,8 @@ REGISTER_OP("WindowDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // size, shift, stride, and drop_remainder should be scalars.
@@ -341,7 +361,8 @@ REGISTER_OP("WindowOp")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("Tinputs: list(type) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("BatchDataset")
@@ -351,7 +372,8 @@ REGISTER_OP("BatchDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // batch_size should be a scalar.
@@ -368,7 +390,8 @@ REGISTER_OP("BatchDatasetV2")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // batch_size should be a scalar.
@@ -390,7 +413,8 @@ REGISTER_OP("ParallelBatchDataset")
     // "true", "false", or "default".
     .Attr("deterministic: string = 'default'")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // batch_size should be a scalar.
@@ -411,7 +435,8 @@ REGISTER_OP("ShardDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // num_shards should be a scalar.
@@ -435,7 +460,8 @@ REGISTER_OP("PaddedBatchDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("N: int >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "Toutput_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "Toutput_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // batch_size should be a scalar.
@@ -455,7 +481,8 @@ REGISTER_OP("PaddedBatchDatasetV2")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("N: int >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "Toutput_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "Toutput_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // batch_size should be a scalar.
@@ -475,7 +502,8 @@ REGISTER_OP("RangeDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // start, stop, and step should be scalars.
@@ -542,7 +570,8 @@ REGISTER_OP("ShuffleDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // buffer_size, seed, and seed2 should be scalars.
@@ -560,7 +589,8 @@ REGISTER_OP("ShuffleDatasetV2")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // buffer_size and seed_generator should be scalars.
@@ -580,7 +610,8 @@ REGISTER_OP("ShuffleDatasetV3")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // buffer_size, seed, seed2, and seed_generator should be scalars.
@@ -602,7 +633,8 @@ REGISTER_OP("ShuffleAndRepeatDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("reshuffle_each_iteration: bool = true")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // buffer_size, seed, seed2, and count should be scalars.
@@ -625,7 +657,8 @@ REGISTER_OP("ShuffleAndRepeatDatasetV2")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // buffer_size, seed, seed2, count, and seed_generator should be scalars.
@@ -666,7 +699,8 @@ REGISTER_OP("CacheDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
     // TODO(mdan): Should these use type inference instead?
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // filename should be a scalar.
@@ -682,7 +716,8 @@ REGISTER_OP("CacheDatasetV2")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // filename should be a scalar.
@@ -951,7 +986,8 @@ REGISTER_OP("OptimizeDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("optimization_configs: list(string) = []")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("OptimizeDatasetV2")
@@ -963,7 +999,8 @@ REGISTER_OP("OptimizeDatasetV2")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("optimization_configs: list(string) = []")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("OptionalFromValue")
@@ -1029,7 +1066,8 @@ REGISTER_OP("ModelDataset")
     .Attr("ram_budget: int = 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 // TODO(b/124308749): Add a stateful version of MapDefun and use it when `f`
@@ -1152,7 +1190,8 @@ REGISTER_OP("OptionsDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("GetOptions")
@@ -1166,7 +1205,8 @@ REGISTER_OP("FinalizeDataset")
     .Attr("has_captured_ref: bool = false")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc
index 14527ec8d3293e..6aca27f18d6189 100644
--- a/tensorflow/core/ops/experimental_dataset_ops.cc
+++ b/tensorflow/core/ops/experimental_dataset_ops.cc
@@ -23,7 +23,8 @@ REGISTER_OP("AssertCardinalityDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // cardinality should be a scalar.
@@ -37,7 +38,8 @@ REGISTER_OP("AssertNextDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // transformations should be a vector.
@@ -51,7 +53,8 @@ REGISTER_OP("ExperimentalAssertNextDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // transformations should be a vector.
@@ -68,7 +71,8 @@ REGISTER_OP("AutoShardDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("num_replicas: int = 0")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalAutoShardDataset")
@@ -79,7 +83,8 @@ REGISTER_OP("ExperimentalAutoShardDataset")
     .Attr("auto_shard_policy: int = 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("BytesProducedStatsDataset")
@@ -88,7 +93,8 @@ REGISTER_OP("BytesProducedStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle tag_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
@@ -101,7 +107,8 @@ REGISTER_OP("ExperimentalBytesProducedStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle tag_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
@@ -120,7 +127,8 @@ REGISTER_OP("ChooseFastestBranchDataset")
     .Attr("other_arguments_lengths: list(int) >= 1")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ChooseFastestDataset")
@@ -130,7 +138,8 @@ REGISTER_OP("ChooseFastestDataset")
     .Attr("num_experiments: int")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalChooseFastestDataset")
@@ -140,7 +149,8 @@ REGISTER_OP("ExperimentalChooseFastestDataset")
     .Attr("num_experiments: int")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("CompressElement")
@@ -175,7 +185,8 @@ REGISTER_OP("CSVDataset")
     .Attr("output_types: list({float,double,int32,int64,string}) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // `filenames` must be a scalar or a vector.
@@ -218,7 +229,8 @@ REGISTER_OP("CSVDatasetV2")
     .Attr("output_types: list({float,double,int32,int64,string}) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // `filenames` must be a scalar or a vector.
@@ -263,7 +275,8 @@ REGISTER_OP("ExperimentalCSVDataset")
     .Attr("output_types: list({float,double,int32,int64,string}) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // `filenames` must be a scalar or a vector.
@@ -327,7 +340,8 @@ REGISTER_OP("DenseToSparseBatchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // batch_size should be a scalar.
@@ -344,7 +358,8 @@ REGISTER_OP("ExperimentalDenseToSparseBatchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // batch_size should be a scalar.
@@ -362,7 +377,8 @@ REGISTER_OP("DirectedInterleaveDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("N: int >= 1")
     .Attr("stop_on_empty_dataset: bool = false")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalDirectedInterleaveDataset")
@@ -372,7 +388,8 @@ REGISTER_OP("ExperimentalDirectedInterleaveDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("N: int >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("GroupByReducerDataset")
@@ -393,7 +410,8 @@ REGISTER_OP("GroupByReducerDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalGroupByReducerDataset")
@@ -414,7 +432,8 @@ REGISTER_OP("ExperimentalGroupByReducerDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetIsStateful()
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("GroupByWindowDataset")
@@ -433,7 +452,8 @@ REGISTER_OP("GroupByWindowDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("GetElementAtIndex")
@@ -442,7 +462,8 @@ REGISTER_OP("GetElementAtIndex")
     .Output("components: output_types")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::DatasetIteratorShape);
 
 REGISTER_OP("ExperimentalGroupByWindowDataset")
@@ -460,7 +481,8 @@ REGISTER_OP("ExperimentalGroupByWindowDataset")
     .Attr("Twindow_size_func_other_arguments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("IgnoreErrorsDataset")
@@ -469,7 +491,8 @@ REGISTER_OP("IgnoreErrorsDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("log_warning: bool = false")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalIgnoreErrorsDataset")
@@ -478,7 +501,8 @@ REGISTER_OP("ExperimentalIgnoreErrorsDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("log_warning: bool = false")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("IteratorGetDevice")
@@ -497,7 +521,8 @@ REGISTER_OP("LatencyStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle tag_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
@@ -510,7 +535,8 @@ REGISTER_OP("ExperimentalLatencyStatsDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle tag_shape;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
@@ -523,7 +549,8 @@ REGISTER_OP("LMDBDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalLMDBDataset")
@@ -532,7 +559,8 @@ REGISTER_OP("ExperimentalLMDBDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("MapAndBatchDataset")
@@ -548,7 +576,8 @@ REGISTER_OP("MapAndBatchDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("preserve_cardinality: bool = false")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       // Use index from the end to retrieve the Input shapes,
       // so that to avoid guessing the length of "other_arguments".
@@ -576,7 +605,8 @@ REGISTER_OP("ExperimentalMapAndBatchDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("preserve_cardinality: bool = false")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       // Use index from the end to retrieve the Input shapes,
       // so that to avoid guessing the length of "other_arguments".
@@ -602,7 +632,8 @@ REGISTER_OP("ExperimentalMapDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_inter_op_parallelism: bool = true")
     .Attr("preserve_cardinality: bool = false")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("MatchingFilesDataset")
@@ -637,7 +668,8 @@ REGISTER_OP("MaxIntraOpParallelismDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalMaxIntraOpParallelismDataset")
@@ -646,7 +678,8 @@ REGISTER_OP("ExperimentalMaxIntraOpParallelismDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("NonSerializableDataset")
@@ -654,7 +687,8 @@ REGISTER_OP("NonSerializableDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalNonSerializableDataset")
@@ -662,7 +696,8 @@ REGISTER_OP("ExperimentalNonSerializableDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParallelInterleaveDataset")
@@ -679,7 +714,8 @@ REGISTER_OP("ParallelInterleaveDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 // This is the V2 of ParallelInterleaveDataset, renamed to differentiate it
@@ -699,7 +735,8 @@ REGISTER_OP("LegacyParallelInterleaveDatasetV2")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 // This op is no longer used. We keep it so that we can read graphs written by
@@ -717,7 +754,8 @@ REGISTER_OP("ExperimentalParallelInterleaveDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParseExampleDataset")
@@ -738,7 +776,8 @@ REGISTER_OP("ParseExampleDataset")
     .Attr("ragged_keys: list(string) >= 0 = []")
     .Attr("ragged_value_types: list({float,int64,string}) >= 0 = []")
     .Attr("ragged_split_types: list({int32,int64}) >= 0 = []")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParseExampleDatasetV2")
@@ -760,7 +799,8 @@ REGISTER_OP("ParseExampleDatasetV2")
     .Attr("ragged_keys: list(string) >= 0 = []")
     .Attr("ragged_value_types: list({float,int64,string}) >= 0 = []")
     .Attr("ragged_split_types: list({int32,int64}) >= 0 = []")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalParseExampleDataset")
@@ -778,7 +818,8 @@ REGISTER_OP("ExperimentalParseExampleDataset")
                                               // sorted by key (dense_keys and
                                               // sparse_keys combined) here.
     .Attr("sloppy: bool = false")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("PrivateThreadPoolDataset")
@@ -787,7 +828,8 @@ REGISTER_OP("PrivateThreadPoolDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalPrivateThreadPoolDataset")
@@ -796,7 +838,8 @@ REGISTER_OP("ExperimentalPrivateThreadPoolDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalRandomDataset")
@@ -806,7 +849,8 @@ REGISTER_OP("ExperimentalRandomDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // buffer_size, seed, and seed2 should be scalars.
@@ -823,7 +867,8 @@ REGISTER_OP("RandomDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // buffer_size, seed, and seed2 should be scalars.
@@ -839,7 +884,8 @@ REGISTER_OP("ExperimentalRebatchDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_fallback: bool = true")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("RebatchDataset")
@@ -849,7 +895,8 @@ REGISTER_OP("RebatchDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_fallback: bool = true")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("RebatchDatasetV2")
@@ -859,7 +906,8 @@ REGISTER_OP("RebatchDatasetV2")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("SamplingDataset")
@@ -870,7 +918,8 @@ REGISTER_OP("SamplingDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // rate, seed, and seed2 should be scalars.
@@ -893,7 +942,8 @@ REGISTER_OP("ScanDataset")
     .Attr("preserve_cardinality: bool = false")
     .Attr("use_default_device: bool = true")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalScanDataset")
@@ -907,7 +957,8 @@ REGISTER_OP("ExperimentalScanDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("preserve_cardinality: bool = false")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("SetStatsAggregatorDataset")
@@ -918,7 +969,8 @@ REGISTER_OP("SetStatsAggregatorDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalSetStatsAggregatorDataset")
@@ -929,7 +981,8 @@ REGISTER_OP("ExperimentalSetStatsAggregatorDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("SleepDataset")
@@ -938,7 +991,8 @@ REGISTER_OP("SleepDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // Both inputs are scalar.
@@ -953,7 +1007,8 @@ REGISTER_OP("ExperimentalSleepDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // Both inputs are scalar.
@@ -971,7 +1026,8 @@ REGISTER_OP("SlidingWindowDataset")
     .Attr("drop_remainder: bool = true")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // window_size, window_shift, and window_stride should be scalars.
@@ -989,7 +1045,8 @@ REGISTER_OP("ExperimentalSlidingWindowDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // window_size, window_shift, and window_stride should be scalars.
@@ -1019,7 +1076,8 @@ REGISTER_OP("SnapshotDataset")
     .Attr("seed2: int = 0")
     .Attr("mode: string = 'auto'")
     .Attr("snapshot_name: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // snapshot_path should be a scalar.
@@ -1045,7 +1103,8 @@ REGISTER_OP("SnapshotDatasetV2")
     .Attr("Treader_func_args: list(type) >= 0")
     .Attr("Tshard_func_args: list(type) >= 0")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // `path` should be a scalar.
@@ -1098,7 +1157,8 @@ REGISTER_OP("LoadDataset")
     .Attr("reader_func: func")
     .Attr("Treader_func_args: list(type) >= 0")
     .SetIsStateful()
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // `path` should be a scalar.
@@ -1114,7 +1174,8 @@ REGISTER_OP("SnapshotDatasetReader")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("compression: string = ''")
     .Attr("version: int")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // `shard_dir` should be a scalar.
@@ -1130,7 +1191,8 @@ REGISTER_OP("SnapshotNestedDatasetReader")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("N: int >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("SqlDataset")
@@ -1141,7 +1203,8 @@ REGISTER_OP("SqlDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // driver_name, data_source_name, and query should be scalars.
@@ -1159,7 +1222,8 @@ REGISTER_OP("ExperimentalSqlDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .SetDoNotOptimize()  // TODO(b/123753214): See comment in dataset_ops.cc.
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle unused;
       // driver_name, data_source_name, and query should be scalars.
@@ -1211,7 +1275,8 @@ REGISTER_OP("TakeWhileDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalTakeWhileDataset")
@@ -1222,7 +1287,8 @@ REGISTER_OP("ExperimentalTakeWhileDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ThreadPoolDataset")
@@ -1231,7 +1297,8 @@ REGISTER_OP("ThreadPoolDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalThreadPoolDataset")
@@ -1240,7 +1307,8 @@ REGISTER_OP("ExperimentalThreadPoolDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ThreadPoolHandle")
@@ -1267,7 +1335,8 @@ REGISTER_OP("UnbatchDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalUnbatchDataset")
@@ -1275,7 +1344,8 @@ REGISTER_OP("ExperimentalUnbatchDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("UniqueDataset")
@@ -1284,7 +1354,8 @@ REGISTER_OP("UniqueDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("metadata: string = ''")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalUniqueDataset")
@@ -1292,7 +1363,8 @@ REGISTER_OP("ExperimentalUniqueDataset")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("DummyIterationCounter")
@@ -1317,7 +1389,8 @@ REGISTER_OP("DataServiceDataset")
     .Attr("data_transfer_protocol: string = ''")
     .Attr("target_workers: string = 'AUTO'")
     .SetIsStateful()
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 // Adds `consumer_index` and `num_consumers` arguments to support round-robin
@@ -1339,7 +1412,8 @@ REGISTER_OP("DataServiceDatasetV2")
     .Attr("data_transfer_protocol: string = ''")
     .Attr("target_workers: string = 'AUTO'")
     .SetIsStateful()
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 // Adds `uncompress` and `uncompress_fn` attributes to support uncompression.
@@ -1362,7 +1436,8 @@ REGISTER_OP("DataServiceDatasetV3")
     .Attr("uncompress: bool = false")
     .Attr("uncompress_fn: func")
     .SetIsStateful()
-    .SetTypeConstructor(full_type::Unary(TFT_DATASET, "output_types"))
+    .SetTypeConstructor(full_type::VariadicTensorContainer(TFT_DATASET,
+                                                           "output_types"))
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("RegisterDataset")
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index 78877b9fc74411..b567afc43536f8 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -506,7 +506,7 @@ REGISTER_OP("MatrixTriangularSolve")
     .Output("output: T")
     .Attr("lower: bool = True")
     .Attr("adjoint: bool = False")
-    .Attr("T: {double, float, half, complex64, complex128}")
+    .Attr("T: {bfloat16, double, float, half, complex64, complex128}")
     .SetShapeFn([](InferenceContext* c) {
       return MatrixTriangularSolveShapeFn(c);
     });
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index f6a3587612cac9..d9ac536a4de23e 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -61,7 +61,8 @@ REGISTER_OP("EmptyTensorList")
     .Output("handle: variant")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
-    .SetTypeConstructor(full_type::Unary(TFT_ARRAY, "element_dtype"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_ARRAY,
+                                                        "element_dtype"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
       DataType element_dtype;
@@ -81,7 +82,8 @@ REGISTER_OP("TensorListPushBack")
     .Input("tensor: element_dtype")
     .Output("output_handle: variant")
     .Attr("element_dtype: type")
-    .SetTypeConstructor(full_type::Unary(TFT_ARRAY, "element_dtype"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_ARRAY,
+                                                        "element_dtype"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
       DataType element_dtype;
@@ -121,7 +123,8 @@ REGISTER_OP("TensorListPushBackBatch")
     .Output("output_handles: variant")
     .Attr("element_dtype: type")
     // TODO(mdan): Also support for inferring from an input type as well.
-    .SetTypeConstructor(full_type::Unary(TFT_ARRAY, "element_dtype"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_ARRAY,
+                                                        "element_dtype"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle input_handles;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input_handles));
@@ -332,7 +335,8 @@ REGISTER_OP("TensorListSplit")
     .Output("output_handle: variant")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
-    .SetTypeConstructor(full_type::Unary(TFT_ARRAY, "element_dtype"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_ARRAY,
+                                                        "element_dtype"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
       DataType element_dtype;
@@ -368,7 +372,8 @@ REGISTER_OP("TensorListFromTensor")
     .Output("output_handle: variant")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
-    .SetTypeConstructor(full_type::Unary(TFT_ARRAY, "element_dtype"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_ARRAY,
+                                                        "element_dtype"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
       DataType element_dtype;
@@ -416,7 +421,8 @@ REGISTER_OP("TensorListReserve")
     .Output("handle: variant")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
-    .SetTypeConstructor(full_type::Unary(TFT_ARRAY, "element_dtype"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_ARRAY,
+                                                        "element_dtype"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
       shape_inference::ShapeHandle element_shape;
@@ -485,7 +491,8 @@ REGISTER_OP("TensorListSetItem")
     .Input("item: element_dtype")
     .Output("output_handle: variant")
     .Attr("element_dtype: type")
-    .SetTypeConstructor(full_type::Unary(TFT_ARRAY, "element_dtype"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_ARRAY,
+                                                        "element_dtype"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       DataType element_dtype;
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
@@ -547,7 +554,8 @@ REGISTER_OP("TensorListScatter")
     .Output("output_handle: variant")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
-    .SetTypeConstructor(full_type::Unary(TFT_ARRAY, "element_dtype"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_ARRAY,
+                                                        "element_dtype"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       DataType element_dtype;
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
@@ -570,7 +578,8 @@ REGISTER_OP("TensorListScatterV2")
     .Output("output_handle: variant")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
-    .SetTypeConstructor(full_type::Unary(TFT_ARRAY, "element_dtype"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_ARRAY,
+                                                        "element_dtype"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       DataType element_dtype;
       TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
@@ -591,7 +600,8 @@ REGISTER_OP("TensorListScatterIntoExistingList")
     .Input("indices: int32")
     .Output("output_handle: variant")
     .Attr("element_dtype: type")
-    .SetTypeConstructor(full_type::Unary(TFT_ARRAY, "element_dtype"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_ARRAY,
+                                                        "element_dtype"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       shape_inference::ShapeHandle ignored;
       // Check that tensor is at least a vector.
@@ -621,7 +631,8 @@ REGISTER_OP("TensorListConcatLists")
     .Input("input_b: variant")
     .Attr("element_dtype: type")
     .Output("output: variant")
-    .SetTypeConstructor(full_type::Unary(TFT_ARRAY, "element_dtype"))
+    .SetTypeConstructor(full_type::UnaryTensorContainer(TFT_ARRAY,
+                                                        "element_dtype"))
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       auto input_a = c->input(0);
       auto input_b = c->input(1);
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index cf03d96ec31c9e..da68192e403da6 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -2543,8 +2543,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -2577,8 +2590,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -3006,8 +3032,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -3627,8 +3666,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -3672,8 +3724,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -6846,8 +6911,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -6997,8 +7075,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -7073,8 +7164,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -7342,8 +7446,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -7387,8 +7504,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -7632,8 +7762,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -7686,8 +7829,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -8817,8 +8973,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -11755,8 +11924,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -11839,8 +12021,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -11923,8 +12118,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -13199,8 +13407,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -14006,8 +14227,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -14554,8 +14788,11 @@ op {
     experimental_full_type {
       type_id: TFT_ARRAY
       args {
-        type_id: TFT_VAR
-        s: "element_dtype"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
       }
     }
   }
@@ -15477,8 +15714,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -15515,8 +15765,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -15556,8 +15819,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -15618,8 +15894,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -15659,8 +15948,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -15734,8 +16036,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -15769,8 +16084,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -15821,8 +16149,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -15900,8 +16241,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -15957,8 +16311,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16006,8 +16373,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16041,8 +16421,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16087,8 +16480,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16137,8 +16543,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16215,8 +16634,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16245,8 +16677,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16299,8 +16744,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16346,8 +16804,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16426,8 +16897,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16460,8 +16944,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16495,8 +16992,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16540,8 +17050,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16604,8 +17127,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16639,8 +17175,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16681,8 +17230,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16719,8 +17281,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16788,8 +17363,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16831,8 +17419,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16899,8 +17500,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -16929,8 +17543,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -17722,8 +18349,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -17756,8 +18396,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -17802,8 +18455,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -18162,8 +18828,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -19618,8 +20297,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -19687,8 +20379,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -19862,8 +20567,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -19941,8 +20659,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -20620,8 +21351,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -21289,8 +22033,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -21907,8 +22664,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -22274,8 +23044,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -22481,8 +23264,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -22757,8 +23553,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -23936,8 +24745,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -24031,8 +24853,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -25024,6 +25859,7 @@ op {
     type: "type"
     allowed_values {
       list {
+        type: DT_BFLOAT16
         type: DT_DOUBLE
         type: DT_FLOAT
         type: DT_HALF
@@ -25109,8 +25945,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -26466,8 +27315,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -27690,8 +28552,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -27913,8 +28788,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -27963,8 +28851,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -28066,8 +28967,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -28667,8 +29581,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "Toutput_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "Toutput_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "Toutput_types"
+        }
       }
     }
   }
@@ -28727,8 +29654,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "Toutput_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "Toutput_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "Toutput_types"
+        }
       }
     }
   }
@@ -28878,8 +29818,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -29006,8 +29959,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -29068,8 +30034,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -29137,8 +30116,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -29214,8 +30206,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -29275,8 +30280,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -29350,8 +30368,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -29571,8 +30602,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -29695,8 +30739,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -30858,8 +31915,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -31157,8 +32227,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -36529,8 +37612,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -37128,8 +38224,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -37523,8 +38632,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -37568,8 +38690,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -38272,8 +39407,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -44656,8 +45804,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -44998,8 +46159,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -46738,8 +47912,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -46841,8 +48028,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -46935,8 +48135,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -46999,8 +48212,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -47056,8 +48282,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -47108,8 +48347,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -47162,8 +48414,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -47376,8 +48641,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -47482,8 +48760,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -47557,8 +48848,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -47613,8 +48917,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -47745,8 +49062,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -47798,8 +49128,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -47889,8 +49232,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -51797,8 +53153,11 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "Tvalues"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "Tvalues"
+        }
       }
     }
   }
@@ -52060,8 +53419,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -55497,8 +56869,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -55576,8 +56961,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -56695,8 +58093,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "Toutput_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "Toutput_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "Toutput_types"
+        }
       }
     }
   }
@@ -56765,8 +58176,11 @@ op {
     experimental_full_type {
       type_id: TFT_ARRAY
       args {
-        type_id: TFT_VAR
-        s: "element_dtype"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
       }
     }
   }
@@ -56849,8 +58263,11 @@ op {
     experimental_full_type {
       type_id: TFT_ARRAY
       args {
-        type_id: TFT_VAR
-        s: "element_dtype"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
       }
     }
   }
@@ -56965,8 +58382,11 @@ op {
     experimental_full_type {
       type_id: TFT_ARRAY
       args {
-        type_id: TFT_VAR
-        s: "element_dtype"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
       }
     }
   }
@@ -56991,8 +58411,11 @@ op {
     experimental_full_type {
       type_id: TFT_ARRAY
       args {
-        type_id: TFT_VAR
-        s: "element_dtype"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
       }
     }
   }
@@ -57017,8 +58440,11 @@ op {
     experimental_full_type {
       type_id: TFT_ARRAY
       args {
-        type_id: TFT_VAR
-        s: "element_dtype"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
       }
     }
   }
@@ -57072,8 +58498,11 @@ op {
     experimental_full_type {
       type_id: TFT_ARRAY
       args {
-        type_id: TFT_VAR
-        s: "element_dtype"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
       }
     }
   }
@@ -57112,8 +58541,11 @@ op {
     experimental_full_type {
       type_id: TFT_ARRAY
       args {
-        type_id: TFT_VAR
-        s: "element_dtype"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
       }
     }
   }
@@ -57146,8 +58578,11 @@ op {
     experimental_full_type {
       type_id: TFT_ARRAY
       args {
-        type_id: TFT_VAR
-        s: "element_dtype"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
       }
     }
   }
@@ -57186,8 +58621,11 @@ op {
     experimental_full_type {
       type_id: TFT_ARRAY
       args {
-        type_id: TFT_VAR
-        s: "element_dtype"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
       }
     }
   }
@@ -57216,8 +58654,11 @@ op {
     experimental_full_type {
       type_id: TFT_ARRAY
       args {
-        type_id: TFT_VAR
-        s: "element_dtype"
+        type_id: TFT_TENSOR
+        args {
+          type_id: TFT_VAR
+          s: "element_dtype"
+        }
       }
     }
   }
@@ -57557,8 +58998,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "Toutput_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "Toutput_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "Toutput_types"
+        }
       }
     }
   }
@@ -57841,8 +59295,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -58441,8 +59908,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -58879,8 +60359,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -59782,8 +61275,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -59819,8 +61325,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
@@ -60412,8 +61931,21 @@ op {
     experimental_full_type {
       type_id: TFT_DATASET
       args {
-        type_id: TFT_VAR
-        s: "output_types"
+        type_id: TFT_FOR_EACH
+        args {
+          type_id: TFT_PRODUCT
+        }
+        args {
+          type_id: TFT_TENSOR
+          args {
+            type_id: TFT_VAR
+            s: "output_types"
+          }
+        }
+        args {
+          type_id: TFT_VAR
+          s: "output_types"
+        }
       }
     }
   }
diff --git a/tensorflow/core/platform/default/experimental_cc_shared_library.bzl b/tensorflow/core/platform/default/experimental_cc_shared_library.bzl
index 03294330a13328..3c66a64450334b 100644
--- a/tensorflow/core/platform/default/experimental_cc_shared_library.bzl
+++ b/tensorflow/core/platform/default/experimental_cc_shared_library.bzl
@@ -8,6 +8,11 @@ rely on this. It requires bazel >1.2  and passing the flag
 # TODO(rostam): Delete this module after the release of Bazel built-in cc_shared_library.
 
 load("@bazel_skylib//rules:common_settings.bzl", "BuildSettingInfo")
+load(
+    "@rules_cc//examples:experimental_cc_shared_library.bzl",
+    "CcSharedLibraryInfo",
+    "CcSharedLibraryPermissionsInfo",
+)
 
 def find_cc_toolchain(ctx):
     """Returns the current `CcToolchainInfo`.
@@ -42,12 +47,6 @@ def find_cc_toolchain(ctx):
 # used sparingly after making sure it's safe to use.
 LINKABLE_MORE_THAN_ONCE = "LINKABLE_MORE_THAN_ONCE"
 
-CcSharedLibraryPermissionsInfo = provider(
-    "Permissions for a cc shared library.",
-    fields = {
-        "targets": "Matches targets that can be exported.",
-    },
-)
 GraphNodeInfo = provider(
     "Nodes in the graph of shared libraries.",
     fields = {
@@ -56,21 +55,6 @@ GraphNodeInfo = provider(
         "linkable_more_than_once": "Linkable into more than a single cc_shared_library",
     },
 )
-CcSharedLibraryInfo = provider(
-    "Information about a cc shared library.",
-    fields = {
-        "dynamic_deps": "All shared libraries depended on transitively",
-        "exports": "cc_libraries that are linked statically and exported",
-        "link_once_static_libs": "All libraries linked statically into this library that should " +
-                                 "only be linked once, e.g. because they have static " +
-                                 "initializers. If we try to link them more than once, " +
-                                 "we will throw an error",
-        "linker_input": "the resulting linker input artifact for the shared library",
-        "preloaded_deps": "cc_libraries needed by this cc_shared_library that should" +
-                          " be linked the binary. If this is set, this cc_shared_library has to " +
-                          " be a direct dependency of the cc_binary",
-    },
-)
 
 def _separate_static_and_dynamic_link_libraries(
         direct_children,
diff --git a/tensorflow/core/platform/refcount_test.cc b/tensorflow/core/platform/refcount_test.cc
index 762645e838b4f8..48a821167c1739 100644
--- a/tensorflow/core/platform/refcount_test.cc
+++ b/tensorflow/core/platform/refcount_test.cc
@@ -157,8 +157,12 @@ TEST(WeakPtr, MultiThreadedWeakRef) {
 
     EXPECT_EQ(weakptr.GetNewRef(), nullptr);
   }
-  ASSERT_GT(hit_destructed, 0);
-  ASSERT_LT(hit_destructed, 200);  // 2 threads per iterations.
+  if (hit_destructed == 0) {
+    LOG(WARNING) << "The destructed weakref test branch is not exercised.";
+  }
+  if (hit_destructed == 200) {
+    LOG(WARNING) << "The valid weakref test branch is not exercised.";
+  }
 }
 
 TEST(WeakPtr, NotifyCalled) {
diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 30d09ce339b15e..42c4dc23d8106d 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
-#define TF_GRAPH_DEF_VERSION 965  // Updated: 2021/11/29
+#define TF_GRAPH_DEF_VERSION 972  // Updated: 2021/12/6
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
diff --git a/tensorflow/core/runtime_fallback/kernel/BUILD b/tensorflow/core/runtime_fallback/kernel/BUILD
index b1d0df54c872a7..dd4d9b515999c8 100644
--- a/tensorflow/core/runtime_fallback/kernel/BUILD
+++ b/tensorflow/core/runtime_fallback/kernel/BUILD
@@ -389,36 +389,20 @@ cc_library(
         "//tensorflow/core/tfrt/eager:__pkg__",
     ],
     deps = [
-        ":kernel_fallback_compat_request_state",
-        ":kernel_fallback_tensor",
-        ":kernel_fallback_tensor_conversion_alwayslink",
-        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:inlined_vector",
-        "@com_google_absl//absl/meta:type_traits",
-        "@com_google_absl//absl/types:span",
-        "@llvm-project//llvm:Support",
-        "//tensorflow/core/runtime_fallback/util:attr_util",
-        "//tensorflow/core/tfrt/utils:fallback_tensor",
-        "//tensorflow/core/tfrt/utils:statusor",
-        "@tf_runtime//:core_runtime",
-        "@tf_runtime//:hostcontext",
-        "@tf_runtime//:support",
-        "@tf_runtime//:tensor_alwayslink",
     ] + select({
         "//tensorflow:android": [
             "//tensorflow/core:portable_tensorflow_lib_lite",  # TODO(annarev): exclude runtime srcs
         ],
         "//conditions:default": [
-            "//tensorflow/core/common_runtime/eager:attr_builder",
-            "//tensorflow/core/common_runtime/eager:core",
+            "//tensorflow/core/common_runtime:device_mgr",
+            "//tensorflow/core/common_runtime:function",
             "//tensorflow/core:framework",
-            "//tensorflow/core:framework_lite",
             "//tensorflow/core/framework:node_def_proto_cc",
             "//tensorflow/core/framework:op_def_proto_cc",
             "//tensorflow/core/framework:tensor",
             "//tensorflow/core/platform:errors",
             "//tensorflow/core/platform:status",
-            "//tensorflow/core:core_cpu_base",
         ],
     }),
 )
diff --git a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h
index bea80efd59037d..87069e5b6ab020 100644
--- a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h
+++ b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h
@@ -33,7 +33,6 @@ limitations under the License.
 namespace tensorflow {
 namespace tfd {
 
-class OpKernelRunnerCache;
 class OpKernelRunnerTable;
 
 // FallbackResourceArray holds the tensors that are computed only once during
diff --git a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc
index 1dc5e356ecec64..7f6de55c3ef006 100644
--- a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc
+++ b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_execute_compat.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h"
+#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_tensor.h"
 #include "tensorflow/core/runtime_fallback/kernel/op_kernel_runner.h"
 #include "tensorflow/core/runtime_fallback/runtime/kernel_utils.h"
 #include "tensorflow/core/runtime_fallback/runtime/op_logger.h"
@@ -213,62 +214,27 @@ static Status ValidateInputTypes(
 
 namespace {
 
-// OpKernelRunState keeps the states needed for per-kernel execution.
-struct OpKernelRunState {
-  gtl::InlinedVector<tensorflow::Tensor, 4> input_tf_tensors;
-  gtl::InlinedVector<tensorflow::TensorValue, 4> input_tf_tensor_values;
-  OpKernelContext::Params params;
-
-  OpKernelRunState() = default;
-  OpKernelRunState(
-      const gtl::InlinedVector<tensorflow::TensorValue, 4>& tensor_values,
-      const OpKernelContext::Params& p) {
-    // `input_tf_tensor_values` contains the reference to all tensor used,
-    // while `input_tf_tensors` only contains those needs ownership so their
-    // sizes may not match. For this copy assignment, we conservatively copy all
-    // tensors.
-    input_tf_tensors.reserve(tensor_values.size());
-    for (const auto& tensor_value : tensor_values) {
-      input_tf_tensors.push_back(*tensor_value.tensor);
-    }
-    for (auto& tensor : input_tf_tensors) {
-      input_tf_tensor_values.emplace_back(&tensor);
-    }
-
-    // Since `input_tf_tensor_values` and `params` contains pointers to
-    // `input_tf_tensors`, we need to change those pointers to the correct ones
-    // after copying.
-    params = p;
-    params.inputs = &input_tf_tensor_values;
-  }
-
-  OpKernelRunState(const OpKernelRunState& other) = delete;
-  OpKernelRunState& operator=(const OpKernelRunState& other) = delete;
-
-  ~OpKernelRunState() = default;
-
-  void SetUpParams(
-      const OpKernelRunner& runner,
-      const KernelFallbackCompatRequestState& fallback_request_state,
-      tensorflow::Device* device) {
-    params.inputs = &input_tf_tensor_values;
-    params.device = device;
-    params.op_kernel = runner.op_kernel();
-    // Still use original device's resource_manager.
-    params.resource_manager = runner.resource_manager();
-    params.input_alloc_attrs = &runner.input_alloc_attrs();
-    params.output_attr_array = runner.output_alloc_attrs().data();
-    params.step_container = fallback_request_state.step_container();
-    // Following two parameters are used to support executing tf.data via
-    // fallback.
-    params.function_library = runner.function_library_runtime();
-    params.runner = fallback_request_state.runner();
-    params.collective_executor = fallback_request_state.collective_executor();
-    params.rendezvous = fallback_request_state.rendezvous();
-    params.session_metadata = &fallback_request_state.session_metadata();
-    params.cancellation_manager = fallback_request_state.cancellation_manager();
-  }
-};
+void SetUpParams(const OpKernelRunner& runner,
+                 const KernelFallbackCompatRequestState& fallback_request_state,
+                 tensorflow::Device* device, OpKernelRunState& run_state) {
+  auto& params = run_state.params;
+  params.inputs = &run_state.input_tf_tensor_values;
+  params.device = device;
+  params.op_kernel = runner.op_kernel();
+  // Still use original device's resource_manager.
+  params.resource_manager = runner.resource_manager();
+  params.input_alloc_attrs = &runner.input_alloc_attrs();
+  params.output_attr_array = runner.output_alloc_attrs().data();
+  params.step_container = fallback_request_state.step_container();
+  // Following two parameters are used to support executing tf.data via
+  // fallback.
+  params.function_library = runner.function_library_runtime();
+  params.runner = fallback_request_state.runner();
+  params.collective_executor = fallback_request_state.collective_executor();
+  params.rendezvous = fallback_request_state.rendezvous();
+  params.session_metadata = &fallback_request_state.session_metadata();
+  params.cancellation_manager = fallback_request_state.cancellation_manager();
+}
 
 // Keep states needed by kernel execution in a thread local storage to avoid
 // repeated reallocation and destruction of them.
@@ -432,7 +398,7 @@ tfrt::AsyncValueRef<tfrt::Chain> KernelFallbackExecuteCompatCoreRuntimeDispatch(
   auto* device =
       GetDeviceFromFallbackState(fallback_request_state, op_kernel_runner);
 
-  run_state.SetUpParams(op_kernel_runner, fallback_request_state, device);
+  SetUpParams(op_kernel_runner, fallback_request_state, device, run_state);
 
   if (op_kernel_runner.IsAsync()) {
     KernelFallbackExecuteCompatAsyncInternal<KernelFallbackTensor>(
@@ -582,7 +548,7 @@ TF_ATTRIBUTE_ALWAYS_INLINE static void KernelFallbackExecuteOpInternal(
     input_tf_tensor_values[i].tensor = &fallback_tensor.tensor();
   }
 
-  run_state.SetUpParams(kernel_runner, fallback_request_state, device);
+  SetUpParams(kernel_runner, fallback_request_state, device, run_state);
 
   if (is_async) {
     KernelFallbackExecuteCompatAsyncInternal<
@@ -646,21 +612,17 @@ llvm::Expected<tfrt::Chain> KernelFallbackCreateOp(
   auto* runner_table = fallback_request_state->runner_table();
   DCHECK(runner_table);
 
-  auto attr_builder =
-      [op_attr_array, op_func_attr_array](
-          tensorflow::AttrValueMap* attr_value_map) -> llvm::Error {
-    auto status =
-        SetUpAttrValueMap(op_attr_array, op_func_attr_array, attr_value_map);
-
-    if (!status.ok()) return tfrt::MakeStringError(status.error_message());
-    return llvm::Error::success();
+  auto attr_builder = [op_attr_array, op_func_attr_array](
+                          tensorflow::AttrValueMap* attr_value_map) {
+    return SetUpAttrValueMap(op_attr_array, op_func_attr_array, attr_value_map);
   };
 
   auto op_name = StripTfPrefix(op_name_attr.GetValue());
 
   auto statusor_runner = OpKernelRunner::Create(
       op_name, ToAbslStringView(device.GetValue()), num_args.GetValue(),
-      attr_builder, *fallback_request_state);
+      attr_builder, fallback_request_state->device_manager(),
+      fallback_request_state->process_function_library_runtime());
   if (!statusor_runner.ok())
     return tfrt::MakeStatusError(statusor_runner.status());
 
diff --git a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_op_handler.cc b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_op_handler.cc
index 9c45f0398b2690..ca99f9478ad482 100644
--- a/tensorflow/core/runtime_fallback/kernel/kernel_fallback_op_handler.cc
+++ b/tensorflow/core/runtime_fallback/kernel/kernel_fallback_op_handler.cc
@@ -146,6 +146,88 @@ struct KernelFallbackOpHandlerCompatTraits {
   }
 };
 
+class OpLocationKey {
+ public:
+  explicit OpLocationKey(tfrt::Location loc) : loc_(loc) {}
+
+  template <typename H>
+  friend H AbslHashValue(H h, const OpLocationKey& key) {
+    // NOTE: Each BEF file has its own LocationHandler. Using LocationHandler
+    // as part of cache key here can avoid cache collision between different
+    // BEF file.
+    return H::combine(std::move(h), key.loc_.data, key.loc_.GetHandler());
+  }
+
+  friend bool operator==(const OpLocationKey& x, const OpLocationKey& y) {
+    return x.loc_.data == y.loc_.data &&
+           x.loc_.GetHandler() == y.loc_.GetHandler();
+  }
+
+ private:
+  tfrt::Location loc_;
+};
+
+// OpKernelRunnerCache is similar to OpKernelRunnerTable but thread-safe.
+class OpKernelRunnerCache {
+ public:
+  OpKernelRunnerCache() = default;
+
+  StatusOr<OpKernelRunner*> GetOrCreate(
+      tfrt::Location loc, absl::string_view op_name,
+      absl::string_view device_name, int num_args,
+      const std::function<Status(tensorflow::AttrValueMap*)>& attr_builder,
+      const tensorflow::DeviceMgr& device_manager,
+      const tensorflow::ProcessFunctionLibraryRuntime&
+          process_function_library_runtime);
+
+ private:
+  mutable mutex mu_;
+  absl::flat_hash_map<OpLocationKey, std::unique_ptr<OpKernelRunner>> map_
+      TF_GUARDED_BY(mu_);
+};
+
+StatusOr<OpKernelRunner*> OpKernelRunnerCache::GetOrCreate(
+    tfrt::Location loc, absl::string_view op_name,
+    absl::string_view device_name, int num_args,
+    const std::function<Status(tensorflow::AttrValueMap*)>& attr_builder,
+    const tensorflow::DeviceMgr& device_manager,
+    const tensorflow::ProcessFunctionLibraryRuntime&
+        process_function_library_runtime) {
+  OpLocationKey key(loc);
+  {
+    tf_shared_lock lock(mu_);
+    auto it = map_.find(key);
+    if (it != map_.end()) {
+      DCHECK_EQ(it->second->op_kernel()->name(), op_name);
+      return it->second.get();
+    }
+  }
+
+  mutex_lock lock(mu_);
+
+  auto it = map_.find(key);
+  if (it != map_.end()) {
+    DCHECK_EQ(it->second->op_kernel()->name(), op_name);
+    return it->second.get();
+  }
+
+  VLOG(1) << "KernelFallbackExecuteCompat creating op " << op_name
+          << " at location " << loc.data << " on device " << device_name;
+
+  TF_ASSIGN_OR_RETURN(
+      auto runner,
+      OpKernelRunner::Create(op_name, device_name, num_args, attr_builder,
+                             device_manager, process_function_library_runtime));
+
+  auto runner_uptr = std::make_unique<OpKernelRunner>(std::move(runner));
+
+  auto* runner_ptr = runner_uptr.get();
+  auto r = map_.emplace(key, std::move(runner_uptr)).second;
+  DCHECK(r);
+
+  return runner_ptr;
+}
+
 }  // namespace
 
 Expected<CoreRuntimeOp> KernelFallbackOpHandler::MakeOp(string_view op_name) {
@@ -205,10 +287,15 @@ Expected<CoreRuntimeOp> KernelFallbackOpHandler::MakeOp(string_view op_name) {
             ToAbslStringView(fallback_op_entry.op_name),
             ToAbslStringView(device()->name()), invocation.arguments.size(),
             [&attrs = invocation.attrs, host = invocation.exec_ctx.host()](
-                tensorflow::AttrValueMap* attr_value_map) -> llvm::Error {
-              return tfd::FillAttrValueMap(attrs, host, attr_value_map);
+                tensorflow::AttrValueMap* attr_value_map) {
+              if (auto error =
+                      tfd::FillAttrValueMap(attrs, host, attr_value_map))
+                return tensorflow::errors::InvalidArgument(tfrt::StrCat(error));
+              return Status::OK();
             },
-            *fallback_op_entry.fallback_request_state);
+            fallback_op_entry.fallback_request_state->device_manager(),
+            fallback_op_entry.fallback_request_state
+                ->process_function_library_runtime());
 
         if (!kernel_runner_or_status.ok()) {
           propagate_error(kernel_runner_or_status.status());
diff --git a/tensorflow/core/runtime_fallback/kernel/op_kernel_runner.cc b/tensorflow/core/runtime_fallback/kernel/op_kernel_runner.cc
index 0118afb6a8b75a..fcec23d5f5f59d 100644
--- a/tensorflow/core/runtime_fallback/kernel/op_kernel_runner.cc
+++ b/tensorflow/core/runtime_fallback/kernel/op_kernel_runner.cc
@@ -20,27 +20,27 @@ namespace tensorflow {
 namespace tfd {
 namespace {
 
-llvm::Error CheckOpDefCompatibility(const tensorflow::OpDef& op_def) {
-  auto check_arg_def = [&](const auto& arg_def) -> llvm::Error {
+Status CheckOpDefCompatibility(const tensorflow::OpDef& op_def) {
+  auto check_arg_def = [&](const auto& arg_def) {
     if (arg_def.is_ref())
-      return tfrt::MakeStringError(
+      return tensorflow::errors::Internal(
           "TFRT kernel fallback error: Unsupported ref args in ",
           op_def.name());
-    return llvm::Error::success();
+    return Status::OK();
   };
 
   for (const auto& arg_def : op_def.input_arg())
-    if (auto error = check_arg_def(arg_def)) return error;
+    TF_RETURN_IF_ERROR(check_arg_def(arg_def));
   for (const auto& arg_def : op_def.output_arg())
-    if (auto error = check_arg_def(arg_def)) return error;
+    TF_RETURN_IF_ERROR(check_arg_def(arg_def));
 
-  return llvm::Error::success();
+  return Status::OK();
 }
 
 // Create a tensorflow::NodeDef from the tensorflow::OpDef and the attributes.
-tfrt::StatusOr<tensorflow::NodeDef> BuildNodeDef(
+StatusOr<tensorflow::NodeDef> BuildNodeDef(
     const tensorflow::OpDef& op_def, int num_args,
-    const std::function<llvm::Error(tensorflow::AttrValueMap*)>& attr_builder) {
+    const std::function<Status(tensorflow::AttrValueMap*)>& attr_builder) {
   tensorflow::NodeDef node_def;
   node_def.set_name(op_def.name());
   node_def.set_op(op_def.name());
@@ -49,9 +49,7 @@ tfrt::StatusOr<tensorflow::NodeDef> BuildNodeDef(
   }
 
   auto* attr_value_map = node_def.mutable_attr();
-  if (auto error = attr_builder(attr_value_map)) {
-    return tensorflow::errors::InvalidArgument(tfrt::StrCat(error));
-  }
+  TF_RETURN_IF_ERROR(attr_builder(attr_value_map));
 
   // For any attr-value pairs that exist in the op def (from op registry)
   // but not in `attr_value_map`, fill them into `attr_value_map`, so that we
@@ -80,15 +78,16 @@ tensorflow::Status CreateOpKernel(
 
 }  // namespace
 
-tfrt::StatusOr<OpKernelRunner> OpKernelRunner::Create(
+StatusOr<OpKernelRunner> OpKernelRunner::Create(
     absl::string_view op_name, absl::string_view device_name, int num_args,
-    const std::function<llvm::Error(tensorflow::AttrValueMap*)>& attr_builder,
-    const KernelFallbackCompatRequestState& fallback_request_state) {
+    const std::function<Status(tensorflow::AttrValueMap*)>& attr_builder,
+    const tensorflow::DeviceMgr& device_manager,
+    const tensorflow::ProcessFunctionLibraryRuntime&
+        process_function_library_runtime) {
   const OpDef* op_def = nullptr;
-  TF_RETURN_IF_ERROR(tensorflow::OpDefForOp(std::string(op_name), &op_def));
-  if (auto error = CheckOpDefCompatibility(*op_def)) {
-    return tensorflow::errors::Internal(tfrt::StrCat(error));
-  }
+  TF_RETURN_IF_ERROR(tensorflow::OpRegistry::Global()->LookUpOpDef(
+      std::string(op_name), &op_def));
+  TF_RETURN_IF_ERROR(CheckOpDefCompatibility(*op_def));
   VLOG(1) << "KernelFallbackExecuteCompat creating op from OpDef: "
           << op_def->DebugString();
 
@@ -105,20 +104,18 @@ tfrt::StatusOr<OpKernelRunner> OpKernelRunner::Create(
   // handle it specially. This is a workaround as the compiler lowering does not
   // use tensorflow format in some cases. Ideally, we should always use device
   // name in tensorflow format in fallback code.
-  Status s = fallback_request_state.device_manager().LookupDevice(device_name,
-                                                                  &device);
+  Status s = device_manager.LookupDevice(device_name, &device);
 
   // Fall back to host device if it fails to find the specified device.
   if (!s.ok()) {
     LOG(ERROR) << "Failed to find device " << device_name
                << " when creating OpKernel: " << op_name << ". Error: " << s;
     LOG(ERROR) << "Fallback to host device instead";
-    device = fallback_request_state.device_manager().HostCPU();
+    device = device_manager.HostCPU();
   }
 
   function_library_runtime =
-      fallback_request_state.process_function_library_runtime().GetFLR(
-          device->name());
+      process_function_library_runtime.GetFLR(device->name());
 
   std::unique_ptr<OpKernel> op_kernel;
   TF_RETURN_IF_ERROR(CreateOpKernel(function_library_runtime,
@@ -164,46 +161,5 @@ void OpKernelRunner::RunAsync(OpKernelContext* context,
   async->ComputeAsync(context, std::move(done_callback));
 }
 
-OpKernelRunnerCache::OpKernelRunnerCache() {}
-
-tfrt::StatusOr<OpKernelRunner*> OpKernelRunnerCache::GetOrCreate(
-    tfrt::Location loc, absl::string_view op_name,
-    absl::string_view device_name, int num_args,
-    const std::function<llvm::Error(tensorflow::AttrValueMap*)>& attr_builder,
-    const KernelFallbackCompatRequestState& fallback_request_state) {
-  OpLocationKey key(loc);
-  {
-    tf_shared_lock lock(mu_);
-    auto it = map_.find(key);
-    if (it != map_.end()) {
-      DCHECK_EQ(it->second->op_kernel()->name(), op_name);
-      return it->second.get();
-    }
-  }
-
-  mutex_lock lock(mu_);
-
-  auto it = map_.find(key);
-  if (it != map_.end()) {
-    DCHECK_EQ(it->second->op_kernel()->name(), op_name);
-    return it->second.get();
-  }
-
-  VLOG(1) << "KernelFallbackExecuteCompat creating op " << op_name
-          << " at location " << loc.data << " on device " << device_name;
-
-  TF_ASSIGN_OR_RETURN(auto runner, OpKernelRunner::Create(
-                                       op_name, device_name, num_args,
-                                       attr_builder, fallback_request_state));
-
-  auto runner_uptr = std::make_unique<OpKernelRunner>(std::move(runner));
-
-  auto* runner_ptr = runner_uptr.get();
-  auto r = map_.emplace(key, std::move(runner_uptr)).second;
-  DCHECK(r);
-
-  return runner_ptr;
-}
-
 }  // namespace tfd
 }  // namespace tensorflow
diff --git a/tensorflow/core/runtime_fallback/kernel/op_kernel_runner.h b/tensorflow/core/runtime_fallback/kernel/op_kernel_runner.h
index 2914d77af60fa2..b6bae584168cec 100644
--- a/tensorflow/core/runtime_fallback/kernel/op_kernel_runner.h
+++ b/tensorflow/core/runtime_fallback/kernel/op_kernel_runner.h
@@ -20,55 +20,35 @@ limitations under the License.
 
 #include <memory>
 #include <string>
-#include <type_traits>
 #include <utility>
 
-#include "absl/container/flat_hash_map.h"
 #include "absl/container/inlined_vector.h"
-#include "absl/meta/type_traits.h"
-#include "absl/types/span.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "tensorflow/core/common_runtime/eager/attr_builder.h"
-#include "tensorflow/core/framework/allocator.h"
-#include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/device.h"
-#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/node_properties.h"
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/platform/errors.h"
-#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/status.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_compat_request_state.h"
-#include "tensorflow/core/runtime_fallback/kernel/kernel_fallback_tensor.h"
-#include "tensorflow/core/runtime_fallback/util/attr_util.h"
-#include "tensorflow/core/tfrt/utils/statusor.h"
-#include "tfrt/core_runtime/op_attrs.h"  // from @tf_runtime
-#include "tfrt/host_context/async_dispatch.h"  // from @tf_runtime
-#include "tfrt/host_context/async_value_ref.h"  // from @tf_runtime
-#include "tfrt/host_context/chain.h"  // from @tf_runtime
-#include "tfrt/host_context/execution_context.h"  // from @tf_runtime
-#include "tfrt/host_context/host_context.h"  // from @tf_runtime
-#include "tfrt/host_context/sync_kernel_frame.h"  // from @tf_runtime
-#include "tfrt/support/error_util.h"  // from @tf_runtime
-#include "tfrt/support/forward_decls.h"  // from @tf_runtime
-#include "tfrt/tensor/tensor.h"  // from @tf_runtime
 
 namespace tensorflow {
 namespace tfd {
 
 class OpKernelRunner {
  public:
-  static tfrt::StatusOr<OpKernelRunner> Create(
+  static StatusOr<OpKernelRunner> Create(
       absl::string_view op_name, absl::string_view device_name, int num_args,
-      const std::function<llvm::Error(tensorflow::AttrValueMap*)>& attr_builder,
-      const KernelFallbackCompatRequestState& fallback_request_state);
+      const std::function<Status(tensorflow::AttrValueMap*)>& attr_builder,
+      const tensorflow::DeviceMgr& device_manager,
+      const tensorflow::ProcessFunctionLibraryRuntime&
+          process_function_library_runtime);
+
+  OpKernelRunner() = default;
+
+  explicit operator bool() const { return op_kernel_ != nullptr; }
 
   void Run(OpKernelContext* context) const {
     DVLOG(1) << "KernelFallbackExecuteCompat Running Op: "
@@ -114,25 +94,39 @@ class OpKernelRunner {
   gtl::InlinedVector<AllocatorAttributes, 1> output_alloc_attrs_;
 };
 
-class OpLocationKey {
- public:
-  explicit OpLocationKey(tfrt::Location loc) : loc_(loc) {}
-
-  template <typename H>
-  friend H AbslHashValue(H h, const OpLocationKey& key) {
-    // NOTE: Each BEF file has its own LocationHandler. Using LocationHandler
-    // as part of cache key here can avoid cache collision between different
-    // BEF file.
-    return H::combine(std::move(h), key.loc_.data, key.loc_.GetHandler());
+// OpKernelRunState keeps the states needed for per-kernel execution.
+struct OpKernelRunState {
+  gtl::InlinedVector<tensorflow::Tensor, 4> input_tf_tensors;
+  gtl::InlinedVector<tensorflow::TensorValue, 4> input_tf_tensor_values;
+  OpKernelContext::Params params;
+
+  OpKernelRunState() = default;
+  OpKernelRunState(
+      const gtl::InlinedVector<tensorflow::TensorValue, 4>& tensor_values,
+      const OpKernelContext::Params& p) {
+    // `input_tf_tensor_values` contains the reference to all tensor used,
+    // while `input_tf_tensors` only contains those needs ownership so their
+    // sizes may not match. For this copy assignment, we conservatively copy all
+    // tensors.
+    input_tf_tensors.reserve(tensor_values.size());
+    for (const auto& tensor_value : tensor_values) {
+      input_tf_tensors.push_back(*tensor_value.tensor);
+    }
+    for (auto& tensor : input_tf_tensors) {
+      input_tf_tensor_values.emplace_back(&tensor);
+    }
+
+    // Since `input_tf_tensor_values` and `params` contains pointers to
+    // `input_tf_tensors`, we need to change those pointers to the correct ones
+    // after copying.
+    params = p;
+    params.inputs = &input_tf_tensor_values;
   }
 
-  friend bool operator==(const OpLocationKey& x, const OpLocationKey& y) {
-    return x.loc_.data == y.loc_.data &&
-           x.loc_.GetHandler() == y.loc_.GetHandler();
-  }
+  OpKernelRunState(const OpKernelRunState& other) = delete;
+  OpKernelRunState& operator=(const OpKernelRunState& other) = delete;
 
- private:
-  tfrt::Location loc_;
+  ~OpKernelRunState() = default;
 };
 
 // OpKernelRunnerTable for keeping OpKernelRunner instances to avoid expensive
@@ -171,23 +165,6 @@ class OpKernelRunnerTable {
   std::vector<absl::optional<OpKernelRunner>> runners_;
 };
 
-// OpKernelRunnerCache is similar to OpKernelRunnerTable but thread-safe.
-class OpKernelRunnerCache {
- public:
-  OpKernelRunnerCache();
-
-  tfrt::StatusOr<OpKernelRunner*> GetOrCreate(
-      tfrt::Location loc, absl::string_view op_name,
-      absl::string_view device_name, int num_args,
-      const std::function<llvm::Error(tensorflow::AttrValueMap*)>& attr_builder,
-      const KernelFallbackCompatRequestState& fallback_request_state);
-
- private:
-  mutable mutex mu_;
-  absl::flat_hash_map<OpLocationKey, std::unique_ptr<OpKernelRunner>> map_
-      TF_GUARDED_BY(mu_);
-};
-
 }  // namespace tfd
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_concurrent_work_queue.cc b/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_concurrent_work_queue.cc
index bfdc777bbafe50..e2db39c80e9f11 100644
--- a/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_concurrent_work_queue.cc
+++ b/tensorflow/core/tfrt/run_handler_thread_pool/run_handler_concurrent_work_queue.cc
@@ -28,12 +28,8 @@ namespace tf {
 RunHandlerThreadWorkQueue::RunHandlerThreadWorkQueue(const Options& options)
     : options_(options),
       quiescing_state_(std::make_unique<::tfrt::internal::QuiescingState>()),
-      // TODO(b/207109369): This work queue will be used for JIT compilation
-      // during model initialization. Until we figured out how to offload all
-      // compilation tasks to a separate work queue, initialize it large enough
-      // to compile multiple kernels concurrently.
       non_blocking_work_queue_(quiescing_state_.get(),
-                               /*num_threads=*/16),
+                               /*num_threads=*/1),
       blocking_work_queue_(quiescing_state_.get(),
                            /*num_threads=*/1) {
   CHECK(options.num_threads_in_sub_thread_pool.size() ==  // Crash OK.
diff --git a/tensorflow/core/tfrt/saved_model/tests/BUILD b/tensorflow/core/tfrt/saved_model/tests/BUILD
index e6617129724d29..2f4ae8f7fc621c 100644
--- a/tensorflow/core/tfrt/saved_model/tests/BUILD
+++ b/tensorflow/core/tfrt/saved_model/tests/BUILD
@@ -30,9 +30,6 @@ pytype_strict_binary(
     srcs_version = "PY3",
     deps = [
         ":disable_tf2",  # build_cleaner: keep; go/disable_tf2
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:math_ops",
@@ -48,6 +45,8 @@ pytype_strict_binary(
         "//tensorflow/python/saved_model:signature_def_utils",
         "//tensorflow/python/saved_model:tag_constants",
         "//tensorflow/python/saved_model:utils",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
     ],
 )
 
@@ -58,9 +57,6 @@ pytype_strict_binary(
     srcs_version = "PY3",
     deps = [
         ":disable_tf2",  # build_cleaner: keep; go/disable_tf2
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:session",
@@ -70,6 +66,8 @@ pytype_strict_binary(
         "//tensorflow/python/saved_model:signature_def_utils",
         "//tensorflow/python/saved_model:tag_constants",
         "//tensorflow/python/saved_model:utils",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
     ],
 )
 
@@ -79,9 +77,6 @@ pytype_strict_binary(
     python_version = "PY3",
     srcs_version = "PY3",
     deps = [
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:math_ops",
@@ -92,6 +87,8 @@ pytype_strict_binary(
         "//tensorflow/python/saved_model:signature_def_utils",
         "//tensorflow/python/saved_model:tag_constants",
         "//tensorflow/python/saved_model:utils",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
     ],
 )
 
@@ -101,9 +98,6 @@ pytype_strict_binary(
     python_version = "PY3",
     srcs_version = "PY3",
     deps = [
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:math_ops",
@@ -115,6 +109,8 @@ pytype_strict_binary(
         "//tensorflow/python/saved_model:signature_def_utils",
         "//tensorflow/python/saved_model:tag_constants",
         "//tensorflow/python/saved_model:utils",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
     ],
 )
 
@@ -124,9 +120,6 @@ pytype_strict_binary(
     python_version = "PY3",
     srcs_version = "PY3",
     deps = [
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:math_ops",
@@ -139,6 +132,8 @@ pytype_strict_binary(
         "//tensorflow/python/saved_model:signature_def_utils",
         "//tensorflow/python/saved_model:tag_constants",
         "//tensorflow/python/saved_model:utils",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
     ],
 )
 
@@ -148,10 +143,6 @@ pytype_strict_binary(
     python_version = "PY3",
     srcs_version = "PY3",
     deps = [
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
-        "@absl_py//absl/logging",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:tensor_spec",
@@ -161,6 +152,9 @@ pytype_strict_binary(
         "//tensorflow/python/module",
         "//tensorflow/python/saved_model:save",
         "//tensorflow/python/saved_model:save_options",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
+        "@absl_py//absl/logging",
     ],
 )
 
@@ -170,10 +164,6 @@ pytype_strict_binary(
     python_version = "PY3",
     srcs_version = "PY3",
     deps = [
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
-        "@absl_py//absl/logging",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:tensor_spec",
         "//tensorflow/python/compat:v2_compat",
@@ -182,6 +172,9 @@ pytype_strict_binary(
         "//tensorflow/python/module",
         "//tensorflow/python/saved_model:save",
         "//tensorflow/python/saved_model:save_options",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
+        "@absl_py//absl/logging",
     ],
 )
 
@@ -192,9 +185,6 @@ pytype_strict_binary(
     srcs_version = "PY3",
     deps = [
         ":disable_tf2",  # build_cleaner: keep; go/disable_tf2
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:math_ops",
@@ -212,6 +202,8 @@ pytype_strict_binary(
         "//tensorflow/python/saved_model:signature_def_utils",
         "//tensorflow/python/saved_model:tag_constants",
         "//tensorflow/python/saved_model:utils",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
     ],
 )
 
@@ -222,9 +214,6 @@ pytype_strict_binary(
     srcs_version = "PY3",
     deps = [
         ":disable_tf2",  # build_cleaner: keep; go/disable_tf2
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:math_ops",
@@ -240,6 +229,8 @@ pytype_strict_binary(
         "//tensorflow/python/saved_model:signature_def_utils",
         "//tensorflow/python/saved_model:tag_constants",
         "//tensorflow/python/saved_model:utils",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
     ],
 )
 
@@ -249,10 +240,6 @@ pytype_strict_binary(
     python_version = "PY3",
     srcs_version = "PY3",
     deps = [
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
-        "@absl_py//absl/logging",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:math_ops",
@@ -264,6 +251,9 @@ pytype_strict_binary(
         "//tensorflow/python/module",
         "//tensorflow/python/saved_model:save",
         "//tensorflow/python/saved_model:save_options",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
+        "@absl_py//absl/logging",
     ],
 )
 
@@ -274,9 +264,6 @@ pytype_strict_binary(
     srcs_version = "PY3",
     deps = [
         ":disable_tf2",  # build_cleaner: keep; go/disable_tf2
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
@@ -293,6 +280,8 @@ pytype_strict_binary(
         "//tensorflow/python/saved_model:signature_def_utils",
         "//tensorflow/python/saved_model:tag_constants",
         "//tensorflow/python/saved_model:utils",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
     ],
 )
 
@@ -331,9 +320,6 @@ pytype_strict_binary(
     srcs_version = "PY3",
     deps = [
         ":disable_tf2",  # build_cleaner: keep; go/disable_tf2
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -351,6 +337,8 @@ pytype_strict_binary(
         "//tensorflow/python/saved_model:signature_def_utils",
         "//tensorflow/python/saved_model:tag_constants",
         "//tensorflow/python/saved_model:utils",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
     ],
 )
 
@@ -361,9 +349,6 @@ pytype_strict_binary(
     srcs_version = "PY3",
     deps = [
         ":disable_tf2",  # build_cleaner: keep; go/disable_tf2
-        # google-internal: use_pure_python
-        "@absl_py//absl:app",
-        "@absl_py//absl/flags",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
@@ -381,6 +366,8 @@ pytype_strict_binary(
         "//tensorflow/python/saved_model:signature_def_utils",
         "//tensorflow/python/saved_model:tag_constants",
         "//tensorflow/python/saved_model:utils",
+        "@absl_py//absl:app",
+        "@absl_py//absl/flags",
     ],
 )
 
diff --git a/tensorflow/core/tpu/tpu_executor_init_fns.inc b/tensorflow/core/tpu/tpu_executor_init_fns.inc
index 811db72befb8f8..d71238bf782355 100644
--- a/tensorflow/core/tpu/tpu_executor_init_fns.inc
+++ b/tensorflow/core/tpu/tpu_executor_init_fns.inc
@@ -160,6 +160,9 @@ tensorflow::Status SetExecutorStructFn(void* library_handle) {
   TFTPU_SET_FN(executor_fn, XlaShapeToTpuShapeRepresentation);
   TFTPU_SET_FN(executor_fn, XlaShapeToTpuPaddedShape);
 
+  TFTPU_SET_FN(executor_fn, TpuAsyncCollectiveOffloadHelper_Init);
+  TFTPU_SET_FN(executor_fn, TpuAsyncCollectiveOffloadHelper_Shutdown);
+
   return tensorflow::Status::OK();
 }
 
diff --git a/tensorflow/core/transforms/BUILD b/tensorflow/core/transforms/BUILD
index 706546a3dbbab6..83665b2dedb8d3 100644
--- a/tensorflow/core/transforms/BUILD
+++ b/tensorflow/core/transforms/BUILD
@@ -5,7 +5,10 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests")
 
 package(
     default_compatible_with = get_compatible_with_cloud(),
-    default_visibility = [":__subpackages__"],
+    default_visibility = [
+        ":__subpackages__",
+        "//tensorflow/compiler/mlir/tensorflow:__subpackages__",
+    ],
     licenses = ["notice"],  # Apache 2.0
 )
 
diff --git a/tensorflow/core/transforms/toposort/toposort_pass.cc b/tensorflow/core/transforms/toposort/toposort_pass.cc
index 8f760ad2a16c62..8d820c7c42d62e 100644
--- a/tensorflow/core/transforms/toposort/toposort_pass.cc
+++ b/tensorflow/core/transforms/toposort/toposort_pass.cc
@@ -28,6 +28,8 @@ namespace {
 #define GEN_PASS_CLASSES
 #include "tensorflow/core/transforms/passes.h.inc"
 
+}  // end namespace
+
 void SortTopologically(Block *block) {
   if (block->empty() || llvm::hasSingleElement(*block)) return;
 
@@ -92,7 +94,6 @@ struct TopoSortPass : TopoSortBase<TopoSortPass> {
   }
 };
 
-}  // end namespace
 }  // namespace tfg
 }  // namespace mlir
 
diff --git a/tensorflow/core/util/autotune_maps/autotune_maps_utils.cc b/tensorflow/core/util/autotune_maps/autotune_maps_utils.cc
index dbe05f067b8983..4eb4d44af71bc7 100644
--- a/tensorflow/core/util/autotune_maps/autotune_maps_utils.cc
+++ b/tensorflow/core/util/autotune_maps/autotune_maps_utils.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/util/autotune_maps/autotune_maps_utils.h"
 
+#include <string>
+
 #include "absl/strings/str_format.h"
 #include "tensorflow/core/platform/errors.h"
 #include "tensorflow/core/platform/hash.h"
@@ -29,6 +31,7 @@ namespace autotune_maps_utils {
 
 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 namespace {
+
 using ::stream_executor::gpu::GpuDeviceHandle;
 using ::stream_executor::gpu::GpuDriver;
 
@@ -75,11 +78,7 @@ std::string DeviceIdToIdentifier(int device_id) {
   // destruct in multi-thread setting.
   static const auto& map =
       *new std::vector<string>(GetDeviceIdToIdentifierMap());
-  if (device_id >= map.size()) {
-    return "Unknown Graphics Device";
-  } else {
-    return map[device_id];
-  }
+  return device_id < map.size() ? map[device_id] : "Unknown Graphics Device";
 }
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 
diff --git a/tensorflow/core/util/autotune_maps/autotune_serialize.cc b/tensorflow/core/util/autotune_maps/autotune_serialize.cc
index cd6bde2fa1018d..dc1e73aee7e98d 100644
--- a/tensorflow/core/util/autotune_maps/autotune_serialize.cc
+++ b/tensorflow/core/util/autotune_maps/autotune_serialize.cc
@@ -17,9 +17,11 @@ limitations under the License.
 #include "tensorflow/core/util/autotune_maps/autotune_serialize.h"
 
 #include <map>
+#include <string>
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/core/platform/str_util.h"
 #include "tensorflow/core/util/activation_mode.h"
 #include "tensorflow/core/util/autotune_maps/autotune_map.pb.h"
 #include "tensorflow/core/util/autotune_maps/autotune_maps_utils.h"
@@ -87,12 +89,16 @@ template <typename Op>
 Status PopulateConvMap(
     const ConvMapProto &m,
     AutotuneMap<ConvParameters, AutotuneEntry<Op>> *autotune_map) {
+  if (m.kv_pairs().size() == 0) {
+    return Status::OK();
+  }
   // Map device_id's to corresponding device_identifiers.
   std::vector<string> device_ids_map =
       autotune_maps_utils::GetDeviceIdToIdentifierMap();
   // Map device_identifiers to device_ids whose corresponding GPU devices have
   // the given device_identifier.
   std::unordered_map<string, std::vector<int>> device_identifiers_map;
+  bool devices_matched = false;
   for (const ConvMapProto::Entry &kv : m.kv_pairs()) {
     const ConvParametersProto &params_proto = kv.key();
     // Abort loading process whenever there is an entry whose version number
@@ -139,10 +145,26 @@ Status PopulateConvMap(
     } else {
       device_ids = iter->second;
     }
+
+    if (device_ids.empty()) {
+      LOG(WARNING) << "No matching devices found for "
+                   << params_proto.device_identifier() << "; existing devices: "
+                   << str_util::Join(device_ids_map, ", ");
+    } else {
+      devices_matched = true;
+    }
+
     for (int device_id : device_ids) {
       autotune_map->Insert(ConvParameters(device_id, params_proto), entry);
     }
   }
+
+  // When no matching devices are found, populating autotuning map will not
+  // happen. Instead of silently reporting an OK status, report an error back.
+  if (!devices_matched) {
+    return errors::NotFound("No matching devices found for ",
+                            str_util::Join(device_ids_map, ", "));
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/util/autotune_maps/autotune_serialize_test.cc b/tensorflow/core/util/autotune_maps/autotune_serialize_test.cc
index 48f857ed9c4f3d..89d22fef1b4554 100644
--- a/tensorflow/core/util/autotune_maps/autotune_serialize_test.cc
+++ b/tensorflow/core/util/autotune_maps/autotune_serialize_test.cc
@@ -56,11 +56,11 @@ TEST(AutotuneSerializeTest, Consistency) {
   TF_CHECK_OK(GpuDriver::Init());
   ResetAutotuneMaps();
   ConvParameters conv_params_example_a = {
-      /*batch_size=*/1,
+      /*batch=*/1,
       /*in_depths=*/1,
       /*in=*/{{1, 1}},
       /*data_format=*/TensorFormat::FORMAT_NCHW,
-      /*out_depth=*/1,
+      /*out_depths=*/1,
       /*filter=*/{{1, 1}},
       /*dilation=*/{{1, 1}},
       /*stride=*/{{1, 1}},
@@ -69,11 +69,11 @@ TEST(AutotuneSerializeTest, Consistency) {
       /*device_id=*/0,
       /*group_count=*/1};
   ConvParameters fused_params_example_a = {
-      /*batch_size=*/1,
+      /*batch=*/1,
       /*in_depths=*/1,
       /*in=*/{{1, 1}},
       /*data_format=*/TensorFormat::FORMAT_NCHW,
-      /*out_depth=*/1,
+      /*out_depths=*/1,
       /*filter=*/{{1, 1}},
       /*dilation=*/{{1, 1}},
       /*stride=*/{{1, 1}},
@@ -87,11 +87,11 @@ TEST(AutotuneSerializeTest, Consistency) {
                                  /*is_contrib=*/false},
   };
   ConvParameters contrib_fused_params_example_a = {
-      /*batch_size=*/1,
+      /*batch=*/1,
       /*in_depths=*/1,
       /*in=*/{{1, 1}},
       /*data_format=*/TensorFormat::FORMAT_NCHW,
-      /*out_depth=*/1,
+      /*out_depths=*/1,
       /*filter=*/{{1, 1}},
       /*dilation=*/{{1, 1}},
       /*stride=*/{{1, 1}},
@@ -104,8 +104,8 @@ TEST(AutotuneSerializeTest, Consistency) {
                                  se::dnn::ActivationMode::kRelu,
                                  /*is_contrib=*/true}};
 
-  AlgorithmDesc algorithm(/*algo_id=*/1, /*use_tensor_op=*/true);
-  AlgorithmDesc algorithm_no_scratch(/*algo_id=*/1, /*use_tensor_op=*/true);
+  AlgorithmDesc algorithm(/*algo_id=*/1, /*use_tensor_ops=*/true);
+  AlgorithmDesc algorithm_no_scratch(/*algo_id=*/1, /*use_tensor_ops=*/true);
   AutotuneEntry<se::dnn::ConvOp> example_a(algorithm, algorithm_no_scratch);
   ConvAutotuneMap::GetInstance()->Insert(conv_params_example_a, example_a);
   ConvAutotuneMap::GetInstance()->Insert(fused_params_example_a, example_a);
@@ -136,11 +136,11 @@ TEST(AutotuneSerializeTest, VersionControl) {
   ResetAutotuneMaps();
 
   ConvParameters fused_params_example_a = {
-      /*batch_size=*/1,
+      /*batch=*/1,
       /*in_depths=*/1,
       /*in=*/{{1, 1}},
       /*data_format=*/TensorFormat::FORMAT_NCHW,
-      /*out_depth=*/1,
+      /*out_depths=*/1,
       /*filter=*/{{1, 1}},
       /*dilation=*/{{1, 1}},
       /*stride=*/{{1, 1}},
@@ -154,8 +154,8 @@ TEST(AutotuneSerializeTest, VersionControl) {
                                  /*is_contrib=*/false},
       /*version=*/ConvParameters::kVersion - 1};
 
-  AlgorithmDesc algorithm(/*algo_id=*/1, /*use_tensor_op=*/true);
-  AlgorithmDesc algorithm_no_scratch(/*algo_id=*/1, /*use_tensor_op=*/true);
+  AlgorithmDesc algorithm(/*algo_id=*/1, /*use_tensor_ops=*/true);
+  AlgorithmDesc algorithm_no_scratch(/*algo_id=*/1, /*use_tensor_ops=*/true);
   AlgorithmConfig algorithm_config_example_a(algorithm, /*scratch_size=*/1,
                                              algorithm_no_scratch);
 
diff --git a/tensorflow/core/util/gpu_solvers.h b/tensorflow/core/util/gpu_solvers.h
index 5359a28a02568c..60a3e5985450d3 100644
--- a/tensorflow/core/util/gpu_solvers.h
+++ b/tensorflow/core/util/gpu_solvers.h
@@ -102,7 +102,8 @@ inline typename ROCmComplexT<T>::type* ROCmComplex(T* p) {
 // Template to give the Rocblas adjoint operation for real and complex types.
 template <typename T>
 rocblas_operation RocblasAdjointOp() {
-  return Eigen::NumTraits<T>::IsComplex ? rocblas_operation_conjugate_transpose : rocblas_operation_transpose;
+  return Eigen::NumTraits<T>::IsComplex ? rocblas_operation_conjugate_transpose
+                                        : rocblas_operation_transpose;
 }
 #endif
 
@@ -236,7 +237,6 @@ class GpuSolver {
 
   // LU factorization.
   // Computes LU factorization with partial pivoting P * A = L * U.
-
   template <typename Scalar>
   Status Getrf(int m, int n, Scalar* dev_A, int lda, int* dev_pivots,
                int* info);
@@ -268,6 +268,14 @@ class GpuSolver {
                       const Scalar* const host_a_inverse_dev_ptrs[], int ldainv,
                       DeviceLapackInfo* dev_lapack_info, int batch_size);
 
+  // Computes matrix inverses for a batch of small matrices with size n < 32.
+  // Returns Status::OK() if the kernel was launched successfully.
+  template <typename Scalar>
+  Status MatInvBatched(int n, const Scalar* const host_a_dev_ptrs[], int lda,
+                       const Scalar* const host_a_inverse_dev_ptrs[],
+                       int ldainv, DeviceLapackInfo* dev_lapack_info,
+                       int batch_size);
+
   // Cholesky factorization
   // Computes the Cholesky factorization A = L * L^H for a batch of small
   // matrices.
@@ -283,55 +291,36 @@ class GpuSolver {
 
   // QR factorization.
   // Computes QR factorization A = Q * R.
-  // Returns Status::OK() if the kernel was launched successfully.
-  // See: http://docs.nvidia.com/cuda/cusolver/#cuds-lt-t-gt-geqrf
   template <typename Scalar>
   Status Geqrf(int m, int n, Scalar* dev_A, int lda, Scalar* dev_tau,
                int* dev_lapack_info);
 
-
   // This function performs the matrix-matrix addition/transposition
   //   C = alpha * op(A) + beta * op(B).
-  // Returns Status::OK() if the kernel was launched successfully.  See:
-  // http://docs.nvidia.com/cuda/cublas/index.html#cublas-lt-t-gt-geam
-  // NOTE(ebrevdo): Does not support in-place transpose of non-square
-  // matrices.
   template <typename Scalar>
   Status Geam(rocblas_operation transa, rocblas_operation transb, int m, int n,
               const Scalar* alpha, /* host or device pointer */
               const Scalar* A, int lda,
               const Scalar* beta, /* host or device pointer */
-              const Scalar* B, int ldb, Scalar* C,
-              int ldc);
+              const Scalar* B, int ldb, Scalar* C, int ldc);
 
   // Overwrite matrix C by product of C and the unitary Householder matrix Q.
   // The Householder matrix Q is represented by the output from Geqrf in dev_a
   // and dev_tau.
-  // Returns Status::OK() if the kernel was launched successfully.
   template <typename Scalar>
-  Status Unmqr(rocblas_side side, rocblas_operation trans, int m, int n,
-               int k, const Scalar* dev_a, int lda, const Scalar* dev_tau,
+  Status Unmqr(rocblas_side side, rocblas_operation trans, int m, int n, int k,
+               const Scalar* dev_a, int lda, const Scalar* dev_tau,
                Scalar* dev_c, int ldc, int* dev_lapack_info);
 
   // Overwrites QR factorization produced by Geqrf by the unitary Householder
   // matrix Q. On input, the Householder matrix Q is represented by the output
   // from Geqrf in dev_a and dev_tau. On output, dev_a is overwritten with the
   // first n columns of Q. Requires m >= n >= 0.
-  // Returns Status::OK() if the kernel was launched successfully.
   template <typename Scalar>
   Status Ungqr(int m, int n, int k, Scalar* dev_a, int lda,
                const Scalar* dev_tau, int* dev_lapack_info);
 
-
-  // Computes matrix inverses for a batch of small matrices with size n < 32.
-  // Returns Status::OK() if the kernel was launched successfully.
-  template <typename Scalar>
-  Status MatInvBatched(int n, const Scalar* const host_a_dev_ptrs[], int lda,
-                       const Scalar* const host_a_inverse_dev_ptrs[],
-                       int ldainv, DeviceLapackInfo* dev_lapack_info,
-                       int batch_size);
-
-#else //GOOGLE_CUDA
+#else  // GOOGLE_CUDA
   // ====================================================================
   // Wrappers for cuSolverDN and cuBlas solvers start here.
   //
@@ -502,6 +491,7 @@ class GpuSolver {
                      const Scalar* const dev_Aarray[], int lda,
                      Scalar* dev_Barray[], int ldb, int batch_size);
 #endif
+
  private:
   OpKernelContext* context_;  // not owned.
 #if GOOGLE_CUDA
diff --git a/tensorflow/core/util/rocm_solvers.cc b/tensorflow/core/util/rocm_solvers.cc
index edbaccc0b467f2..d3278402707f26 100644
--- a/tensorflow/core/util/rocm_solvers.cc
+++ b/tensorflow/core/util/rocm_solvers.cc
@@ -222,7 +222,8 @@ void GpuSolver::CheckLapackInfoAndDeleteSolverAsync(
 #define TF_CALL_LAPACK_TYPES(m) \
   m(float, s) m(double, d) m(std::complex<float>, c) m(std::complex<double>, z)
 #define TF_CALL_LAPACK_TYPES_NO_COMPLEX(m) m(float, s) m(double, d)
-#define TF_CALL_LAPACK_TYPES_NO_REAL(m) m(std::complex<float>, c) m(std::complex<double>, z)
+#define TF_CALL_LAPACK_TYPES_NO_REAL(m) \
+  m(std::complex<float>, c) m(std::complex<double>, z)
 
 #define BLAS_SOLVER_FN(method, type_prefix) \
   wrap::rocblas##_##type_prefix##method
@@ -244,81 +245,92 @@ void GpuSolver::CheckLapackInfoAndDeleteSolverAsync(
 
 TF_CALL_LAPACK_TYPES(GETRF_INSTANCE);
 
-#define GEQRF_INSTANCE(Scalar, type_prefix)                                      \
-  template <>                                                                    \
-  Status GpuSolver::Geqrf(int m, int n, Scalar* dev_A, int lda, Scalar* dev_tau, \
-                int* dev_lapack_info){                                           \
-      mutex_lock lock(handle_map_mutex);                                         \
-      using ROCmScalar = typename ROCmComplexT<Scalar>::type;                    \
-      TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(geqrf, type_prefix)(                  \
-          rocm_blas_handle_, m, n, reinterpret_cast<ROCmScalar*>(dev_A), lda,    \
-          reinterpret_cast<ROCmScalar*>(dev_tau)));             \
-      return Status::OK();                                                       \
+#define POTRF_INSTANCE(Scalar, type_prefix)                                    \
+  template <>                                                                  \
+  Status GpuSolver::Potrf<Scalar>(rocblas_fill uplo, int n, Scalar* dev_A,     \
+                                  int lda, int* dev_lapack_info) {             \
+    mutex_lock lock(handle_map_mutex);                                         \
+    using ROCmScalar = typename ROCmComplexT<Scalar>::type;                    \
+    TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(potrf, type_prefix)(                  \
+        rocm_blas_handle_, uplo, n, reinterpret_cast<ROCmScalar*>(dev_A), lda, \
+        dev_lapack_info));                                                     \
+    return Status::OK();                                                       \
+  }
+
+#define GEQRF_INSTANCE(Scalar, type_prefix)                                 \
+  template <>                                                               \
+  Status GpuSolver::Geqrf(int m, int n, Scalar* dev_A, int lda,             \
+                          Scalar* dev_tau, int* dev_lapack_info) {          \
+    mutex_lock lock(handle_map_mutex);                                      \
+    using ROCmScalar = typename ROCmComplexT<Scalar>::type;                 \
+    TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(geqrf, type_prefix)(               \
+        rocm_blas_handle_, m, n, reinterpret_cast<ROCmScalar*>(dev_A), lda, \
+        reinterpret_cast<ROCmScalar*>(dev_tau)));                           \
+    return Status::OK();                                                    \
   }
 
 TF_CALL_LAPACK_TYPES(GEQRF_INSTANCE);
 
-#define UMMQR_INSTANCE(Scalar, type_prefix)                                          \
-  template <>                                                                        \
-  Status GpuSolver::Unmqr(rocblas_side side, rocblas_operation trans, int m, int n,  \
-               int k, const Scalar* dev_a, int lda, const Scalar* dev_tau,           \
-               Scalar* dev_c, int ldc, int* dev_lapack_info){                         \
-      mutex_lock lock(handle_map_mutex);                                              \
-      using ROCmScalar = typename ROCmComplexT<Scalar>::type;                          \
-      ScratchSpace<uint8> dev_a_copy =                                                 \
-        this->GetScratchSpace<uint8>(sizeof(ROCmScalar*) * m*k, "",                    \
-        /*on host */ false);                                                           \
-      if (!CopyHostToDevice(context_, dev_a_copy.mutable_data(), dev_a,                \
-                          dev_a_copy.bytes())) {                                        \
-      return errors::Internal("Unmqr: Failed to copy ptrs to device");                  \
-      }                                                                                 \
-      ScratchSpace<uint8> dev_tau_copy =                                                \
-        this->GetScratchSpace<uint8>(sizeof(ROCmScalar*) *k*n, "",                     \
-        /*on host */ false);                                                            \
-      if (!CopyHostToDevice(context_, dev_tau_copy.mutable_data(), dev_tau,             \
-                          dev_tau_copy.bytes())) {                                      \
-      return errors::Internal("Unmqr: Failed to copy ptrs to device");                  \
-      }                                                                                   \
-      TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(unmqr, type_prefix)(                               \
-          rocm_blas_handle_,side,trans, m, n, k, reinterpret_cast<ROCmScalar*>(dev_a_copy.mutable_data()), lda,    \
-          reinterpret_cast<ROCmScalar*>(dev_tau_copy.mutable_data()),reinterpret_cast<ROCmScalar*>(dev_c), ldc));             \
-      return Status::OK();    \
-}
+#define UMMQR_INSTANCE(Scalar, type_prefix)                                  \
+  template <>                                                                \
+  Status GpuSolver::Unmqr(rocblas_side side, rocblas_operation trans, int m, \
+                          int n, int k, const Scalar* dev_a, int lda,        \
+                          const Scalar* dev_tau, Scalar* dev_c, int ldc,     \
+                          int* dev_lapack_info) {                            \
+    mutex_lock lock(handle_map_mutex);                                       \
+    using ROCmScalar = typename ROCmComplexT<Scalar>::type;                  \
+    ScratchSpace<uint8> dev_a_copy = this->GetScratchSpace<uint8>(           \
+        sizeof(ROCmScalar*) * m * k, "", /*on host */ false);                \
+    if (!CopyHostToDevice(context_, dev_a_copy.mutable_data(), dev_a,        \
+                          dev_a_copy.bytes())) {                             \
+      return errors::Internal("Unmqr: Failed to copy ptrs to device");       \
+    }                                                                        \
+    ScratchSpace<uint8> dev_tau_copy = this->GetScratchSpace<uint8>(         \
+        sizeof(ROCmScalar*) * k * n, "", /*on host */ false);                \
+    if (!CopyHostToDevice(context_, dev_tau_copy.mutable_data(), dev_tau,    \
+                          dev_tau_copy.bytes())) {                           \
+      return errors::Internal("Unmqr: Failed to copy ptrs to device");       \
+    }                                                                        \
+    TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(unmqr, type_prefix)(                \
+        rocm_blas_handle_, side, trans, m, n, k,                             \
+        reinterpret_cast<ROCmScalar*>(dev_a_copy.mutable_data()), lda,       \
+        reinterpret_cast<ROCmScalar*>(dev_tau_copy.mutable_data()),          \
+        reinterpret_cast<ROCmScalar*>(dev_c), ldc));                         \
+    return Status::OK();                                                     \
+  }
 
 TF_CALL_LAPACK_TYPES_NO_REAL(UMMQR_INSTANCE);
 
-#define UNGQR_INSTANCE(Scalar, type_prefix)                                          \
-  template <>                                                                        \
-  Status GpuSolver::Ungqr(int m, int n, int k, Scalar* dev_a, int lda,               \
-               const Scalar* dev_tau, int* dev_lapack_info){                         \
-      mutex_lock lock(handle_map_mutex);                                              \
-      using ROCmScalar = typename ROCmComplexT<Scalar>::type;                          \
-      ScratchSpace<uint8> dev_tau_copy =                                                \
-        this->GetScratchSpace<uint8>(sizeof(ROCmScalar*) *k*n, "",                     \
-        /*on host */ false);                                                            \
-      if (!CopyHostToDevice(context_, dev_tau_copy.mutable_data(), dev_tau,             \
-                          dev_tau_copy.bytes())) {                                      \
-      return errors::Internal("Ungqr: Failed to copy ptrs to device");                  \
-      }                                                                                   \
-      TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(ungqr, type_prefix)(                               \
-          rocm_blas_handle_, m, n, k, reinterpret_cast<ROCmScalar*>(dev_a), lda,    \
-          reinterpret_cast<ROCmScalar*>(dev_tau_copy.mutable_data())));             \
-      return Status::OK();    \
-}
+#define UNGQR_INSTANCE(Scalar, type_prefix)                                    \
+  template <>                                                                  \
+  Status GpuSolver::Ungqr(int m, int n, int k, Scalar* dev_a, int lda,         \
+                          const Scalar* dev_tau, int* dev_lapack_info) {       \
+    mutex_lock lock(handle_map_mutex);                                         \
+    using ROCmScalar = typename ROCmComplexT<Scalar>::type;                    \
+    ScratchSpace<uint8> dev_tau_copy = this->GetScratchSpace<uint8>(           \
+        sizeof(ROCmScalar*) * k * n, "", /*on host */ false);                  \
+    if (!CopyHostToDevice(context_, dev_tau_copy.mutable_data(), dev_tau,      \
+                          dev_tau_copy.bytes())) {                             \
+      return errors::Internal("Ungqr: Failed to copy ptrs to device");         \
+    }                                                                          \
+    TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(ungqr, type_prefix)(                  \
+        rocm_blas_handle_, m, n, k, reinterpret_cast<ROCmScalar*>(dev_a), lda, \
+        reinterpret_cast<ROCmScalar*>(dev_tau_copy.mutable_data())));          \
+    return Status::OK();                                                       \
+  }
 
 TF_CALL_LAPACK_TYPES_NO_REAL(UNGQR_INSTANCE);
 
-
-#define POTRF_INSTANCE(Scalar, type_prefix)                                   \
-  template <>                                                                 \
-  Status GpuSolver::Potrf<Scalar>(rocblas_fill uplo, int n, Scalar* dev_A,    \
-  int lda, int* dev_lapack_info) {                                            \
-    mutex_lock lock(handle_map_mutex);                                        \
-    using ROCmScalar = typename ROCmComplexT<Scalar>::type;                   \
-    TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(potrf, type_prefix)(                 \
-        rocm_blas_handle_, uplo, n, reinterpret_cast<ROCmScalar*>(dev_A),     \
-        lda, dev_lapack_info));                                               \
-    return Status::OK();                                                      \
+#define POTRF_INSTANCE(Scalar, type_prefix)                                    \
+  template <>                                                                  \
+  Status GpuSolver::Potrf<Scalar>(rocblas_fill uplo, int n, Scalar* dev_A,     \
+                                  int lda, int* dev_lapack_info) {             \
+    mutex_lock lock(handle_map_mutex);                                         \
+    using ROCmScalar = typename ROCmComplexT<Scalar>::type;                    \
+    TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(potrf, type_prefix)(                  \
+        rocm_blas_handle_, uplo, n, reinterpret_cast<ROCmScalar*>(dev_A), lda, \
+        dev_lapack_info));                                                     \
+    return Status::OK();                                                       \
   }
 
 TF_CALL_LAPACK_TYPES(POTRF_INSTANCE);
@@ -360,43 +372,6 @@ TF_CALL_LAPACK_TYPES(GETRS_INSTANCE);
 
 TF_CALL_LAPACK_TYPES(GETRF_BATCHED_INSTANCE);
 
-#define GETRI_BATCHED_INSTANCE(Scalar, type_prefix)                            \
-  template <>                                                                  \
-  Status GpuSolver::GetriBatched<Scalar>(                                      \
-                      int n, const Scalar* const host_a_dev_ptrs[], int lda, \
-                      const int* dev_pivots, const Scalar* const host_a_inverse_dev_ptrs[], \
-                      int ldainv, DeviceLapackInfo* dev_lapack_info, int batch_size) {                                                  \
-    mutex_lock lock(handle_map_mutex);                                         \
-    rocblas_stride stride = n;                                                 \
-    using ROCmScalar = typename ROCmComplexT<Scalar>::type;                   \
-    ScratchSpace<uint8> dev_a = this->GetScratchSpace<uint8>(                 \
-        sizeof(ROCmScalar*) * batch_size, "", /*on host */ false);            \
-    if (!CopyHostToDevice(context_, dev_a.mutable_data(), host_a_dev_ptrs,    \
-                          dev_a.bytes())) {                                   \
-      return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
-    }                                                                                   \
-    ScratchSpace<uint8> dev_a_inverse = this->GetScratchSpace<uint8>(                 \
-        sizeof(ROCmScalar*) * batch_size, "", /*on host */ false);            \
-    if (!CopyHostToDevice(context_, dev_a_inverse.mutable_data(), host_a_inverse_dev_ptrs,    \
-                          dev_a_inverse.bytes())) {                                   \
-      return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
-    }                                                                          \
-    ScratchSpace<uint8> pivots = this->GetScratchSpace<uint8>(                 \
-        sizeof(ROCmScalar*) * batch_size, "", /*on host */ false);            \
-    if (!CopyHostToDevice(context_, pivots.mutable_data(), dev_pivots,    \
-                          pivots.bytes())) {                                   \
-      return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
-    }                                                                          \
-    TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(getri_batched, type_prefix)(          \
-        rocm_blas_handle_, n,                                                  \
-        reinterpret_cast<ROCmScalar**>(dev_a.mutable_data()), lda,              \
-        reinterpret_cast<int*>(pivots.mutable_data()),                           \
-        stride, dev_lapack_info->mutable_data(), batch_size));                   \
-    return Status::OK();                                                       \
-  }
-
-TF_CALL_LAPACK_TYPES(GETRI_BATCHED_INSTANCE);
-
 #define POTRF_BATCHED_INSTANCE(Scalar, type_prefix)                           \
   template <>                                                                 \
   Status GpuSolver::PotrfBatched<Scalar>(                                     \
@@ -449,6 +424,43 @@ TF_CALL_LAPACK_TYPES(POTRF_BATCHED_INSTANCE);
 
 TF_CALL_LAPACK_TYPES(GETRS_BATCHED_INSTANCE);
 
+#define GETRI_BATCHED_INSTANCE(Scalar, type_prefix)                           \
+  template <>                                                                 \
+  Status GpuSolver::GetriBatched<Scalar>(                                     \
+      int n, const Scalar* const host_a_dev_ptrs[], int lda,                  \
+      const int* dev_pivots, const Scalar* const host_a_inverse_dev_ptrs[],   \
+      int ldainv, DeviceLapackInfo* dev_lapack_info, int batch_size) {        \
+    mutex_lock lock(handle_map_mutex);                                        \
+    rocblas_stride stride = n;                                                \
+    using ROCmScalar = typename ROCmComplexT<Scalar>::type;                   \
+    ScratchSpace<uint8> dev_a = this->GetScratchSpace<uint8>(                 \
+        sizeof(ROCmScalar*) * batch_size, "", /*on host */ false);            \
+    if (!CopyHostToDevice(context_, dev_a.mutable_data(), host_a_dev_ptrs,    \
+                          dev_a.bytes())) {                                   \
+      return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
+    }                                                                         \
+    ScratchSpace<uint8> dev_a_inverse = this->GetScratchSpace<uint8>(         \
+        sizeof(ROCmScalar*) * batch_size, "", /*on host */ false);            \
+    if (!CopyHostToDevice(context_, dev_a_inverse.mutable_data(),             \
+                          host_a_inverse_dev_ptrs, dev_a_inverse.bytes())) {  \
+      return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
+    }                                                                         \
+    ScratchSpace<uint8> pivots = this->GetScratchSpace<uint8>(                \
+        sizeof(ROCmScalar*) * batch_size, "", /*on host */ false);            \
+    if (!CopyHostToDevice(context_, pivots.mutable_data(), dev_pivots,        \
+                          pivots.bytes())) {                                  \
+      return errors::Internal("GetriBatched: Failed to copy ptrs to device"); \
+    }                                                                         \
+    TF_RETURN_IF_ROCBLAS_ERROR(SOLVER_FN(getri_batched, type_prefix)(         \
+        rocm_blas_handle_, n,                                                 \
+        reinterpret_cast<ROCmScalar**>(dev_a.mutable_data()), lda,            \
+        reinterpret_cast<int*>(pivots.mutable_data()), stride,                \
+        dev_lapack_info->mutable_data(), batch_size));                        \
+    return Status::OK();                                                      \
+  }
+
+TF_CALL_LAPACK_TYPES(GETRI_BATCHED_INSTANCE);
+
 // Allocates a temporary tensor. The GpuSolver object maintains a
 // TensorReference to the underlying Tensor to prevent it from being deallocated
 // prematurely.
@@ -531,7 +543,7 @@ Status MatInvBatchedImpl(GpuExecutor* gpu_executor, SolverFnT solver,
   return Status::OK();
 }
 
-#define GEAM_INSTANCE(Scalar, type_prefix)                                    \
+#define MATINVBATCHED_INSTANCE(Scalar, type_prefix)                           \
   template <>                                                                 \
   Status GpuSolver::MatInvBatched<Scalar>(                                    \
       int n, const Scalar* const host_a_dev_ptrs[], int lda,                  \
@@ -550,6 +562,41 @@ Status MatInvBatchedImpl(GpuExecutor* gpu_executor, SolverFnT solver,
         host_a_inverse_dev_ptrs, ldainv, dev_lapack_info, batch_size);        \
   }
 
+template <typename Scalar, typename SolverFnT>
+Status GeamImpl(GpuExecutor* gpu_executor, SolverFnT solver,
+                rocblas_handle rocm_blas_handle, rocblas_operation transa,
+                rocblas_operation transb, int m, int n, const Scalar* alpha,
+                /* host or device pointer */ const Scalar* A, int lda,
+                const Scalar* beta,
+                /* host or device pointer */ const Scalar* B, int ldb,
+                Scalar* C, int ldc) {
+  mutex_lock lock(handle_map_mutex);
+  using ROCmScalar = typename ROCmComplexT<Scalar>::type;
+
+  ScopedActivateExecutorContext sac{gpu_executor};
+  TF_RETURN_IF_ROCBLAS_ERROR(solver(rocm_blas_handle, transa, transb, m, n,
+                                    reinterpret_cast<const ROCmScalar*>(alpha),
+                                    reinterpret_cast<const ROCmScalar*>(A), lda,
+                                    reinterpret_cast<const ROCmScalar*>(beta),
+                                    reinterpret_cast<const ROCmScalar*>(B), ldb,
+                                    reinterpret_cast<ROCmScalar*>(C), ldc));
+  return Status::OK();
+}
+
+#define GEAM_INSTANCE(Scalar, type_prefix)                                    \
+  template <>                                                                 \
+  Status GpuSolver::Geam<Scalar>(                                             \
+      rocblas_operation transa, rocblas_operation transb, int m, int n,       \
+      const Scalar* alpha, const Scalar* A, int lda, const Scalar* beta,      \
+      const Scalar* B, int ldb, Scalar* C, int ldc) {                         \
+    GpuExecutor* gpu_executor = static_cast<GpuExecutor*>(                    \
+        context_->op_device_context()->stream()->parent()->implementation()); \
+    return GeamImpl(gpu_executor, BLAS_SOLVER_FN(geam, type_prefix),          \
+                    rocm_blas_handle_, transa, transb, m, n, alpha, A, lda,   \
+                    beta, B, ldb, C, ldc);                                    \
+  }
+
+TF_CALL_LAPACK_TYPES_NO_COMPLEX(GEAM_INSTANCE);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_USE_ROCM
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 03d94772a66c3f..31124a40ce9788 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -15491,27 +15491,43 @@ func DebugNumericSummaryV2(scope *Scope, input tf.Output, optional ...DebugNumer
 	return op.Output(0)
 }
 
+// XlaConvV2Attr is an optional argument to XlaConvV2.
+type XlaConvV2Attr func(optionalAttr)
+
+// XlaConvV2BatchGroupCount sets the optional batch_group_count attribute to value.
+//
+// value: number of batch groups or grouped filters.
+// If not specified, defaults to 1
+func XlaConvV2BatchGroupCount(value int64) XlaConvV2Attr {
+	return func(m optionalAttr) {
+		m["batch_group_count"] = value
+	}
+}
+
 // Wraps the XLA ConvGeneralDilated operator, documented at
 //
 //  https://www.tensorflow.org/performance/xla/operation_semantics#conv_convolution
 // .
 //
 // Arguments:
-//	lhs: the input tensor
-//	rhs: the kernel tensor
-//	window_strides: the inter-window strides
-//	padding: the padding to apply at the start and end of each input dimensions
+//	lhs: input tensor
+//	rhs: kernel tensor
+//	window_strides: inter-window strides
+//	padding: padding to apply at the start and end of each input dimensions
 //	lhs_dilation: dilation to apply between input elements
 //	rhs_dilation: dilation to apply between kernel elements
 //	feature_group_count: number of feature groups for grouped convolution.
-//	dimension_numbers: a serialized xla::ConvolutionDimensionNumbers proto.
-//	precision_config: a serialized xla::PrecisionConfig proto.
-//	preferred_element_type: The type of the tensor.
-func XlaConvV2(scope *Scope, lhs tf.Output, rhs tf.Output, window_strides tf.Output, padding tf.Output, lhs_dilation tf.Output, rhs_dilation tf.Output, feature_group_count tf.Output, dimension_numbers string, precision_config string, preferred_element_type tf.DataType) (output tf.Output) {
+//	dimension_numbers: serialized xla::ConvolutionDimensionNumbers proto.
+//	precision_config: serialized xla::PrecisionConfig proto.
+//	preferred_element_type: type of the tensor.
+func XlaConvV2(scope *Scope, lhs tf.Output, rhs tf.Output, window_strides tf.Output, padding tf.Output, lhs_dilation tf.Output, rhs_dilation tf.Output, feature_group_count tf.Output, dimension_numbers string, precision_config string, preferred_element_type tf.DataType, optional ...XlaConvV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"dimension_numbers": dimension_numbers, "precision_config": precision_config, "preferred_element_type": preferred_element_type}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
 		Type: "XlaConvV2",
 		Input: []tf.Input{
diff --git a/tensorflow/lite/CMakeLists.txt b/tensorflow/lite/CMakeLists.txt
index 6c2a8c5fe214bd..aa1edb37b40498 100644
--- a/tensorflow/lite/CMakeLists.txt
+++ b/tensorflow/lite/CMakeLists.txt
@@ -143,6 +143,8 @@ find_package(fft2d REQUIRED)
 find_package(flatbuffers REQUIRED)
 find_package(gemmlowp REQUIRED)
 find_package(neon2sse REQUIRED)
+find_package(clog REQUIRED)
+find_package(cpuinfo REQUIRED)  #CPUINFO is used by XNNPACK and RUY library
 find_package(ruy REQUIRED)
 # Generate TensorFlow Lite FlatBuffer code.
 # We used to have an actual compilation logic with flatc but decided to use
diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD
index 8721729db577c6..cd36cf457d243f 100644
--- a/tensorflow/lite/delegates/flex/BUILD
+++ b/tensorflow/lite/delegates/flex/BUILD
@@ -330,7 +330,8 @@ cc_library(
     srcs = ["util.cc"],
     hdrs = ["util.h"],
     features = tf_features_nolayering_check_if_android_or_ios(),
-    visibility = default_visibility + ["//third_party/fcp/client:__subpackages__"],
+    #TODO(b/206038955): Consider restrict the visibility to '//third_party/fcp/client:__subpackages__'.
+    visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/lite/c:common",
         "//tensorflow/lite:kernel_api",
diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc
index 4bb5641f9f49b8..3561ce4fe6ff94 100644
--- a/tensorflow/lite/delegates/flex/kernel.cc
+++ b/tensorflow/lite/delegates/flex/kernel.cc
@@ -735,7 +735,8 @@ TfLiteStatus DelegateKernel::Eval(TfLiteContext* context, TfLiteNode* node) {
 
     if (op_data_->cancellation_manager != nullptr &&
         op_data_->cancellation_manager->IsCancelled()) {
-      TF_LITE_KERNEL_LOG(context, "Client requested cancel during Invoke()");
+      TF_LITE_KERNEL_LOG(context,
+                         "Client requested cancel during DelegateKernel::Eval");
       return kTfLiteError;
     }
 
diff --git a/tensorflow/lite/delegates/flex/training/BUILD b/tensorflow/lite/delegates/flex/training/BUILD
index cb07dc26729015..34ea45d3b58d7d 100644
--- a/tensorflow/lite/delegates/flex/training/BUILD
+++ b/tensorflow/lite/delegates/flex/training/BUILD
@@ -61,6 +61,8 @@ tf_cc_test(
     ],
     tags = [
         "no_gpu",  # GPU + flex is not supported.
+        "no_mac",
+        "no_windows",
     ],
     deps = [
         ":training_delegate",
diff --git a/tensorflow/lite/delegates/flex/training/training_delegate.cc b/tensorflow/lite/delegates/flex/training/training_delegate.cc
index 7c802edbb113c8..74b1a1c7372adb 100644
--- a/tensorflow/lite/delegates/flex/training/training_delegate.cc
+++ b/tensorflow/lite/delegates/flex/training/training_delegate.cc
@@ -18,13 +18,6 @@ limitations under the License.
 namespace tflite {
 namespace flex {
 
-// Corresponding weak declaration found in lite/interpreter_builder.cc.
-#if TFLITE_HAS_ATTRIBUTE_WEAK
-// If weak symbol is not supported (Windows), it can use
-// TF_AcquireFlexDelegate() path instead.
-TfLiteDelegateUniquePtr AcquireFlexDelegate() { return {nullptr, nullptr}; }
-#endif
-
 TrainingFlexDelegate::TrainingFlexDelegate()
     : delegate_(FlexDelegate::Create()) {
   cancellation_manager_ = absl::make_unique<tensorflow::CancellationManager>();
@@ -46,15 +39,3 @@ bool TrainingFlexDelegate::ShouldCancel(void* data) {
 
 }  // namespace flex
 }  // namespace tflite
-
-// Exported C interface function which is used by AcquireFlexDelegate() at
-// interpreter_builder.cc. To export the function name globally, the function
-// name must be matched with patterns in tf_version_script.lds. In Android, we
-// don't use this feature so skip building.
-#if !defined(__ANDROID__)
-extern "C" {
-TFL_CAPI_EXPORT tflite::TfLiteDelegateUniquePtr TF_AcquireFlexDelegate() {
-  return {nullptr, nullptr};
-}
-}  // extern "C"
-#endif  // !defined(__ANDROID__)
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
index 755078b9e4943c..d86df9f746876b 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
+++ b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <array>
 #include <map>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "absl/strings/str_cat.h"
@@ -180,12 +181,14 @@ ProfilingCommandQueue::ProfilingCommandQueue(cl_command_queue queue)
 ProfilingCommandQueue::ProfilingCommandQueue(ProfilingCommandQueue&& queue)
     : CLCommandQueue(std::move(queue)),
       events_(std::move(queue.events_)),
+      number_of_dispatches_(std::move(queue.number_of_dispatches_)),
       current_label_(std::move(queue.current_label_)) {}
 
 ProfilingCommandQueue& ProfilingCommandQueue::operator=(
     ProfilingCommandQueue&& queue) {
   if (this != &queue) {
     events_ = std::move(queue.events_);
+    number_of_dispatches_ = std::move(queue.number_of_dispatches_);
     current_label_ = std::move(queue.current_label_);
     CLCommandQueue::operator=(std::move(queue));
   }
@@ -196,12 +199,16 @@ void ProfilingCommandQueue::SetEventsLabel(const std::string& name) {
   current_label_ = name;
 }
 
-void ProfilingCommandQueue::ResetMeasurements() { events_.clear(); }
+void ProfilingCommandQueue::ResetMeasurements() {
+  events_.clear();
+  number_of_dispatches_.clear();
+}
 
 absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel,
                                              const int3& work_groups_count,
                                              const int3& work_group_size) {
   events_.push_back(CLEvent());
+  number_of_dispatches_.push_back(1);
   RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
                                            work_group_size,
                                            &events_[events_.size() - 1]));
@@ -209,13 +216,56 @@ absl::Status ProfilingCommandQueue::Dispatch(const CLKernel& kernel,
   return absl::OkStatus();
 }
 
+absl::Status ProfilingCommandQueue::DispatchNTimes(
+    const CLKernel& kernel, const int3& work_groups_count,
+    const int3& work_group_size, int n, int flush_period) {
+  number_of_dispatches_.push_back(n);
+  if (n == 1) {
+    events_.push_back(CLEvent());
+    RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
+                                             work_group_size,
+                                             &events_[events_.size() - 1]));
+    events_.back().SetName(current_label_);
+  } else {
+    events_.push_back(CLEvent());
+    events_.push_back(CLEvent());
+    RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
+                                             work_group_size,
+                                             &events_[events_.size() - 2]));
+    for (int i = 1; i < n - 1; ++i) {
+      RETURN_IF_ERROR(
+          CLCommandQueue::Dispatch(kernel, work_groups_count, work_group_size));
+      if (flush_period && i % flush_period == 0) {
+        clFlush(queue_);
+      }
+    }
+    RETURN_IF_ERROR(CLCommandQueue::Dispatch(kernel, work_groups_count,
+                                             work_group_size,
+                                             &events_[events_.size() - 1]));
+    clFlush(queue_);
+    events_[events_.size() - 2].SetName(current_label_);
+    events_[events_.size() - 1].SetName(current_label_);
+  }
+  return absl::OkStatus();
+}
+
 ProfilingInfo ProfilingCommandQueue::GetProfilingInfo() const {
   ProfilingInfo result;
-  result.dispatches.resize(events_.size());
-  for (int i = 0; i < events_.size(); ++i) {
-    result.dispatches[i].label = events_[i].GetName();
-    result.dispatches[i].duration =
-        absl::Nanoseconds(events_[i].GetEventTimeNs());
+  result.dispatches.resize(number_of_dispatches_.size());
+  int events_counter = 0;
+  for (int i = 0; i < number_of_dispatches_.size(); ++i) {
+    result.dispatches[i].label = events_[events_counter].GetName();
+    if (number_of_dispatches_[i] == 1) {
+      result.dispatches[i].duration =
+          absl::Nanoseconds(events_[events_counter].GetEventTimeNs());
+      events_counter += 1;
+    } else {
+      result.dispatches[i].duration =
+          absl::Nanoseconds(events_[events_counter + 1].GetFinishedTimeNs() -
+                            events_[events_counter].GetStartedTimeNs()) /
+          number_of_dispatches_[i];
+      events_counter += 2;
+    }
   }
   return result;
 }
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
index 075e99bca46210..897dc959747844 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
+++ b/tensorflow/lite/delegates/gpu/cl/cl_command_queue.h
@@ -91,6 +91,12 @@ class ProfilingCommandQueue : public CLCommandQueue {
   absl::Status Dispatch(const CLKernel& kernel, const int3& work_groups_count,
                         const int3& work_group_size) override;
 
+  // for better profiling
+  absl::Status DispatchNTimes(const CLKernel& kernel,
+                              const int3& work_groups_count,
+                              const int3& work_group_size, int n,
+                              int flush_period = 0);
+
   // will write index for fastest work_group among work_group_sizes
   absl::Status GetBestWorkGroupIndex(const CLKernel& kernel,
                                      const GpuInfo& gpu_info,
@@ -116,6 +122,7 @@ class ProfilingCommandQueue : public CLCommandQueue {
 
  private:
   std::vector<CLEvent> events_;
+  std::vector<int> number_of_dispatches_;
   std::string current_label_;
 };
 
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_operation.cc b/tensorflow/lite/delegates/gpu/cl/cl_operation.cc
index 775ad6833386f2..76c2de7e10afe4 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_operation.cc
+++ b/tensorflow/lite/delegates/gpu/cl/cl_operation.cc
@@ -180,6 +180,13 @@ absl::Status ClOperation::UpdateParams() {
   return absl::OkStatus();
 }
 
+void ClOperation::SetWorkGroupSize(const int3& work_group_size) {
+  operation_->work_group_size_ = work_group_size;
+  operation_->work_groups_count_ = GetWorkGroupsCount(
+      operation_->grid_dimension_, operation_->grid_size_,
+      operation_->work_group_size_, operation_->work_group_launch_order_);
+}
+
 absl::Status ClOperation::Compile(const CreationContext& creation_context) {
   operation_->code_ =
       GetCommonOpenCLDefines(operation_->definition_.precision) +
diff --git a/tensorflow/lite/delegates/gpu/cl/cl_operation.h b/tensorflow/lite/delegates/gpu/cl/cl_operation.h
index e71b54c1bbd5e9..200d51ad98f3c4 100644
--- a/tensorflow/lite/delegates/gpu/cl/cl_operation.h
+++ b/tensorflow/lite/delegates/gpu/cl/cl_operation.h
@@ -74,6 +74,14 @@ class ClOperation {
                            operation_->work_group_size_);
   }
 
+  // for better profiling
+  absl::Status AddToQueueNTimes(ProfilingCommandQueue* queue, int n,
+                                int flush_period = 0) {
+    RETURN_IF_ERROR(cl_args_.Bind(kernel_.kernel()));
+    return queue->DispatchNTimes(kernel_, operation_->work_groups_count_,
+                                 operation_->work_group_size_, n, flush_period);
+  }
+
   absl::Status Tune(TuningType tuning_type, const GpuInfo& gpu_info,
                     ProfilingCommandQueue* profiling_queue);
 
@@ -90,6 +98,10 @@ class ClOperation {
     cl_args_.MoveObjectRefsIn(&operation_->args_);
   }
 
+  int3 GetWorkGroupSize() const { return operation_->work_group_size_; }
+
+  void SetWorkGroupSize(const int3& work_group_size);
+
  private:
   std::unique_ptr<GPUOperation> operation_;
   CLKernel kernel_;
diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.cc b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
index 28bdd128c8f56a..59105075258dff 100644
--- a/tensorflow/lite/delegates/gpu/cl/inference_context.cc
+++ b/tensorflow/lite/delegates/gpu/cl/inference_context.cc
@@ -186,6 +186,46 @@ class TensorReserver {
   ValueId next_;
 };
 
+absl::Status CheckExternalTensorDescription(const GpuInfo& gpu_info,
+                                            const TensorDescriptor& tensor_desc,
+                                            const BHWC& shape,
+                                            DataType data_type) {
+  if (tensor_desc.data_type != data_type) {
+    return absl::InvalidArgumentError(
+        "Global precision and precision of predefined/external tensors must be "
+        "synchronized.");
+  }
+  const bool tensor_supported_layout = tensor_desc.layout == Layout::HWDC ||
+                                       tensor_desc.layout == Layout::BHWDC ||
+                                       tensor_desc.layout == Layout::HWC ||
+                                       tensor_desc.layout == Layout::BHWC;
+  if (!tensor_supported_layout) {
+    return absl::InvalidArgumentError(
+        "Currently no support of this layouts for spatial tensors.");
+  }
+  const bool has_depth =
+      tensor_desc.layout == Layout::HWDC || tensor_desc.layout == Layout::BHWDC;
+  if (has_depth) {
+    return absl::InvalidArgumentError(
+        "Currently no support of Depth dimension in predefined/external "
+        "tensors.");
+  }
+  const bool has_batch =
+      tensor_desc.layout == Layout::BHWC || tensor_desc.layout == Layout::BHWDC;
+  if (has_batch && shape.b == 1) {
+    return absl::InvalidArgumentError("Wrong layout, batch mismatch.");
+  }
+  if (!has_batch && shape.b != 1) {
+    return absl::InvalidArgumentError("Wrong layout, batch mismatch.");
+  }
+  if (!CanCreateTensorWithShape(gpu_info, shape, tensor_desc).ok()) {
+    return absl::UnavailableError(
+        "Current device can not allocate tensor with this shape for "
+        "predefined/external descriptor.");
+  }
+  return absl::OkStatus();
+}
+
 absl::Status ReserveGraphTensors(
     const InferenceContext::CreateInferenceInfo& create_info,
     const GpuInfo& gpu_info, const GraphFloat32& graph,
@@ -194,22 +234,36 @@ absl::Status ReserveGraphTensors(
   auto tensors = graph.values();
   auto data_type = DeduceDataTypeFromPrecision(create_info.precision);
   for (auto& t : tensors) {
-    TensorStorageType storage_type = create_info.storage_type;
     const auto shape = graph.GetValue(t->id)->tensor.shape;
-    Layout layout = shape.b == 1 ? Layout::HWC : Layout::BHWC;
-    if (graph.IsGraphInput(t->id) || graph.IsGraphOutput(t->id)) {
-      if (shape.c < 4 &&
-          CanCreateTensorWithShape(
-              gpu_info, shape,
-              TensorDescriptor{data_type, TensorStorageType::SINGLE_TEXTURE_2D,
-                               layout})
-              .ok()) {
-        storage_type = TensorStorageType::SINGLE_TEXTURE_2D;
+    auto it_immutable_external =
+        create_info.external_immutable_tensors.find(t->id);
+    TensorDescriptor tensor_desc;
+    if (it_immutable_external != create_info.external_immutable_tensors.end()) {
+      if (!(graph.IsGraphInput(t->id) || graph.IsGraphOutput(t->id))) {
+        return absl::InvalidArgumentError(
+            "Currently external tensors can be used only for graph "
+            "inputs/outputs");
       }
+      tensor_desc = it_immutable_external->second->GetDescriptor();
+      RETURN_IF_ERROR(CheckExternalTensorDescription(gpu_info, tensor_desc,
+                                                     shape, data_type));
+    } else {
+      TensorStorageType storage_type = create_info.storage_type;
+      Layout layout = shape.b == 1 ? Layout::HWC : Layout::BHWC;
+      if (graph.IsGraphInput(t->id) || graph.IsGraphOutput(t->id)) {
+        if (shape.c < 4 &&
+            CanCreateTensorWithShape(
+                gpu_info, shape,
+                TensorDescriptor{data_type,
+                                 TensorStorageType::SINGLE_TEXTURE_2D, layout})
+                .ok()) {
+          storage_type = TensorStorageType::SINGLE_TEXTURE_2D;
+        }
+      }
+      RETURN_IF_ERROR(SelectBestStorageType(gpu_info, shape, storage_type,
+                                            data_type, layout, &storage_type));
+      tensor_desc = TensorDescriptor{data_type, storage_type, layout};
     }
-    RETURN_IF_ERROR(SelectBestStorageType(gpu_info, shape, storage_type,
-                                          data_type, layout, &storage_type));
-    TensorDescriptor tensor_desc{data_type, storage_type, layout};
     tensor_desc.shape = BHWDC(shape.b, shape.h, shape.w, 1, shape.c);
     tensor_reserver->Add(t->id, tensor_desc);
     max_id = std::max(max_id, t->id);
@@ -403,7 +457,9 @@ absl::Status ResolvePolymorphicArgs(InferenceContext::GpuModel* gpu_model) {
   class DummySpatialTensor : public GpuSpatialTensor {
    public:
     DummySpatialTensor() = default;
-    explicit DummySpatialTensor(const BHWDC& shape) : shape_(shape) {}
+    explicit DummySpatialTensor(const BHWDC& shape,
+                                const TensorDescriptor& tensor_desc)
+        : shape_(shape), tensor_desc_(tensor_desc) {}
     ~DummySpatialTensor() override = default;
 
     int Width() const override { return shape_.w; }
@@ -413,21 +469,24 @@ absl::Status ResolvePolymorphicArgs(InferenceContext::GpuModel* gpu_model) {
     int Slices() const override { return DivideRoundUp(shape_.c, 4); }
     int Batch() const override { return shape_.b; }
 
+    TensorDescriptor GetDescriptor() const override { return tensor_desc_; }
+
    private:
     BHWDC shape_;
+    TensorDescriptor tensor_desc_;
   };
 
   for (auto& node : gpu_model->nodes) {
     std::vector<DummySpatialTensor> src_tensors(node.inputs.size());
     for (int i = 0; i < node.inputs.size(); ++i) {
       const auto& tensor_desc = gpu_model->tensors[node.inputs[i]];
-      src_tensors[i] = DummySpatialTensor(tensor_desc.shape);
+      src_tensors[i] = DummySpatialTensor(tensor_desc.shape, tensor_desc);
       node.gpu_operation->SetSrc(&src_tensors[i], i);
     }
     std::vector<DummySpatialTensor> dst_tensors(node.outputs.size());
     for (int i = 0; i < node.outputs.size(); ++i) {
       const auto& tensor_desc = gpu_model->tensors[node.outputs[i]];
-      dst_tensors[i] = DummySpatialTensor(tensor_desc.shape);
+      dst_tensors[i] = DummySpatialTensor(tensor_desc.shape, tensor_desc);
       node.gpu_operation->SetDst(&dst_tensors[i], i);
     }
     RETURN_IF_ERROR(
@@ -468,6 +527,8 @@ void InferenceContext::ExecutionHints::Init(const GpuInfo& gpu_info) {
   }
   if (gpu_info.IsPowerVR()) {
     need_flush = true;
+    flush_periodically = true;
+    flush_period = 16;
   }
 }
 
@@ -502,6 +563,13 @@ absl::Status InferenceContext::InitFromGraph(
   creation_context.context = &env->context();
   creation_context.queue = env->queue();
   creation_context.cache = env->program_cache();
+  for (const auto& external_tensor : create_info.external_immutable_tensors) {
+    auto* cl_spatial_tensor = dynamic_cast<Tensor*>(external_tensor.second);
+    if (!cl_spatial_tensor) {
+      return absl::InvalidArgumentError("Expected CLSpatialTensor.");
+    }
+    external_immutable_tensors_[external_tensor.first] = cl_spatial_tensor;
+  }
   execution_hints_.Init(env->device().GetInfo());
   RETURN_IF_ERROR(
       AllocateMemory(creation_context.GetGpuInfo(), creation_context.context));
@@ -525,6 +593,8 @@ absl::Status InferenceContext::InitFromGraph(
       Tune(tuning_type, env->device().GetInfo(), env->profiling_queue()));
   InitRecordableQueue(env);
 
+  gpu_info_ = env->device().GetInfo();
+
   if (serialized_model) {
     for (auto& node : nodes_) {
       node.cl_operation.MoveObjectRefsFromCLToGeneric();
@@ -553,7 +623,8 @@ absl::Status InferenceContext::InitFromGraph(
 }
 
 absl::Status InferenceContext::RestoreDeserialized(
-    const absl::Span<const uint8_t> serialized_model, Environment* env) {
+    const absl::Span<const uint8_t> serialized_model, Environment* env,
+    CreateInferenceInfo* create_info) {
   flatbuffers::Verifier verifier(serialized_model.data(),
                                  serialized_model.size());
   if (!data::VerifyInferenceContextBuffer(verifier)) {
@@ -568,6 +639,16 @@ absl::Status InferenceContext::RestoreDeserialized(
   creation_context.context = &env->context();
   creation_context.queue = env->queue();
   creation_context.cache = env->program_cache();
+  if (create_info) {
+    for (const auto& external_tensor :
+         create_info->external_immutable_tensors) {
+      auto* cl_spatial_tensor = dynamic_cast<Tensor*>(external_tensor.second);
+      if (!cl_spatial_tensor) {
+        return absl::InvalidArgumentError("Expected CLSpatialTensor.");
+      }
+      external_immutable_tensors_[external_tensor.first] = cl_spatial_tensor;
+    }
+  }
 
   execution_hints_.Init(env->device().GetInfo());
 
@@ -623,7 +704,10 @@ void InferenceContext::GetUsages(const std::function<bool(ValueId)>& functor,
 
 InferenceContext::TensorMemoryType InferenceContext::GetTensorMemoryType(
     const GpuInfo& gpu_info, ValueId id) {
-  if (const_tensors_.find(id) != const_tensors_.end()) {
+  if (external_immutable_tensors_.find(id) !=
+      external_immutable_tensors_.end()) {
+    return TensorMemoryType::kExternal;
+  } else if (const_tensors_.find(id) != const_tensors_.end()) {
     return TensorMemoryType::kConst;
   } else if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) {
     return TensorMemoryType::kVariable;
@@ -905,8 +989,8 @@ absl::Status InferenceContext::AddToQueue(CLCommandQueue* queue) {
   return absl::OkStatus();
 }
 
-absl::Status InferenceContext::Profile(ProfilingCommandQueue* queue,
-                                       ProfilingInfo* result) {
+absl::Status InferenceContext::ProfileTime(ProfilingCommandQueue* queue,
+                                           ProfilingInfo* result) {
   queue->ResetMeasurements();
   for (auto& node : nodes_) {
     queue->SetEventsLabel(node.name);
@@ -914,6 +998,71 @@ absl::Status InferenceContext::Profile(ProfilingCommandQueue* queue,
   }
   RETURN_IF_ERROR(queue->WaitForCompletion());
   *result = queue->GetProfilingInfo();
+
+  if (!(gpu_info_.IsMali() || gpu_info_.IsPowerVR())) {
+    return absl::OkStatus();
+  }
+
+  if (gpu_info_.IsMali()) {
+    queue->ResetMeasurements();
+    for (int i = 0; i < nodes_.size(); ++i) {
+      queue->SetEventsLabel(nodes_[i].name);
+      const double times =
+          16.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
+      const int n = std::min(256.0, std::max(2.0, times));
+      RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
+    }
+    RETURN_IF_ERROR(queue->WaitForCompletion());
+    *result = queue->GetProfilingInfo();
+    return absl::OkStatus();
+  }
+
+  if (gpu_info_.IsPowerVR()) {
+    queue->ResetMeasurements();
+    for (int i = 0; i < nodes_.size(); ++i) {
+      queue->SetEventsLabel(nodes_[i].name);
+      const double times =
+          32.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
+      const int n = std::min(64.0, std::max(4.0, times));
+      RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
+    }
+    RETURN_IF_ERROR(queue->WaitForCompletion());
+    *result = queue->GetProfilingInfo();
+
+    queue->ResetMeasurements();
+    for (int i = 0; i < nodes_.size(); ++i) {
+      queue->SetEventsLabel(nodes_[i].name);
+      const double times =
+          128.0 / absl::ToDoubleMilliseconds(result->dispatches[i].duration);
+      const int n = std::min(1024.0, std::max(4.0, times));
+      RETURN_IF_ERROR(nodes_[i].cl_operation.AddToQueueNTimes(queue, n));
+    }
+    RETURN_IF_ERROR(queue->WaitForCompletion());
+    *result = queue->GetProfilingInfo();
+    return absl::OkStatus();
+  }
+
+  return absl::OkStatus();
+}
+
+absl::Status InferenceContext::Profile(ProfilingCommandQueue* queue,
+                                       ProfilingInfo* result) {
+  RETURN_IF_ERROR(ProfileTime(queue, result));
+  for (int i = 0; i < nodes_.size(); ++i) {
+    uint64_t read_size = 0;
+    for (auto& src_id : nodes_[i].inputs) {
+      read_size += GetTensor(src_id)->GetMemorySizeInBytes();
+    }
+    uint64_t write_size = 0;
+    for (auto& dst_id : nodes_[i].outputs) {
+      write_size += GetTensor(dst_id)->GetMemorySizeInBytes();
+    }
+    result->dispatches[i].read_mem_size = read_size;
+    result->dispatches[i].write_mem_size = write_size;
+    const auto& gpu_op = nodes_[i].cl_operation.GetGpuOperation();
+    result->dispatches[i].flops = gpu_op.flops_;
+  }
+
   return absl::OkStatus();
 }
 
@@ -939,7 +1088,10 @@ uint64_t InferenceContext::GetSizeOfMemoryAllocatedForIntermediateTensors()
 }
 
 Tensor* InferenceContext::GetTensor(ValueId id) {
-  if (const_tensors_.find(id) != const_tensors_.end()) {
+  if (external_immutable_tensors_.find(id) !=
+      external_immutable_tensors_.end()) {
+    return external_immutable_tensors_[id];
+  } else if (const_tensors_.find(id) != const_tensors_.end()) {
     return &const_tensors_[id];
   } else if (variable_ids_and_refs_.find(id) != variable_ids_and_refs_.end()) {
     return &variable_tensors_[variable_ids_and_refs_[id]];
diff --git a/tensorflow/lite/delegates/gpu/cl/inference_context.h b/tensorflow/lite/delegates/gpu/cl/inference_context.h
index bc5513f17ad900..a5b4d1ec7e809a 100644
--- a/tensorflow/lite/delegates/gpu/cl/inference_context.h
+++ b/tensorflow/lite/delegates/gpu/cl/inference_context.h
@@ -79,6 +79,19 @@ class InferenceContext {
     CalculationsPrecision precision;
     TensorStorageType storage_type;
     ModelHints hints;
+
+    // User can provide immutable external tensors for inference context.
+    // Some restrictions apply:
+    //   1) ValueId must be input or output id of GraphFloat32
+    //   2) Provided ptrs must be valid during life of InferenceContext.
+    //   3) data_type must be equal to DeduceDataTypeFromPrecision(precision);
+    //      for example for precision F16, data_type must be FLOAT16
+    //   4) Layout must be without Batch dimension if tensor.shape.b == 1
+    //      Layout must be with Batch dimension if tensor.shape.b != 1
+    // InitFromGraph will fail if gpu can not allocate tensor with requested
+    // tensor descriptor
+    // WARNING: This is an experimental API and subject to change.
+    absl::flat_hash_map<ValueId, GpuSpatialTensor*> external_immutable_tensors;
   };
 
   struct GpuModel {
@@ -120,10 +133,17 @@ class InferenceContext {
   const std::vector<ValueId>& GetOutputIds() const { return output_ids_; }
 
   absl::Status RestoreDeserialized(
-      const absl::Span<const uint8_t> serialized_model, Environment* env);
+      const absl::Span<const uint8_t> serialized_model, Environment* env,
+      CreateInferenceInfo* create_info = nullptr);
 
  private:
-  enum class TensorMemoryType { kStrongShape, kBuffer, kVariable, kConst };
+  enum class TensorMemoryType {
+    kStrongShape,
+    kBuffer,
+    kVariable,
+    kConst,
+    kExternal
+  };
 
   friend flatbuffers::Offset<data::InferenceContext> Encode(
       const CLDevice& device, const InferenceContext& inference,
@@ -162,6 +182,8 @@ class InferenceContext {
 
   void ReleaseCPURepresentation();
 
+  absl::Status ProfileTime(ProfilingCommandQueue* queue, ProfilingInfo* result);
+
   struct ExecutionHints {
     bool need_flush = false;
 
@@ -186,6 +208,7 @@ class InferenceContext {
   //  anywhere.
   std::vector<CLNode> nodes_;
 
+  absl::flat_hash_map<ValueId, Tensor*> external_immutable_tensors_;
   absl::flat_hash_map<ValueId, TensorDescriptor> tensors_descs_;
   absl::flat_hash_map<ValueId, TensorDescriptor> const_tensors_descs_;
   std::map<ValueId, Tensor> const_tensors_;
@@ -205,6 +228,8 @@ class InferenceContext {
   std::vector<ValueId> output_ids_;
 
   std::unique_ptr<RecordableQueue> recordable_queue_;
+
+  GpuInfo gpu_info_;
 };
 
 // Runs OpenCL specific transforms for the graph.
diff --git a/tensorflow/lite/delegates/gpu/cl/serialization.cc b/tensorflow/lite/delegates/gpu/cl/serialization.cc
index 47503eb0d54369..d80d7e33e8ad4e 100644
--- a/tensorflow/lite/delegates/gpu/cl/serialization.cc
+++ b/tensorflow/lite/delegates/gpu/cl/serialization.cc
@@ -823,6 +823,7 @@ absl::Status Decode(const data::GPUOperation* fb_op, GPUOperation* op) {
   op->elementwise_ = fb_op->elementwise();
   op->linkable_ = fb_op->linkable();
   op->check_src_channels_size_ = fb_op->check_src_channels_size();
+  op->flops_ = fb_op->flops();
   Decode(fb_op->definition(), &op->definition_);
   op->grid_dimension_ = fb_op->grid_dimension();
   op->work_group_launch_order_.x = fb_op->work_group_launch_order()->x();
@@ -876,6 +877,7 @@ flatbuffers::Offset<data::GPUOperation> Encode(
   op_builder.add_elementwise(op.elementwise_);
   op_builder.add_linkable(op.linkable_);
   op_builder.add_check_src_channels_size(op.check_src_channels_size_);
+  op_builder.add_flops(op.flops_);
   op_builder.add_definition(def_fb);
   op_builder.add_grid_dimension(op.grid_dimension_);
   op_builder.add_work_group_launch_order(work_group_launch_order_fb);
@@ -905,7 +907,7 @@ void Decode(const data::TensorDescWithId* fb_desc, TensorDescriptor* desc,
   *id = fb_desc->id();
 }
 
-flatbuffers::Offset<data::CLNode> Encode(
+flatbuffers::Offset<data::GpuNode> Encode(
     const CLNode& node, flatbuffers::FlatBufferBuilder* builder) {
   auto op_fb = Encode(node.cl_operation.GetGpuOperation(), builder);
   std::vector<int32_t> in_ids(node.inputs.size());
@@ -919,22 +921,18 @@ flatbuffers::Offset<data::CLNode> Encode(
   auto in_ids_fb = builder->CreateVector(in_ids);
   auto out_ids_fb = builder->CreateVector(out_ids);
   auto name_fb = builder->CreateString(node.name);
-  data::CLNodeBuilder node_builder(*builder);
+  data::GpuNodeBuilder node_builder(*builder);
   node_builder.add_gpu_op(op_fb);
-  node_builder.add_fingerprint(node.cl_operation.GetKernelFingerprint());
   node_builder.add_input_ids(in_ids_fb);
   node_builder.add_output_ids(out_ids_fb);
   node_builder.add_name(name_fb);
   return node_builder.Finish();
 }
 
-absl::Status Decode(const ProgramCache& program_cache,
-                    const data::CLNode* fb_node, CLNode* node) {
+absl::Status Decode(const data::GpuNode* fb_node, CLNode* node) {
   GPUOperation op;
   RETURN_IF_ERROR(Decode(fb_node->gpu_op(), &op));
   node->cl_operation.Init(absl::make_unique<GPUOperation>(std::move(op)));
-  RETURN_IF_ERROR(
-      node->cl_operation.InitFromCache(fb_node->fingerprint(), program_cache));
   for (auto in_fb : *fb_node->input_ids()) {
     node->inputs.push_back(in_fb);
   }
@@ -964,12 +962,27 @@ flatbuffers::Offset<data::InferenceContext> Encode(
   auto in_refs_fb = builder->CreateVector(in_refs);
   auto out_refs_fb = builder->CreateVector(out_refs);
 
-  std::vector<flatbuffers::Offset<data::CLNode>> nodes_fb;
+  std::vector<flatbuffers::Offset<data::GpuNode>> nodes_fb;
   for (int i = 0; i < inference.nodes_.size(); ++i) {
     auto node_fb = Encode(inference.nodes_[i], builder);
     nodes_fb.push_back(node_fb);
   }
   auto nodes_fb_vec = builder->CreateVector(nodes_fb);
+
+  std::vector<flatbuffers::Offset<tflite::gpu::data::Int3>> work_groups_fb;
+  for (int i = 0; i < inference.nodes_.size(); ++i) {
+    auto work_group_fb =
+        Encode(inference.nodes_[i].cl_operation.GetWorkGroupSize(), builder);
+    work_groups_fb.push_back(work_group_fb);
+  }
+  auto work_groups_fb_vec = builder->CreateVector(work_groups_fb);
+  std::vector<uint64_t> node_fingerprints(inference.nodes_.size());
+  for (int i = 0; i < inference.nodes_.size(); ++i) {
+    node_fingerprints[i] =
+        inference.nodes_[i].cl_operation.GetKernelFingerprint();
+  }
+  auto node_fingerprints_fb = builder->CreateVector(node_fingerprints);
+
   std::set<uint64_t> fingerprints;
   for (const auto& node : inference.nodes_) {
     fingerprints.insert(node.cl_operation.GetKernelFingerprint());
@@ -1023,6 +1036,8 @@ flatbuffers::Offset<data::InferenceContext> Encode(
   inf_builder.add_variable_ids_and_refs(variable_ids_and_refs_fb_vec);
   inf_builder.add_input_refs(in_refs_fb);
   inf_builder.add_output_refs(out_refs_fb);
+  inf_builder.add_tuned_work_group_sizes_per_node(work_groups_fb_vec);
+  inf_builder.add_fingerprints_per_node(node_fingerprints_fb);
   return inf_builder.Finish();
 }
 
@@ -1048,10 +1063,20 @@ absl::Status Decode(const CLContext& context, const CLDevice& device,
   inference->nodes_.resize(fb_inference->nodes()->size());
   int counter = 0;
   for (auto node_fb : *fb_inference->nodes()) {
-    RETURN_IF_ERROR(
-        Decode(*program_cache, node_fb, &inference->nodes_[counter]));
+    RETURN_IF_ERROR(Decode(node_fb, &inference->nodes_[counter]));
     counter++;
   }
+  for (int i = 0; i < inference->nodes_.size(); ++i) {
+    uint64_t fingerprint = (*fb_inference->fingerprints_per_node())[i];
+    RETURN_IF_ERROR(inference->nodes_[i].cl_operation.InitFromCache(
+        fingerprint, *program_cache));
+
+    int3 wg_size;
+    wg_size.x = (*fb_inference->tuned_work_group_sizes_per_node())[i]->x();
+    wg_size.y = (*fb_inference->tuned_work_group_sizes_per_node())[i]->y();
+    wg_size.z = (*fb_inference->tuned_work_group_sizes_per_node())[i]->z();
+    inference->nodes_[i].cl_operation.SetWorkGroupSize(wg_size);
+  }
 
   for (const auto& tensor_fb : *fb_inference->tensors()) {
     TensorDescriptor desc;
diff --git a/tensorflow/lite/delegates/gpu/cl/serialization.fbs b/tensorflow/lite/delegates/gpu/cl/serialization.fbs
index 671ad5f2c092b3..a1f6499cd1c01c 100644
--- a/tensorflow/lite/delegates/gpu/cl/serialization.fbs
+++ b/tensorflow/lite/delegates/gpu/cl/serialization.fbs
@@ -21,19 +21,18 @@ table TensorDescWithId {
   id:int32;
 }
 
-table CLNode {
+table PairOfValueIds {
+  first:int32;
+  second:int32;
+}
+
+table GpuNode {
   gpu_op:tflite.gpu.data.GPUOperation;
-  fingerprint:uint64;
   input_ids:[int32];
   output_ids:[int32];
   name:string;
 }
 
-table PairOfValueIds {
-  first:int32;
-  second:int32;
-}
-
 table BinaryProgram {
   fingerprint:uint64;
   binary:[ubyte];
@@ -42,7 +41,7 @@ table BinaryProgram {
 table InferenceContext {
   driver_version:string;
   binary_programs:[BinaryProgram];
-  nodes:[CLNode];
+  nodes:[GpuNode];
   tensors:[TensorDescWithId];
   const_tensors:[TensorDescWithId];
   input_ids:[int32];
@@ -50,6 +49,10 @@ table InferenceContext {
   output_ids:[int32];
   input_refs:[int64];
   output_refs:[int64];
+  // Must be serialized after actual OpenCL objects created
+  // Separated from nodes in GpuModel
+  tuned_work_group_sizes_per_node:[tflite.gpu.data.Int3];
+  fingerprints_per_node:[uint64];
 }
 
 root_type InferenceContext;
diff --git a/tensorflow/lite/delegates/gpu/cl/serialization_generated.h b/tensorflow/lite/delegates/gpu/cl/serialization_generated.h
index cce3970afbeb03..d5acd7e1eae114 100644
--- a/tensorflow/lite/delegates/gpu/cl/serialization_generated.h
+++ b/tensorflow/lite/delegates/gpu/cl/serialization_generated.h
@@ -30,12 +30,12 @@ namespace data {
 struct TensorDescWithId;
 struct TensorDescWithIdBuilder;
 
-struct CLNode;
-struct CLNodeBuilder;
-
 struct PairOfValueIds;
 struct PairOfValueIdsBuilder;
 
+struct GpuNode;
+struct GpuNodeBuilder;
+
 struct BinaryProgram;
 struct BinaryProgramBuilder;
 
@@ -94,19 +94,62 @@ inline flatbuffers::Offset<TensorDescWithId> CreateTensorDescWithId(
   return builder_.Finish();
 }
 
-struct CLNode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef CLNodeBuilder Builder;
+struct PairOfValueIds FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef PairOfValueIdsBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_FIRST = 4,
+    VT_SECOND = 6
+  };
+  int32_t first() const { return GetField<int32_t>(VT_FIRST, 0); }
+  int32_t second() const { return GetField<int32_t>(VT_SECOND, 0); }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_FIRST) &&
+           VerifyField<int32_t>(verifier, VT_SECOND) && verifier.EndTable();
+  }
+};
+
+struct PairOfValueIdsBuilder {
+  typedef PairOfValueIds Table;
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_first(int32_t first) {
+    fbb_.AddElement<int32_t>(PairOfValueIds::VT_FIRST, first, 0);
+  }
+  void add_second(int32_t second) {
+    fbb_.AddElement<int32_t>(PairOfValueIds::VT_SECOND, second, 0);
+  }
+  explicit PairOfValueIdsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+      : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  flatbuffers::Offset<PairOfValueIds> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<PairOfValueIds>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<PairOfValueIds> CreatePairOfValueIds(
+    flatbuffers::FlatBufferBuilder &_fbb, int32_t first = 0,
+    int32_t second = 0) {
+  PairOfValueIdsBuilder builder_(_fbb);
+  builder_.add_second(second);
+  builder_.add_first(first);
+  return builder_.Finish();
+}
+
+struct GpuNode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef GpuNodeBuilder Builder;
   enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
     VT_GPU_OP = 4,
-    VT_FINGERPRINT = 6,
-    VT_INPUT_IDS = 8,
-    VT_OUTPUT_IDS = 10,
-    VT_NAME = 12
+    VT_INPUT_IDS = 6,
+    VT_OUTPUT_IDS = 8,
+    VT_NAME = 10
   };
   const tflite::gpu::data::GPUOperation *gpu_op() const {
     return GetPointer<const tflite::gpu::data::GPUOperation *>(VT_GPU_OP);
   }
-  uint64_t fingerprint() const { return GetField<uint64_t>(VT_FINGERPRINT, 0); }
   const flatbuffers::Vector<int32_t> *input_ids() const {
     return GetPointer<const flatbuffers::Vector<int32_t> *>(VT_INPUT_IDS);
   }
@@ -119,7 +162,6 @@ struct CLNode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_GPU_OP) &&
            verifier.VerifyTable(gpu_op()) &&
-           VerifyField<uint64_t>(verifier, VT_FINGERPRINT) &&
            VerifyOffset(verifier, VT_INPUT_IDS) &&
            verifier.VerifyVector(input_ids()) &&
            VerifyOffset(verifier, VT_OUTPUT_IDS) &&
@@ -129,45 +171,39 @@ struct CLNode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   }
 };
 
-struct CLNodeBuilder {
-  typedef CLNode Table;
+struct GpuNodeBuilder {
+  typedef GpuNode Table;
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
   void add_gpu_op(flatbuffers::Offset<tflite::gpu::data::GPUOperation> gpu_op) {
-    fbb_.AddOffset(CLNode::VT_GPU_OP, gpu_op);
-  }
-  void add_fingerprint(uint64_t fingerprint) {
-    fbb_.AddElement<uint64_t>(CLNode::VT_FINGERPRINT, fingerprint, 0);
+    fbb_.AddOffset(GpuNode::VT_GPU_OP, gpu_op);
   }
   void add_input_ids(flatbuffers::Offset<flatbuffers::Vector<int32_t>> input_ids) {
-    fbb_.AddOffset(CLNode::VT_INPUT_IDS, input_ids);
+    fbb_.AddOffset(GpuNode::VT_INPUT_IDS, input_ids);
   }
   void add_output_ids(flatbuffers::Offset<flatbuffers::Vector<int32_t>> output_ids) {
-    fbb_.AddOffset(CLNode::VT_OUTPUT_IDS, output_ids);
+    fbb_.AddOffset(GpuNode::VT_OUTPUT_IDS, output_ids);
   }
   void add_name(flatbuffers::Offset<flatbuffers::String> name) {
-    fbb_.AddOffset(CLNode::VT_NAME, name);
+    fbb_.AddOffset(GpuNode::VT_NAME, name);
   }
-  explicit CLNodeBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
+  explicit GpuNodeBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) {
     start_ = fbb_.StartTable();
   }
-  flatbuffers::Offset<CLNode> Finish() {
+  flatbuffers::Offset<GpuNode> Finish() {
     const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<CLNode>(end);
+    auto o = flatbuffers::Offset<GpuNode>(end);
     return o;
   }
 };
 
-inline flatbuffers::Offset<CLNode> CreateCLNode(
+inline flatbuffers::Offset<GpuNode> CreateGpuNode(
     flatbuffers::FlatBufferBuilder &_fbb,
     flatbuffers::Offset<tflite::gpu::data::GPUOperation> gpu_op = 0,
-    uint64_t fingerprint = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> input_ids = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> output_ids = 0,
     flatbuffers::Offset<flatbuffers::String> name = 0) {
-  CLNodeBuilder builder_(_fbb);
-  builder_.add_fingerprint(fingerprint);
+  GpuNodeBuilder builder_(_fbb);
   builder_.add_name(name);
   builder_.add_output_ids(output_ids);
   builder_.add_input_ids(input_ids);
@@ -175,68 +211,17 @@ inline flatbuffers::Offset<CLNode> CreateCLNode(
   return builder_.Finish();
 }
 
-inline flatbuffers::Offset<CLNode> CreateCLNodeDirect(
+inline flatbuffers::Offset<GpuNode> CreateGpuNodeDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     flatbuffers::Offset<tflite::gpu::data::GPUOperation> gpu_op = 0,
-    uint64_t fingerprint = 0, const std::vector<int32_t> *input_ids = nullptr,
+    const std::vector<int32_t> *input_ids = nullptr,
     const std::vector<int32_t> *output_ids = nullptr,
     const char *name = nullptr) {
   auto input_ids__ = input_ids ? _fbb.CreateVector<int32_t>(*input_ids) : 0;
   auto output_ids__ = output_ids ? _fbb.CreateVector<int32_t>(*output_ids) : 0;
   auto name__ = name ? _fbb.CreateString(name) : 0;
-  return tflite::gpu::cl::data::CreateCLNode(_fbb, gpu_op, fingerprint,
-                                             input_ids__, output_ids__, name__);
-}
-
-struct PairOfValueIds FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
-  typedef PairOfValueIdsBuilder Builder;
-  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
-    VT_FIRST = 4,
-    VT_SECOND = 6
-  };
-  int32_t first() const {
-    return GetField<int32_t>(VT_FIRST, 0);
-  }
-  int32_t second() const {
-    return GetField<int32_t>(VT_SECOND, 0);
-  }
-  bool Verify(flatbuffers::Verifier &verifier) const {
-    return VerifyTableStart(verifier) &&
-           VerifyField<int32_t>(verifier, VT_FIRST) &&
-           VerifyField<int32_t>(verifier, VT_SECOND) &&
-           verifier.EndTable();
-  }
-};
-
-struct PairOfValueIdsBuilder {
-  typedef PairOfValueIds Table;
-  flatbuffers::FlatBufferBuilder &fbb_;
-  flatbuffers::uoffset_t start_;
-  void add_first(int32_t first) {
-    fbb_.AddElement<int32_t>(PairOfValueIds::VT_FIRST, first, 0);
-  }
-  void add_second(int32_t second) {
-    fbb_.AddElement<int32_t>(PairOfValueIds::VT_SECOND, second, 0);
-  }
-  explicit PairOfValueIdsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
-        : fbb_(_fbb) {
-    start_ = fbb_.StartTable();
-  }
-  flatbuffers::Offset<PairOfValueIds> Finish() {
-    const auto end = fbb_.EndTable(start_);
-    auto o = flatbuffers::Offset<PairOfValueIds>(end);
-    return o;
-  }
-};
-
-inline flatbuffers::Offset<PairOfValueIds> CreatePairOfValueIds(
-    flatbuffers::FlatBufferBuilder &_fbb,
-    int32_t first = 0,
-    int32_t second = 0) {
-  PairOfValueIdsBuilder builder_(_fbb);
-  builder_.add_second(second);
-  builder_.add_first(first);
-  return builder_.Finish();
+  return tflite::gpu::cl::data::CreateGpuNode(_fbb, gpu_op, input_ids__,
+                                              output_ids__, name__);
 }
 
 struct BinaryProgram FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -307,7 +292,9 @@ struct InferenceContext FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     VT_VARIABLE_IDS_AND_REFS = 16,
     VT_OUTPUT_IDS = 18,
     VT_INPUT_REFS = 20,
-    VT_OUTPUT_REFS = 22
+    VT_OUTPUT_REFS = 22,
+    VT_TUNED_WORK_GROUP_SIZES_PER_NODE = 24,
+    VT_FINGERPRINTS_PER_NODE = 26
   };
   const flatbuffers::String *driver_version() const {
     return GetPointer<const flatbuffers::String *>(VT_DRIVER_VERSION);
@@ -319,8 +306,10 @@ struct InferenceContext FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
         flatbuffers::Offset<tflite::gpu::cl::data::BinaryProgram>> *>(
         VT_BINARY_PROGRAMS);
   }
-  const flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::CLNode>> *nodes() const {
-    return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::CLNode>> *>(VT_NODES);
+  const flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::GpuNode>>
+      *nodes() const {
+    return GetPointer<const flatbuffers::Vector<
+        flatbuffers::Offset<tflite::gpu::cl::data::GpuNode>> *>(VT_NODES);
   }
   const flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>> *tensors() const {
     return GetPointer<const flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>> *>(VT_TENSORS);
@@ -347,6 +336,16 @@ struct InferenceContext FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const flatbuffers::Vector<int64_t> *output_refs() const {
     return GetPointer<const flatbuffers::Vector<int64_t> *>(VT_OUTPUT_REFS);
   }
+  const flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::data::Int3>>
+      *tuned_work_group_sizes_per_node() const {
+    return GetPointer<const flatbuffers::Vector<
+        flatbuffers::Offset<tflite::gpu::data::Int3>> *>(
+        VT_TUNED_WORK_GROUP_SIZES_PER_NODE);
+  }
+  const flatbuffers::Vector<uint64_t> *fingerprints_per_node() const {
+    return GetPointer<const flatbuffers::Vector<uint64_t> *>(
+        VT_FINGERPRINTS_PER_NODE);
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyOffset(verifier, VT_DRIVER_VERSION) &&
@@ -372,7 +371,13 @@ struct InferenceContext FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyOffset(verifier, VT_INPUT_REFS) &&
            verifier.VerifyVector(input_refs()) &&
            VerifyOffset(verifier, VT_OUTPUT_REFS) &&
-           verifier.VerifyVector(output_refs()) && verifier.EndTable();
+           verifier.VerifyVector(output_refs()) &&
+           VerifyOffset(verifier, VT_TUNED_WORK_GROUP_SIZES_PER_NODE) &&
+           verifier.VerifyVector(tuned_work_group_sizes_per_node()) &&
+           verifier.VerifyVectorOfTables(tuned_work_group_sizes_per_node()) &&
+           VerifyOffset(verifier, VT_FINGERPRINTS_PER_NODE) &&
+           verifier.VerifyVector(fingerprints_per_node()) &&
+           verifier.EndTable();
   }
 };
 
@@ -390,7 +395,9 @@ struct InferenceContextBuilder {
           binary_programs) {
     fbb_.AddOffset(InferenceContext::VT_BINARY_PROGRAMS, binary_programs);
   }
-  void add_nodes(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::CLNode>>> nodes) {
+  void add_nodes(flatbuffers::Offset<flatbuffers::Vector<
+                     flatbuffers::Offset<tflite::gpu::cl::data::GpuNode>>>
+                     nodes) {
     fbb_.AddOffset(InferenceContext::VT_NODES, nodes);
   }
   void add_tensors(flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>>> tensors) {
@@ -417,6 +424,19 @@ struct InferenceContextBuilder {
   void add_output_refs(flatbuffers::Offset<flatbuffers::Vector<int64_t>> output_refs) {
     fbb_.AddOffset(InferenceContext::VT_OUTPUT_REFS, output_refs);
   }
+  void add_tuned_work_group_sizes_per_node(
+      flatbuffers::Offset<
+          flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::data::Int3>>>
+          tuned_work_group_sizes_per_node) {
+    fbb_.AddOffset(InferenceContext::VT_TUNED_WORK_GROUP_SIZES_PER_NODE,
+                   tuned_work_group_sizes_per_node);
+  }
+  void add_fingerprints_per_node(
+      flatbuffers::Offset<flatbuffers::Vector<uint64_t>>
+          fingerprints_per_node) {
+    fbb_.AddOffset(InferenceContext::VT_FINGERPRINTS_PER_NODE,
+                   fingerprints_per_node);
+  }
   explicit InferenceContextBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -434,8 +454,8 @@ inline flatbuffers::Offset<InferenceContext> CreateInferenceContext(
     flatbuffers::Offset<flatbuffers::Vector<
         flatbuffers::Offset<tflite::gpu::cl::data::BinaryProgram>>>
         binary_programs = 0,
-    flatbuffers::Offset<
-        flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::cl::data::CLNode>>>
+    flatbuffers::Offset<flatbuffers::Vector<
+        flatbuffers::Offset<tflite::gpu::cl::data::GpuNode>>>
         nodes = 0,
     flatbuffers::Offset<flatbuffers::Vector<
         flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>>>
@@ -449,8 +469,15 @@ inline flatbuffers::Offset<InferenceContext> CreateInferenceContext(
         variable_ids_and_refs = 0,
     flatbuffers::Offset<flatbuffers::Vector<int32_t>> output_ids = 0,
     flatbuffers::Offset<flatbuffers::Vector<int64_t>> input_refs = 0,
-    flatbuffers::Offset<flatbuffers::Vector<int64_t>> output_refs = 0) {
+    flatbuffers::Offset<flatbuffers::Vector<int64_t>> output_refs = 0,
+    flatbuffers::Offset<
+        flatbuffers::Vector<flatbuffers::Offset<tflite::gpu::data::Int3>>>
+        tuned_work_group_sizes_per_node = 0,
+    flatbuffers::Offset<flatbuffers::Vector<uint64_t>> fingerprints_per_node =
+        0) {
   InferenceContextBuilder builder_(_fbb);
+  builder_.add_fingerprints_per_node(fingerprints_per_node);
+  builder_.add_tuned_work_group_sizes_per_node(tuned_work_group_sizes_per_node);
   builder_.add_output_refs(output_refs);
   builder_.add_input_refs(input_refs);
   builder_.add_output_ids(output_ids);
@@ -468,7 +495,7 @@ inline flatbuffers::Offset<InferenceContext> CreateInferenceContextDirect(
     flatbuffers::FlatBufferBuilder &_fbb, const char *driver_version = nullptr,
     const std::vector<flatbuffers::Offset<tflite::gpu::cl::data::BinaryProgram>>
         *binary_programs = nullptr,
-    const std::vector<flatbuffers::Offset<tflite::gpu::cl::data::CLNode>>
+    const std::vector<flatbuffers::Offset<tflite::gpu::cl::data::GpuNode>>
         *nodes = nullptr,
     const std::vector<
         flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>> *tensors =
@@ -481,7 +508,10 @@ inline flatbuffers::Offset<InferenceContext> CreateInferenceContextDirect(
         *variable_ids_and_refs = nullptr,
     const std::vector<int32_t> *output_ids = nullptr,
     const std::vector<int64_t> *input_refs = nullptr,
-    const std::vector<int64_t> *output_refs = nullptr) {
+    const std::vector<int64_t> *output_refs = nullptr,
+    const std::vector<flatbuffers::Offset<tflite::gpu::data::Int3>>
+        *tuned_work_group_sizes_per_node = nullptr,
+    const std::vector<uint64_t> *fingerprints_per_node = nullptr) {
   auto driver_version__ =
       driver_version ? _fbb.CreateString(driver_version) : 0;
   auto binary_programs__ =
@@ -490,7 +520,10 @@ inline flatbuffers::Offset<InferenceContext> CreateInferenceContextDirect(
                 flatbuffers::Offset<tflite::gpu::cl::data::BinaryProgram>>(
                 *binary_programs)
           : 0;
-  auto nodes__ = nodes ? _fbb.CreateVector<flatbuffers::Offset<tflite::gpu::cl::data::CLNode>>(*nodes) : 0;
+  auto nodes__ =
+      nodes ? _fbb.CreateVector<
+                  flatbuffers::Offset<tflite::gpu::cl::data::GpuNode>>(*nodes)
+            : 0;
   auto tensors__ = tensors ? _fbb.CreateVector<flatbuffers::Offset<tflite::gpu::cl::data::TensorDescWithId>>(*tensors) : 0;
   auto const_tensors__ =
       const_tensors
@@ -503,10 +536,20 @@ inline flatbuffers::Offset<InferenceContext> CreateInferenceContextDirect(
   auto output_ids__ = output_ids ? _fbb.CreateVector<int32_t>(*output_ids) : 0;
   auto input_refs__ = input_refs ? _fbb.CreateVector<int64_t>(*input_refs) : 0;
   auto output_refs__ = output_refs ? _fbb.CreateVector<int64_t>(*output_refs) : 0;
+  auto tuned_work_group_sizes_per_node__ =
+      tuned_work_group_sizes_per_node
+          ? _fbb.CreateVector<flatbuffers::Offset<tflite::gpu::data::Int3>>(
+                *tuned_work_group_sizes_per_node)
+          : 0;
+  auto fingerprints_per_node__ =
+      fingerprints_per_node
+          ? _fbb.CreateVector<uint64_t>(*fingerprints_per_node)
+          : 0;
   return tflite::gpu::cl::data::CreateInferenceContext(
       _fbb, driver_version__, binary_programs__, nodes__, tensors__,
       const_tensors__, input_ids__, variable_ids_and_refs__, output_ids__,
-      input_refs__, output_refs__);
+      input_refs__, output_refs__, tuned_work_group_sizes_per_node__,
+      fingerprints_per_node__);
 }
 
 inline const tflite::gpu::cl::data::InferenceContext *GetInferenceContext(const void *buf) {
diff --git a/tensorflow/lite/delegates/gpu/cl/tensor.h b/tensorflow/lite/delegates/gpu/cl/tensor.h
index e1887946ffb165..cdc7fa2d7f235d 100644
--- a/tensorflow/lite/delegates/gpu/cl/tensor.h
+++ b/tensorflow/lite/delegates/gpu/cl/tensor.h
@@ -68,7 +68,7 @@ class Tensor : public GPUObject, public GpuSpatialTensor {
   int Slices() const override { return DivideRoundUp(shape_.c, 4); }
   int Batch() const override { return shape_.b; }
 
-  TensorDescriptor GetDescriptor() const { return descriptor_; }
+  TensorDescriptor GetDescriptor() const override { return descriptor_; }
   DataType GetDataType() const { return descriptor_.data_type; }
   TensorStorageType GetStorageType() const { return descriptor_.storage_type; }
 
diff --git a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
index 072ead4bbfbf0f..64cc896ce75391 100644
--- a/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
+++ b/tensorflow/lite/delegates/gpu/cl/testing/performance_profiling.cc
@@ -30,6 +30,60 @@ namespace tflite {
 namespace gpu {
 namespace cl {
 
+absl::Status RunExternalImmutableSample(const std::string& model_name) {
+  auto flatbuffer = tflite::FlatBufferModel::BuildFromFile(model_name.c_str());
+  GraphFloat32 graph_cl;
+  ops::builtin::BuiltinOpResolver op_resolver;
+  RETURN_IF_ERROR(BuildFromFlatBuffer(*flatbuffer, op_resolver, &graph_cl,
+                                      /*allow_quant_ops*/ true));
+
+  Environment env;
+  RETURN_IF_ERROR(CreateEnvironment(&env));
+
+  InferenceContext::CreateInferenceInfo create_info;
+  create_info.precision = env.IsSupported(CalculationsPrecision::F16)
+                              ? CalculationsPrecision::F16
+                              : CalculationsPrecision::F32;
+  create_info.storage_type = GetFastestStorageType(env.device().GetInfo());
+  create_info.hints.Add(ModelHints::kAllowSpecialKernels);
+  // Example of external immutable tensors:
+  std::vector<Tensor> outputs(graph_cl.outputs().size());
+  for (int i = 0; i < graph_cl.outputs().size(); ++i) {
+    // Assumed that graph outputs have batch size = 1.
+    auto data_type = DeduceDataTypeFromPrecision(create_info.precision);
+    RETURN_IF_ERROR(CreateTensor(
+        env.context(), graph_cl.outputs()[i]->tensor.shape,
+        TensorDescriptor{data_type, TensorStorageType::TEXTURE_ARRAY,
+                         Layout::HWC},
+        &outputs[i]));
+    create_info.external_immutable_tensors[graph_cl.outputs()[i]->id] =
+        &outputs[i];
+  }
+  std::cout << "Precision: " << ToString(create_info.precision) << std::endl;
+  std::cout << "Storage type: " << ToString(create_info.storage_type)
+            << std::endl;
+  InferenceContext context;
+  RETURN_IF_ERROR(
+      context.InitFromGraphWithTransforms(create_info, &graph_cl, &env));
+
+  RETURN_IF_ERROR(context.AddToQueue(env.queue()));
+
+  // outputs can be used here. But AddToQueue do not have cpu
+  // syncronization.
+  RETURN_IF_ERROR(env.queue()->WaitForCompletion());
+
+  const auto dst_shape = BHWC(outputs[0].Batch(), outputs[0].Height(),
+                              outputs[0].Width(), outputs[0].Channels());
+  TensorFloat32 cpu_tensor;
+  cpu_tensor.shape = dst_shape;
+  cpu_tensor.data.resize(dst_shape.DimensionsProduct());
+  RETURN_IF_ERROR(outputs[0].ReadData(env.queue(), &cpu_tensor));
+  std::cout << "First tensor data at index 0 - " << cpu_tensor.data[0]
+            << std::endl;
+
+  return absl::OkStatus();
+}
+
 absl::Status RunSerializedTest(const std::string& model_name) {
   auto flatbuffer = tflite::FlatBufferModel::BuildFromFile(model_name.c_str());
   GraphFloat32 graph_cl;
@@ -210,5 +264,14 @@ int main(int argc, char** argv) {
     }
   }
 
+  bool run_with_external_immutable_tensors = false;
+  if (run_with_external_immutable_tensors) {
+    run_status = tflite::gpu::cl::RunExternalImmutableSample(argv[1]);
+    if (!run_status.ok()) {
+      std::cerr << run_status.message();
+      return -1;
+    }
+  }
+
   return EXIT_SUCCESS;
 }
diff --git a/tensorflow/lite/delegates/gpu/common/gpu_info.cc b/tensorflow/lite/delegates/gpu/common/gpu_info.cc
index 402296385a6aad..3c14a64fefbc15 100644
--- a/tensorflow/lite/delegates/gpu/common/gpu_info.cc
+++ b/tensorflow/lite/delegates/gpu/common/gpu_info.cc
@@ -105,20 +105,21 @@ AdrenoGpu GetAdrenoGpuVersion(const std::string& gpu_description) {
 }
 
 MaliGpu GetMaliGpuVersion(const std::string& gpu_description) {
-  const std::map<std::string, MaliGpu> kMapping = {
+  // Order must be preserved
+  const std::vector<std::pair<std::string, MaliGpu>> kMapping = {
       {"t604", MaliGpu::kT604}, {"t622", MaliGpu::kT622},
       {"t624", MaliGpu::kT624}, {"t628", MaliGpu::kT628},
       {"t658", MaliGpu::kT658}, {"t678", MaliGpu::kT678},
       {"t720", MaliGpu::kT720}, {"t760", MaliGpu::kT760},
       {"t820", MaliGpu::kT820}, {"t830", MaliGpu::kT830},
       {"t860", MaliGpu::kT860}, {"t880", MaliGpu::kT880},
-      {"g31", MaliGpu::kG31},   {"g51", MaliGpu::kG51},
-      {"g71", MaliGpu::kG71},   {"g52", MaliGpu::kG52},
+      {"g310", MaliGpu::kG310}, {"g31", MaliGpu::kG31},
+      {"g510", MaliGpu::kG510}, {"g51", MaliGpu::kG51},
+      {"g52", MaliGpu::kG52},   {"g57", MaliGpu::kG57},
+      {"g610", MaliGpu::kG610}, {"g68", MaliGpu::kG68},
+      {"g710", MaliGpu::kG710}, {"g71", MaliGpu::kG71},
       {"g72", MaliGpu::kG72},   {"g76", MaliGpu::kG76},
-      {"g57", MaliGpu::kG57},   {"g77", MaliGpu::kG77},
-      {"g68", MaliGpu::kG68},   {"g78", MaliGpu::kG78},
-      {"g310", MaliGpu::kG310}, {"g510", MaliGpu::kG510},
-      {"g610", MaliGpu::kG610}, {"g710", MaliGpu::kG710},
+      {"g77", MaliGpu::kG77},   {"g78", MaliGpu::kG78},
   };
   for (const auto& v : kMapping) {
     if (gpu_description.find(v.first) != std::string::npos) {
diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc
index e4fe27e1263139..1f6beafe78a589 100644
--- a/tensorflow/lite/delegates/gpu/common/model_builder.cc
+++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc
@@ -2664,7 +2664,7 @@ TfLiteIntArray* GetOpsToReplace(
         !IsAllAllowedTensors(context, node->outputs, allowed_out_types)) {
       if (unsupported_details) {
         *unsupported_details =
-            "OP is supported, but tensor type/shape doesn't supported.";
+            "OP is supported, but tensor type/shape isn't compatible.";
       }
       return false;
     }
diff --git a/tensorflow/lite/delegates/gpu/common/selectors/operation_selector.cc b/tensorflow/lite/delegates/gpu/common/selectors/operation_selector.cc
index 78bfe360b420e6..0781667f697f0d 100644
--- a/tensorflow/lite/delegates/gpu/common/selectors/operation_selector.cc
+++ b/tensorflow/lite/delegates/gpu/common/selectors/operation_selector.cc
@@ -330,6 +330,11 @@ absl::Status GPUOperationFromNodePart0(
           gpu_op = InitSingleOpSubgraph(inputs, outputs, gpu_subgraph);
           *gpu_op =
               SelectConvolution(attr, output_shape, gpu_info, op_def, hints);
+          uint64_t dst_elements =
+              output_shape.b * output_shape.h * output_shape.w * output_shape.c;
+          // 2 flops per element, we have for every element multiply and add
+          (*gpu_op)->flops_ = dst_elements * attr.weights.shape.i *
+                              attr.weights.shape.w * attr.weights.shape.h * 2;
           return absl::OkStatus();
         }
       } else {
diff --git a/tensorflow/lite/delegates/gpu/common/task/BUILD b/tensorflow/lite/delegates/gpu/common/task/BUILD
index 357610c22d4d21..e94b49d9b493a2 100644
--- a/tensorflow/lite/delegates/gpu/common/task/BUILD
+++ b/tensorflow/lite/delegates/gpu/common/task/BUILD
@@ -80,6 +80,9 @@ cc_library(
 cc_library(
     name = "gpu_tensor",
     hdrs = ["gpu_tensor.h"],
+    deps = [
+        "//tensorflow/lite/delegates/gpu/common/task:tensor_desc",
+    ],
 )
 
 cc_library(
diff --git a/tensorflow/lite/delegates/gpu/common/task/gpu_operation.cc b/tensorflow/lite/delegates/gpu/common/task/gpu_operation.cc
index 68bbb2ee81f825..b828559a6724d5 100644
--- a/tensorflow/lite/delegates/gpu/common/task/gpu_operation.cc
+++ b/tensorflow/lite/delegates/gpu/common/task/gpu_operation.cc
@@ -98,6 +98,7 @@ GPUOperation::GPUOperation(GPUOperation&& operation)
       elementwise_(operation.elementwise_),
       linkable_(operation.linkable_),
       check_src_channels_size_(operation.check_src_channels_size_),
+      flops_(operation.flops_),
       definition_(std::move(operation.definition_)),
       src_(std::move(operation.src_)),
       dst_(std::move(operation.dst_)),
@@ -120,6 +121,7 @@ GPUOperation& GPUOperation::operator=(GPUOperation&& operation) {
     elementwise_ = operation.elementwise_;
     linkable_ = operation.linkable_;
     check_src_channels_size_ = operation.check_src_channels_size_;
+    flops_ = operation.flops_;
     definition_ = std::move(operation.definition_);
     src_ = std::move(operation.src_);
     dst_ = std::move(operation.dst_);
diff --git a/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h b/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h
index 263c31435e0d93..e1fe68769f722c 100644
--- a/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h
+++ b/tensorflow/lite/delegates/gpu/common/task/gpu_operation.h
@@ -160,6 +160,9 @@ class GPUOperation {
   // applicable only with elementwise_ = true;
   bool check_src_channels_size_ = false;
 
+  // for profiling
+  uint64_t flops_ = 0;
+
  protected:
   friend class cl::ClOperation;
   friend class gl::GlOperation;
diff --git a/tensorflow/lite/delegates/gpu/common/task/gpu_tensor.h b/tensorflow/lite/delegates/gpu/common/task/gpu_tensor.h
index 8a6120339c2215..c960f1af9ae25b 100644
--- a/tensorflow/lite/delegates/gpu/common/task/gpu_tensor.h
+++ b/tensorflow/lite/delegates/gpu/common/task/gpu_tensor.h
@@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASK_GPU_TENSOR_H_
 #define TENSORFLOW_LITE_DELEGATES_GPU_COMMON_TASK_GPU_TENSOR_H_
 
+#include "tensorflow/lite/delegates/gpu/common/task/tensor_desc.h"
+
 namespace tflite {
 namespace gpu {
 
@@ -34,6 +36,8 @@ class GpuSpatialTensor {
   virtual int Channels() const = 0;
   virtual int Slices() const = 0;
   virtual int Batch() const = 0;
+
+  virtual TensorDescriptor GetDescriptor() const = 0;
 };
 
 }  // namespace gpu
diff --git a/tensorflow/lite/delegates/gpu/common/task/profiling_info.cc b/tensorflow/lite/delegates/gpu/common/task/profiling_info.cc
index 6f655555e77c61..630290f6bc3260 100644
--- a/tensorflow/lite/delegates/gpu/common/task/profiling_info.cc
+++ b/tensorflow/lite/delegates/gpu/common/task/profiling_info.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/common/task/profiling_info.h"
 
 #include <map>
+#include <string>
 
 namespace tflite {
 namespace gpu {
@@ -40,7 +41,22 @@ std::string ProfilingInfo::GetDetailedReport() const {
   for (const auto& dispatch : dispatches) {
     result += "  " + dispatch.label + " - " +
               std::to_string(absl::ToDoubleMilliseconds(dispatch.duration)) +
-              " ms\n";
+              " ms";
+    const double times_per_sec =
+        1000.0 / absl::ToDoubleMilliseconds(dispatch.duration);
+    if (dispatch.read_mem_size && dispatch.write_mem_size) {
+      const uint64_t total_size =
+          dispatch.read_mem_size + dispatch.write_mem_size;
+      const double giga_bytes = total_size / 1024.0 / 1024.0 / 1024.0;
+      const double giga_bytes_per_sec = times_per_sec * giga_bytes;
+      result += ", " + std::to_string(giga_bytes_per_sec) + " Gb/s";
+    }
+    if (dispatch.flops) {
+      const double giga_flops = dispatch.flops / 1000.0 / 1000.0 / 1000.0;
+      const double giga_flops_per_sec = times_per_sec * giga_flops;
+      result += ", " + std::to_string(giga_flops_per_sec) + " Gflops";
+    }
+    result += "\n";
     auto name = dispatch.label.substr(0, dispatch.label.find(' '));
     if (statistics.find(name) != statistics.end()) {
       statistics[name].count++;
diff --git a/tensorflow/lite/delegates/gpu/common/task/profiling_info.h b/tensorflow/lite/delegates/gpu/common/task/profiling_info.h
index a22bdf72fbfdc5..3758b792f038ec 100644
--- a/tensorflow/lite/delegates/gpu/common/task/profiling_info.h
+++ b/tensorflow/lite/delegates/gpu/common/task/profiling_info.h
@@ -28,6 +28,9 @@ struct ProfilingInfo {
   struct DispatchInfo {
     std::string label;
     absl::Duration duration;
+    uint64_t read_mem_size = 0;
+    uint64_t write_mem_size = 0;
+    uint64_t flops = 0;
   };
 
   std::vector<DispatchInfo> dispatches;
diff --git a/tensorflow/lite/delegates/gpu/common/task/serialization_base.fbs b/tensorflow/lite/delegates/gpu/common/task/serialization_base.fbs
index c3fff0320270bd..906a7c9cc14577 100644
--- a/tensorflow/lite/delegates/gpu/common/task/serialization_base.fbs
+++ b/tensorflow/lite/delegates/gpu/common/task/serialization_base.fbs
@@ -235,6 +235,7 @@ table GPUOperation {
   elementwise:bool;
   linkable:bool;
   check_src_channels_size:bool;
+  flops:uint64;
   definition:OperationDef;
   grid_dimension:int32;
   work_group_launch_order:Int3;
diff --git a/tensorflow/lite/delegates/gpu/common/task/serialization_base_generated.h b/tensorflow/lite/delegates/gpu/common/task/serialization_base_generated.h
index 8c2e876521fe14..a8f7b714e804c1 100644
--- a/tensorflow/lite/delegates/gpu/common/task/serialization_base_generated.h
+++ b/tensorflow/lite/delegates/gpu/common/task/serialization_base_generated.h
@@ -2144,15 +2144,16 @@ struct GPUOperation FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     VT_ELEMENTWISE = 14,
     VT_LINKABLE = 16,
     VT_CHECK_SRC_CHANNELS_SIZE = 18,
-    VT_DEFINITION = 20,
-    VT_GRID_DIMENSION = 22,
-    VT_WORK_GROUP_LAUNCH_ORDER = 24,
-    VT_GRID_SIZE = 26,
-    VT_SRC_TENSORS_NAMES = 28,
-    VT_DST_TENSORS_NAMES = 30,
-    VT_WORK_GROUPS_COUNT = 32,
-    VT_LINKABLE_COUNT = 34,
-    VT_ELEMENTWISE_CODE = 36
+    VT_FLOPS = 20,
+    VT_DEFINITION = 22,
+    VT_GRID_DIMENSION = 24,
+    VT_WORK_GROUP_LAUNCH_ORDER = 26,
+    VT_GRID_SIZE = 28,
+    VT_SRC_TENSORS_NAMES = 30,
+    VT_DST_TENSORS_NAMES = 32,
+    VT_WORK_GROUPS_COUNT = 34,
+    VT_LINKABLE_COUNT = 36,
+    VT_ELEMENTWISE_CODE = 38
   };
   const tflite::gpu::data::Arguments *arguments() const {
     return GetPointer<const tflite::gpu::data::Arguments *>(VT_ARGUMENTS);
@@ -2179,6 +2180,7 @@ struct GPUOperation FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   bool check_src_channels_size() const {
     return GetField<uint8_t>(VT_CHECK_SRC_CHANNELS_SIZE, 0) != 0;
   }
+  uint64_t flops() const { return GetField<uint64_t>(VT_FLOPS, 0); }
   const tflite::gpu::data::OperationDef *definition() const {
     return GetPointer<const tflite::gpu::data::OperationDef *>(VT_DEFINITION);
   }
@@ -2226,6 +2228,7 @@ struct GPUOperation FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyField<uint8_t>(verifier, VT_ELEMENTWISE) &&
            VerifyField<uint8_t>(verifier, VT_LINKABLE) &&
            VerifyField<uint8_t>(verifier, VT_CHECK_SRC_CHANNELS_SIZE) &&
+           VerifyField<uint64_t>(verifier, VT_FLOPS) &&
            VerifyOffset(verifier, VT_DEFINITION) &&
            verifier.VerifyTable(definition()) &&
            VerifyField<int32_t>(verifier, VT_GRID_DIMENSION) &&
@@ -2284,6 +2287,9 @@ struct GPUOperationBuilder {
     fbb_.AddElement<uint8_t>(GPUOperation::VT_CHECK_SRC_CHANNELS_SIZE,
                              static_cast<uint8_t>(check_src_channels_size), 0);
   }
+  void add_flops(uint64_t flops) {
+    fbb_.AddElement<uint64_t>(GPUOperation::VT_FLOPS, flops, 0);
+  }
   void add_definition(
       flatbuffers::Offset<tflite::gpu::data::OperationDef> definition) {
     fbb_.AddOffset(GPUOperation::VT_DEFINITION, definition);
@@ -2346,7 +2352,7 @@ inline flatbuffers::Offset<GPUOperation> CreateGPUOperation(
     tflite::gpu::data::TensorToGrid tensor_to_grid =
         tflite::gpu::data::TensorToGrid::CUSTOM,
     bool elementwise = false, bool linkable = false,
-    bool check_src_channels_size = false,
+    bool check_src_channels_size = false, uint64_t flops = 0,
     flatbuffers::Offset<tflite::gpu::data::OperationDef> definition = 0,
     int32_t grid_dimension = 0,
     flatbuffers::Offset<tflite::gpu::data::Int3> work_group_launch_order = 0,
@@ -2361,6 +2367,7 @@ inline flatbuffers::Offset<GPUOperation> CreateGPUOperation(
     int32_t linkable_count = 0,
     flatbuffers::Offset<flatbuffers::String> elementwise_code = 0) {
   GPUOperationBuilder builder_(_fbb);
+  builder_.add_flops(flops);
   builder_.add_elementwise_code(elementwise_code);
   builder_.add_linkable_count(linkable_count);
   builder_.add_work_groups_count(work_groups_count);
@@ -2391,7 +2398,7 @@ inline flatbuffers::Offset<GPUOperation> CreateGPUOperationDirect(
     tflite::gpu::data::TensorToGrid tensor_to_grid =
         tflite::gpu::data::TensorToGrid::CUSTOM,
     bool elementwise = false, bool linkable = false,
-    bool check_src_channels_size = false,
+    bool check_src_channels_size = false, uint64_t flops = 0,
     flatbuffers::Offset<tflite::gpu::data::OperationDef> definition = 0,
     int32_t grid_dimension = 0,
     flatbuffers::Offset<tflite::gpu::data::Int3> work_group_launch_order = 0,
@@ -2423,7 +2430,7 @@ inline flatbuffers::Offset<GPUOperation> CreateGPUOperationDirect(
       elementwise_code ? _fbb.CreateString(elementwise_code) : 0;
   return tflite::gpu::data::CreateGPUOperation(
       _fbb, arguments, code__, work_group_size, compiler_options__,
-      tensor_to_grid, elementwise, linkable, check_src_channels_size,
+      tensor_to_grid, elementwise, linkable, check_src_channels_size, flops,
       definition, grid_dimension, work_group_launch_order, grid_size,
       src_tensors_names__, dst_tensors_names__, work_groups_count,
       linkable_count, elementwise_code__);
diff --git a/tensorflow/lite/delegates/gpu/metal/inference_context.cc b/tensorflow/lite/delegates/gpu/metal/inference_context.cc
index 702be188e06568..debeb7483a3d2c 100644
--- a/tensorflow/lite/delegates/gpu/metal/inference_context.cc
+++ b/tensorflow/lite/delegates/gpu/metal/inference_context.cc
@@ -146,6 +146,46 @@ class TensorReserver {
   ValueId next_;
 };
 
+absl::Status CheckExternalTensorDescription(const GpuInfo& gpu_info,
+                                            const TensorDescriptor& tensor_desc,
+                                            const BHWC& shape,
+                                            DataType data_type) {
+  if (tensor_desc.data_type != data_type) {
+    return absl::InvalidArgumentError(
+        "Global precision and precision of predefined/external tensors must be "
+        "synchronized.");
+  }
+  const bool tensor_supported_layout = tensor_desc.layout == Layout::HWDC ||
+                                       tensor_desc.layout == Layout::BHWDC ||
+                                       tensor_desc.layout == Layout::HWC ||
+                                       tensor_desc.layout == Layout::BHWC;
+  if (!tensor_supported_layout) {
+    return absl::InvalidArgumentError(
+        "Currently no support of this layouts for spatial tensors.");
+  }
+  const bool has_depth =
+      tensor_desc.layout == Layout::HWDC || tensor_desc.layout == Layout::BHWDC;
+  if (has_depth) {
+    return absl::InvalidArgumentError(
+        "Currently no support of Depth dimension in predefined/external "
+        "tensors.");
+  }
+  const bool has_batch =
+      tensor_desc.layout == Layout::BHWC || tensor_desc.layout == Layout::BHWDC;
+  if (has_batch && shape.b == 1) {
+    return absl::InvalidArgumentError("Wrong layout, batch mismatch.");
+  }
+  if (!has_batch && shape.b != 1) {
+    return absl::InvalidArgumentError("Wrong layout, batch mismatch.");
+  }
+  if (!CanCreateTensorWithShape(gpu_info, shape, tensor_desc).ok()) {
+    return absl::UnavailableError(
+        "Current device can not allocate tensor with this shape for "
+        "predefined/external descriptor.");
+  }
+  return absl::OkStatus();
+}
+
 absl::Status ReserveGraphTensors(
     const InferenceContext::CreateInferenceInfo& create_info,
     const GpuInfo& gpu_info, const GraphFloat32& graph,
@@ -155,48 +195,33 @@ absl::Status ReserveGraphTensors(
   auto data_type = DeduceDataTypeFromPrecision(create_info.precision);
   for (auto& t : tensors) {
     const auto shape = graph.GetValue(t->id)->tensor.shape;
-    auto it_preallocated = create_info.preallocated.find(t->id);
+    auto it_immutable_external =
+        create_info.external_immutable_tensors.find(t->id);
+    auto it_mutable_external = create_info.preallocated.find(t->id);
     TensorDescriptor tensor_desc;
-    if (it_preallocated != create_info.preallocated.end()) {
+    if (it_immutable_external != create_info.external_immutable_tensors.end()) {
       if (!(graph.IsGraphInput(t->id) || graph.IsGraphOutput(t->id))) {
         return absl::InvalidArgumentError(
-            "Currently preallocated can be used only for graph inputs/outputs");
+            "Currently external tensors can be used only for graph "
+            "inputs/outputs");
       }
-      tensor_desc = it_preallocated->second;
-      if (tensor_desc.data_type != data_type) {
-        return absl::InvalidArgumentError(
-            "Global precision and precision of preallocated tensors must be "
-            "synchronized.");
-      }
-      const bool tensor_supported_layout =
-          tensor_desc.layout == Layout::HWDC ||
-          tensor_desc.layout == Layout::BHWDC ||
-          tensor_desc.layout == Layout::HWC ||
-          tensor_desc.layout == Layout::BHWC;
-      if (!tensor_supported_layout) {
-        return absl::InvalidArgumentError(
-            "Currently no support of this layouts for spatial tensors.");
-      }
-      const bool has_depth = tensor_desc.layout == Layout::HWDC ||
-                             tensor_desc.layout == Layout::BHWDC;
-      if (has_depth) {
+      // tensor_desc = it_immutable_external->second->GetDescriptor();
+      RETURN_IF_ERROR(CheckExternalTensorDescription(gpu_info, tensor_desc,
+                                                     shape, data_type));
+    } else if (it_mutable_external != create_info.preallocated.end()) {
+      if (!(graph.IsGraphInput(t->id) || graph.IsGraphOutput(t->id))) {
         return absl::InvalidArgumentError(
-            "Currently no support of Depth dimension in predefined tensors.");
-      }
-      const bool has_batch = tensor_desc.layout == Layout::BHWC ||
-                             tensor_desc.layout == Layout::BHWDC;
-      if (has_batch && shape.b == 1) {
-        return absl::InvalidArgumentError("Wrong layout, batch mismatch.");
-      }
-      if (!has_batch && shape.b != 1) {
-        return absl::InvalidArgumentError("Wrong layout, batch mismatch.");
+            "Currently external tensors can be used only for graph "
+            "inputs/outputs");
       }
+      tensor_desc = it_mutable_external->second;
+      RETURN_IF_ERROR(CheckExternalTensorDescription(
+          gpu_info, it_mutable_external->second, shape, data_type));
     } else {
       TensorStorageType storage_type = create_info.storage_type;
       Layout layout = shape.b == 1 ? Layout::HWC : Layout::BHWC;
       if (graph.IsGraphInput(t->id) || graph.IsGraphOutput(t->id)) {
-        // Temporary disabled because no support of SINGLE_TEXTURE_2D in Metal
-        if (false && shape.c < 4 &&
+        if (shape.c < 4 &&
             CanCreateTensorWithShape(
                 gpu_info, shape,
                 TensorDescriptor{data_type,
@@ -450,6 +475,14 @@ absl::Status InferenceContext::InitFromGraph(
   const_tensors_descs_ = std::move(gpu_model.const_tensors);
   tensors_descs_ = std::move(gpu_model.tensors);
 
+  for (const auto& external_tensor : create_info.external_immutable_tensors) {
+    auto* metal_spatial_tensor =
+        dynamic_cast<MetalSpatialTensor*>(external_tensor.second);
+    if (!metal_spatial_tensor) {
+      return absl::InvalidArgumentError("Expected MetalSpatialTensor.");
+    }
+    external_immutable_tensors_[external_tensor.first] = metal_spatial_tensor;
+  }
   RETURN_IF_ERROR(CompileOperations(&metal_device));
   RETURN_IF_ERROR(AllocateTensors(&metal_device, create_info.preallocated));
   BindTensorsToOperations();
@@ -516,7 +549,11 @@ absl::Status InferenceContext::AllocateTensors(
 }
 
 MetalSpatialTensor* InferenceContext::GetTensor(ValueId tensor_id) {
-  if (preallocated_tensors_.find(tensor_id) != preallocated_tensors_.end()) {
+  if (external_immutable_tensors_.find(tensor_id) !=
+      external_immutable_tensors_.end()) {
+    return external_immutable_tensors_[tensor_id];
+  } else if (preallocated_tensors_.find(tensor_id) !=
+             preallocated_tensors_.end()) {
     return &preallocated_tensors_[tensor_id];
   } else if (const_tensors_.find(tensor_id) != const_tensors_.end()) {
     return &const_tensors_[tensor_id];
@@ -580,8 +617,11 @@ absl::Status InferenceContext::UpdateParams(const GpuInfo& gpu_info) {
 
 InferenceContext::TensorMemoryType InferenceContext::GetTensorMemoryType(
     ValueId id) {
-  if (preallocated_tensors_.find(id) != preallocated_tensors_.end()) {
-    return TensorMemoryType::kPreallocated;
+  if (external_immutable_tensors_.find(id) !=
+      external_immutable_tensors_.end()) {
+    return TensorMemoryType::kExternal;
+  } else if (preallocated_tensors_.find(id) != preallocated_tensors_.end()) {
+    return TensorMemoryType::kExternal;
   } else if (const_tensors_.find(id) != const_tensors_.end()) {
     return TensorMemoryType::kConst;
   } else if (IsBufferBased(tensors_descs_[id].storage_type)) {
diff --git a/tensorflow/lite/delegates/gpu/metal/inference_context.h b/tensorflow/lite/delegates/gpu/metal/inference_context.h
index 6a1983dc68ad4b..9b0847cd75d5c3 100644
--- a/tensorflow/lite/delegates/gpu/metal/inference_context.h
+++ b/tensorflow/lite/delegates/gpu/metal/inference_context.h
@@ -81,6 +81,19 @@ class InferenceContext {
     //   3) Layout must be without Batch dimension if tensor.shape.b == 1
     //      Layout must be with Batch dimension if tensor.shape.b != 1
     std::map<ValueId, TensorDescriptor> preallocated;
+
+    // User can provide immutable external tensors for inference context.
+    // Some restrictions apply:
+    //   1) ValueId must be input or output id of GraphFloat32
+    //   2) Provided ptrs must be valid during life of InferenceContext.
+    //   3) data_type must be equal to DeduceDataTypeFromPrecision(precision);
+    //      for example for precision F16, data_type must be FLOAT16
+    //   4) Layout must be without Batch dimension if tensor.shape.b == 1
+    //      Layout must be with Batch dimension if tensor.shape.b != 1
+    // InitFromGraph will fail if gpu can not allocate tensor with requested
+    // tensor descriptor
+    // WARNING: This is an experimental API and subject to change.
+    absl::flat_hash_map<ValueId, GpuSpatialTensor*> external_immutable_tensors;
   };
 
   struct GpuModel {
@@ -165,7 +178,7 @@ class InferenceContext {
     kBuffer,
     kVariable,
     kConst,
-    kPreallocated
+    kExternal
   };
 
   absl::Status CompileOperations(MetalDevice* device);
@@ -192,6 +205,7 @@ class InferenceContext {
   std::vector<ValueId> output_ids_;
   std::map<ValueId, MetalSpatialTensor> preallocated_tensors_;
 
+  absl::flat_hash_map<ValueId, MetalSpatialTensor*> external_immutable_tensors_;
   absl::flat_hash_map<ValueId, TensorDescriptor> const_tensors_descs_;
   std::map<ValueId, MetalSpatialTensor> const_tensors_;
 
diff --git a/tensorflow/lite/delegates/gpu/metal/metal_spatial_tensor.h b/tensorflow/lite/delegates/gpu/metal/metal_spatial_tensor.h
index c74c28df6da9bb..e64e56be7be650 100644
--- a/tensorflow/lite/delegates/gpu/metal/metal_spatial_tensor.h
+++ b/tensorflow/lite/delegates/gpu/metal/metal_spatial_tensor.h
@@ -61,7 +61,7 @@ class MetalSpatialTensor : public GPUObject, public GpuSpatialTensor {
   int Slices() const override { return DivideRoundUp(shape_.c, 4); }
   int Batch() const override { return shape_.b; }
 
-  TensorDescriptor GetDescriptor() const { return descriptor_; }
+  TensorDescriptor GetDescriptor() const override { return descriptor_; }
   DataType GetDataType() const { return descriptor_.data_type; }
   TensorStorageType GetStorageType() const { return descriptor_.storage_type; }
 
diff --git a/tensorflow/lite/delegates/gpu/metal_delegate.mm b/tensorflow/lite/delegates/gpu/metal_delegate.mm
index a7e62c78ae097c..00806e023ed67f 100644
--- a/tensorflow/lite/delegates/gpu/metal_delegate.mm
+++ b/tensorflow/lite/delegates/gpu/metal_delegate.mm
@@ -211,7 +211,6 @@ kernel void ComputeFunction(device int* output_buffer [[buffer(0)]],
     }
     for (auto& input : graph_inputs_) {
       if (input.tensor_id == tensor_index) {
-        input_output_buffers_[input.id] = buffer;
         if (bphwc4_buffers_[input.id] != buffer) {
           bphwc_buffers_updated_ = true;
         }
@@ -222,7 +221,6 @@ kernel void ComputeFunction(device int* output_buffer [[buffer(0)]],
     }
     for (auto& output : graph_outputs_) {
       if (output.tensor_id == tensor_index) {
-        input_output_buffers_[output.id] = buffer;
         if (bphwc4_buffers_[output.id] != buffer) {
           bphwc_buffers_updated_ = true;
         }
@@ -373,27 +371,17 @@ void SetCommandBuffer(id<MTLCommandBuffer> command_buffer) {
           input_tensor.shape,  // .shape
           false,               // .set_externally
       });
-      int bhwc_length = static_cast<int>(sizeof(float) * input_tensor.shape.DimensionsProduct());
+
+      // Create BHWC F32 buffer
+      int bhwc_f32_length =
+          static_cast<int>(sizeof(float) * input_tensor.shape.DimensionsProduct());
+      in_out_bhwc_f32_buffers_[input] =
+          [metal_device_ newBufferWithLength:bhwc_f32_length options:MTLResourceStorageModeShared];
+
       int bphwc4_length =
           static_cast<int>(storage_type_size * GetElementsSizeForPHWC4(input_tensor.shape));
-      id<MTLBuffer> buffer = [metal_device_ newBufferWithLength:bhwc_length
-                                                        options:MTLResourceStorageModeShared];
-      input_output_buffers_[input] = buffer;
-      if (options_.allow_precision_loss || input_tensor.shape.c != 4) {
-        bphwc4_buffers_[input] = [metal_device_ newBufferWithLength:bphwc4_length
-                                                            options:MTLResourceStorageModeShared];
-        if (converter_to_BPHWC4_ == nil) {
-          converter_to_BPHWC4_ =
-              [[TFLBufferConvert alloc] initWithDevice:metal_device_
-                                             isFloat16:options_.allow_precision_loss
-                                       convertToPBHWC4:true];
-          if (converter_to_BPHWC4_ == nil) {
-            return absl::InternalError("Error initialization of input buffer converter");
-          }
-        }
-      } else {
-        bphwc4_buffers_[input] = buffer;
-      }
+      bphwc4_buffers_[input] = [metal_device_ newBufferWithLength:bphwc4_length
+                                                          options:MTLResourceStorageModeShared];
     }
 
     std::vector<::tflite::gpu::ValueId> output_ids;
@@ -409,30 +397,33 @@ void SetCommandBuffer(id<MTLCommandBuffer> command_buffer) {
           output_tensor.shape,  // .shape
           false,                // .set_externally
       });
-      // Create BHWC buffer
+
+      // Create BHWC F32 buffer
       int bhwc_length = static_cast<int>(sizeof(float) * output_tensor.shape.DimensionsProduct());
+      in_out_bhwc_f32_buffers_[output] =
+          [metal_device_ newBufferWithLength:bhwc_length options:MTLResourceStorageModeShared];
+
       int bphwc4_length =
           static_cast<int>(storage_type_size * GetElementsSizeForPHWC4(output_tensor.shape));
-      id<MTLBuffer> buffer = [metal_device_ newBufferWithLength:bhwc_length
-                                                        options:MTLResourceStorageModeShared];
-      input_output_buffers_[output] = buffer;
-      if (options_.allow_precision_loss || output_tensor.shape.c != 4) {
-        bphwc4_buffers_[output] = [metal_device_ newBufferWithLength:bphwc4_length
-                                                             options:MTLResourceStorageModeShared];
-        if (converter_from_BPHWC4_ == nil) {
-          converter_from_BPHWC4_ =
-              [[TFLBufferConvert alloc] initWithDevice:metal_device_
-                                             isFloat16:options_.allow_precision_loss
-                                       convertToPBHWC4:false];
-          if (converter_from_BPHWC4_ == nil) {
-            return absl::InternalError("Error initialization of output buffer converter");
-          }
-        }
-      } else {
-        bphwc4_buffers_[output] = buffer;
-      }
+      bphwc4_buffers_[output] = [metal_device_ newBufferWithLength:bphwc4_length
+                                                           options:MTLResourceStorageModeShared];
+    }
+
+    // allocate converter bhwc->bphwc4
+    converter_to_BPHWC4_ = [[TFLBufferConvert alloc] initWithDevice:metal_device_
+                                                          isFloat16:options_.allow_precision_loss
+                                                    convertToPBHWC4:true];
+    if (converter_to_BPHWC4_ == nil) {
+      return absl::InternalError("Error initialization of input buffer converter");
+    }
+
+    // allocate converter bphwc4->bhwc
+    converter_from_BPHWC4_ = [[TFLBufferConvert alloc] initWithDevice:metal_device_
+                                                            isFloat16:options_.allow_precision_loss
+                                                      convertToPBHWC4:false];
+    if (converter_from_BPHWC4_ == nil) {
+      return absl::InternalError("Error initialization of output buffer converter");
     }
-    bphwc_buffers_updated_ = true;
 
     InferenceContext::CreateInferenceInfo create_info;
     create_info.precision = precision;
@@ -477,17 +468,18 @@ void SetCommandBuffer(id<MTLCommandBuffer> command_buffer) {
 
     // CPU HWC input data conversion to PHWC4 and fill the GPU buffer
     for (const auto& input : graph_inputs_) {
-      if (input.set_externally) continue;
+      if (input.set_externally) {
+        continue;
+      }
       // A user provides data on CPU memory for this buffer - need to copy to MTLBuffer
 
       TfLiteTensor* tensor = &context->tensors[input.tensor_id];
-      void* gpu_ptr = [input_output_buffers_[input.id] contents];
+      void* gpu_ptr = [in_out_bhwc_f32_buffers_[input.id] contents];
       std::memcpy(gpu_ptr, tensor->data.f, input.shape.DimensionsProduct() * sizeof(float));
-      if (input_output_buffers_[input.id] == bphwc4_buffers_[input.id]) continue;
       id<MTLComputeCommandEncoder> input_encoder = [command_buffer computeCommandEncoder];
       [converter_to_BPHWC4_ convertWithEncoder:input_encoder
                                          shape:input.shape
-                                  sourceBuffer:input_output_buffers_[input.id]
+                                  sourceBuffer:in_out_bhwc_f32_buffers_[input.id]
                                convertedBuffer:bphwc4_buffers_[input.id]];
       [input_encoder endEncoding];
     }
@@ -508,13 +500,14 @@ void SetCommandBuffer(id<MTLCommandBuffer> command_buffer) {
     }
 
     for (const auto& output : graph_outputs_) {
-      if (output.set_externally) continue;
-      if (bphwc4_buffers_[output.id] == input_output_buffers_[output.id]) continue;
+      if (output.set_externally) {
+        continue;
+      }
       id<MTLComputeCommandEncoder> output_encoder = [command_buffer computeCommandEncoder];
       [converter_from_BPHWC4_ convertWithEncoder:output_encoder
                                            shape:output.shape
                                     sourceBuffer:bphwc4_buffers_[output.id]
-                                 convertedBuffer:input_output_buffers_[output.id]];
+                                 convertedBuffer:in_out_bhwc_f32_buffers_[output.id]];
       [output_encoder endEncoding];
     }
 
@@ -571,7 +564,7 @@ void SetCommandBuffer(id<MTLCommandBuffer> command_buffer) {
       if (output.set_externally) continue;
       // A user retrieves data on CPU memory for this buffer - need to copy from MTLBuffer.
       TfLiteTensor* tensor = context->tensors + output.tensor_id;
-      const void* gpu_ptr = [input_output_buffers_[output.id] contents];
+      const void* gpu_ptr = [in_out_bhwc_f32_buffers_[output.id] contents];
       std::memcpy(tensor->data.f, gpu_ptr, output.shape.DimensionsProduct() * sizeof(float));
     }
     if (is_quantized_model) {
@@ -609,9 +602,10 @@ void SetCommandBuffer(id<MTLCommandBuffer> command_buffer) {
   absl::flat_hash_map<int, int> quant_conversion_map_;
 
   InferenceContext inference_context_;
-  // input and output buffers are passed into Metal inference engine
-  std::map<::tflite::gpu::ValueId, id<MTLBuffer>> input_output_buffers_;
-  std::map<::tflite::gpu::ValueId, id<MTLBuffer>> bphwc4_buffers_;
+  // Metal bhwc f32 input and output buffers for better conversion performance from cpu tensors
+  // We will memcpy cpu<->gpu and use metal for other conversions(layout changes, for example)
+  std::map<ValueId, id<MTLBuffer>> in_out_bhwc_f32_buffers_;
+  std::map<ValueId, id<MTLBuffer>> bphwc4_buffers_;
   bool bphwc_buffers_updated_ = true;
   TFLBufferConvert* converter_to_BPHWC4_ = nil;
   TFLBufferConvert* converter_from_BPHWC4_ = nil;
diff --git a/tensorflow/lite/delegates/hexagon/BUILD b/tensorflow/lite/delegates/hexagon/BUILD
index a70a95b5062a3d..d2aa9e0d3fedf1 100644
--- a/tensorflow/lite/delegates/hexagon/BUILD
+++ b/tensorflow/lite/delegates/hexagon/BUILD
@@ -66,6 +66,14 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "enable_hexagon_delegate",
+    defines = select({
+        "//tensorflow:arm_any": ["TFLITE_ENABLE_HEXAGON"],
+        "//conditions:default": [],
+    }),
+)
+
 cc_library(
     name = "hexagon_delegate",
     srcs = ["hexagon_delegate.cc"],
@@ -82,7 +90,12 @@ cc_library(
         "//tensorflow/lite:minimal_logging",
         "//tensorflow/lite/c:common",
         "//tensorflow/lite/delegates/utils:simple_delegate",
-    ],
+    ] + select({
+        "//tensorflow:ios": [],
+        "//tensorflow:ios_x86_64": [],
+        "//tensorflow:macos": [],
+        "//conditions:default": [":enable_hexagon_delegate"],
+    }),
 )
 
 cc_library(
diff --git a/tensorflow/lite/delegates/hexagon/hexagon_nn/adsprpc_interface.cc b/tensorflow/lite/delegates/hexagon/hexagon_nn/adsprpc_interface.cc
new file mode 100644
index 00000000000000..beb066c3f24a2a
--- /dev/null
+++ b/tensorflow/lite/delegates/hexagon/hexagon_nn/adsprpc_interface.cc
@@ -0,0 +1,217 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <sys/stat.h>
+
+#include <cstdio>
+#include <cstring>
+
+#include "tensorflow/lite/delegates/hexagon/hexagon_nn/hexagon_nn.h"
+#include "tensorflow/lite/delegates/hexagon/hexagon_nn/soc_model.h"
+
+namespace {
+
+void* LoadLibadsprpc() {
+  void* lib = dlopen("libadsprpc.so", RTLD_LAZY | RTLD_LOCAL);
+  if (lib) {
+    fprintf(stdout, "loaded libadsprpc.so\n");
+    return lib;
+  }
+
+  return nullptr;
+}
+
+void* LoadLibcdsprpc() {
+  void* lib = dlopen("libcdsprpc.so", RTLD_LAZY | RTLD_LOCAL);
+  if (lib) {
+    fprintf(stdout, "loaded libcdsprpc.so\n");
+    return lib;
+  }
+
+  return nullptr;
+}
+
+void* LoadDsprpc() {
+  SocSkelTable soc_model = tflite::delegates::getsoc_model();
+  // Use aDSP for 835 and 820, otherwise cDSP.
+  if (soc_model.mode == NON_DOMAINS ||
+      (soc_model.dsp_type != nullptr &&
+       strcmp(soc_model.dsp_type, "adsp") == 0)) {
+    return LoadLibadsprpc();
+  }
+  return LoadLibcdsprpc();
+}
+
+void* LoadFunction(const char* name) {
+  static void* libadsprpc = LoadDsprpc();
+  if (libadsprpc == nullptr) {
+    fprintf(stderr, "libadsprpc handle is NULL\n");
+    return nullptr;
+  }
+  auto* func_pt = dlsym(libadsprpc, name);
+  if (func_pt == nullptr) {
+    fprintf(stderr, "Func %s not available on this device (NULL).\n", name);
+  }
+  return func_pt;
+}
+
+using remote_handle_open_fn = decltype(remote_handle_open);
+using remote_handle64_open_fn = decltype(remote_handle64_open);
+using remote_handle_invoke_fn = decltype(remote_handle_invoke);
+using remote_handle64_invoke_fn = decltype(remote_handle64_invoke);
+using remote_handle_close_fn = decltype(remote_handle_close);
+using remote_handle64_close_fn = decltype(remote_handle64_close);
+using remote_mmap_fn = decltype(remote_mmap);
+using remote_mmap64_fn = decltype(remote_mmap64);
+using remote_munmap_fn = decltype(remote_munmap);
+using remote_munmap64_fn = decltype(remote_munmap64);
+using remote_register_buf_fn = decltype(remote_register_buf);
+using remote_set_mode_fn = decltype(remote_set_mode);
+using remote_handle_control_fn = decltype(remote_handle_control);
+
+struct AdsprpcInterface {
+  remote_handle_open_fn* handle_open_fn =
+      reinterpret_cast<remote_handle_open_fn*>(
+          LoadFunction("remote_handle_open"));
+  remote_handle64_open_fn* handle64_open_fn =
+      reinterpret_cast<remote_handle64_open_fn*>(
+          LoadFunction("remote_handle64_open"));
+  remote_handle_invoke_fn* handle_invoke_fn =
+      reinterpret_cast<remote_handle_invoke_fn*>(
+          LoadFunction("remote_handle_invoke"));
+  remote_handle64_invoke_fn* handle64_invoke_fn =
+      reinterpret_cast<remote_handle64_invoke_fn*>(
+          LoadFunction("remote_handle64_invoke"));
+  remote_handle_close_fn* handle_close_fn =
+      reinterpret_cast<remote_handle_close_fn*>(
+          LoadFunction("remote_handle_close"));
+  remote_handle64_close_fn* handle64_close_fn =
+      reinterpret_cast<remote_handle64_close_fn*>(
+          LoadFunction("remote_handle64_close"));
+  remote_handle_control_fn* handle_control_fn =
+      reinterpret_cast<remote_handle_control_fn*>(
+          LoadFunction("remote_handle_control"));
+  remote_mmap_fn* mmap_fn =
+      reinterpret_cast<remote_mmap_fn*>(LoadFunction("remote_mmap"));
+  remote_munmap_fn* munmap_fn =
+      reinterpret_cast<remote_munmap_fn*>(LoadFunction("remote_munmap"));
+  remote_mmap64_fn* mmap64_fn =
+      reinterpret_cast<remote_mmap64_fn*>(LoadFunction("remote_mmap64"));
+  remote_munmap64_fn* munmap64_fn =
+      reinterpret_cast<remote_munmap64_fn*>(LoadFunction("remote_munmap64"));
+  remote_register_buf_fn* register_buf_fn =
+      reinterpret_cast<remote_register_buf_fn*>(
+          LoadFunction("remote_register_buf"));
+  remote_set_mode_fn* set_mode_fn =
+      reinterpret_cast<remote_set_mode_fn*>(LoadFunction("remote_set_mode"));
+
+  // Returns singleton instance.
+  static AdsprpcInterface* Singleton() {
+    static AdsprpcInterface* instance = new AdsprpcInterface();
+    return instance;
+  }
+};
+
+}  // namespace
+
+extern "C" {
+int remote_handle_open(const char* name, remote_handle* h) {
+  return AdsprpcInterface::Singleton()->handle_open_fn
+             ? AdsprpcInterface::Singleton()->handle_open_fn(name, h)
+             : -1;
+}
+
+int remote_handle64_open(const char* name, remote_handle64* h) {
+  return AdsprpcInterface::Singleton()->handle64_open_fn
+             ? AdsprpcInterface::Singleton()->handle64_open_fn(name, h)
+             : -1;
+}
+
+int remote_handle_invoke(remote_handle h, uint32_t scalars, remote_arg* args) {
+  return AdsprpcInterface::Singleton()->handle_invoke_fn
+             ? AdsprpcInterface::Singleton()->handle_invoke_fn(h, scalars, args)
+             : -1;
+}
+
+int remote_handle64_invoke(remote_handle64 h, uint32_t scalars,
+                           remote_arg* args) {
+  return AdsprpcInterface::Singleton()->handle64_invoke_fn
+             ? AdsprpcInterface::Singleton()->handle64_invoke_fn(h, scalars,
+                                                                 args)
+             : -1;
+}
+
+int remote_handle_close(remote_handle h) {
+  return AdsprpcInterface::Singleton()->handle_close_fn
+             ? AdsprpcInterface::Singleton()->handle_close_fn(h)
+             : -1;
+}
+
+int remote_handle64_close(remote_handle64 h) {
+  return AdsprpcInterface::Singleton()->handle64_close_fn
+             ? AdsprpcInterface::Singleton()->handle64_close_fn(h)
+             : -1;
+}
+
+int remote_handle_control(uint32_t req, void* data, uint32_t datalen) {
+  return AdsprpcInterface::Singleton()->handle_control_fn
+             ? AdsprpcInterface::Singleton()->handle_control_fn(req, data,
+                                                                datalen)
+             : -1;
+}
+
+int remote_mmap(int fd, uint32_t flags, uint32_t addr, int size,
+                uint32_t* result) {
+  return AdsprpcInterface::Singleton()->mmap_fn
+             ? AdsprpcInterface::Singleton()->mmap_fn(fd, flags, addr, size,
+                                                      result)
+             : -1;
+}
+
+int remote_mmap64(int fd, uint32_t flags, uintptr_t vaddrin, int64_t size,
+                  uintptr_t* vaddrout) {
+  return AdsprpcInterface::Singleton()->mmap64_fn
+             ? AdsprpcInterface::Singleton()->mmap64_fn(fd, flags, vaddrin,
+                                                        size, vaddrout)
+             : -1;
+}
+
+int remote_munmap(uint32_t addr, int size) {
+  return AdsprpcInterface::Singleton()->munmap_fn
+             ? AdsprpcInterface::Singleton()->munmap_fn(addr, size)
+             : -1;
+}
+
+int remote_munmap64(uintptr_t vaddrout, int64_t size) {
+  return AdsprpcInterface::Singleton()->munmap64_fn
+             ? AdsprpcInterface::Singleton()->munmap64_fn(vaddrout, size)
+             : -1;
+}
+
+void remote_register_buf(void* buf, int size, int fd) {
+  if (AdsprpcInterface::Singleton()->register_buf_fn) {
+    AdsprpcInterface::Singleton()->register_buf_fn(buf, size, fd);
+  }
+}
+
+int remote_set_mode(uint32_t mode) {
+  return AdsprpcInterface::Singleton()->set_mode_fn
+             ? AdsprpcInterface::Singleton()->set_mode_fn(mode)
+             : -1;
+}
+
+}  // extern "C"
diff --git a/tensorflow/lite/delegates/hexagon/hexagon_nn/hexagon_nn_init.cc b/tensorflow/lite/delegates/hexagon/hexagon_nn/hexagon_nn_init.cc
new file mode 100644
index 00000000000000..2f31f01e89c210
--- /dev/null
+++ b/tensorflow/lite/delegates/hexagon/hexagon_nn/hexagon_nn_init.cc
@@ -0,0 +1,49 @@
+/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/delegates/hexagon/hexagon_nn/hexagon_nn_init.h"
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include "hexagon/remote.h"  // NOLINT
+#include "hexagon/rpcmem.h"  // NOLINT
+#include "tensorflow/lite/delegates/hexagon/hexagon_nn/soc_model.h"
+
+extern "C" {
+
+// Version 1.20
+static const int kHexagonNNVersion = 137729;
+#pragma weak remote_handle_control  // Declare it as a weak symbol
+void hexagon_nn_global_init() {
+  rpcmem_init();
+  // Non-domains QoS invocation
+  struct remote_rpc_control_latency data;
+  data.enable = RPC_PM_QOS;
+  if (remote_handle_control) {  // Check if API is available before invoking
+    remote_handle_control(DSPRPC_CONTROL_LATENCY, (void*)&data, sizeof(data));
+  }
+}
+
+void hexagon_nn_global_teardown() { rpcmem_deinit(); }
+
+bool hexagon_nn_is_device_supported() {
+  return tflite::delegates::getsoc_model().mode != UNSPECIFIED_MODE;
+}
+
+int hexagon_nn_hexagon_interface_version() { return kHexagonNNVersion; }
+
+}
diff --git a/tensorflow/lite/delegates/hexagon/hexagon_nn/soc_model.cc b/tensorflow/lite/delegates/hexagon/hexagon_nn/soc_model.cc
new file mode 100644
index 00000000000000..2969843a5709b0
--- /dev/null
+++ b/tensorflow/lite/delegates/hexagon/hexagon_nn/soc_model.cc
@@ -0,0 +1,63 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/delegates/hexagon/hexagon_nn/soc_model.h"
+
+#include <cstdlib>
+
+namespace tflite {
+namespace delegates {
+// Implementation below is similar to the one inside the Hexagon SDK for
+// fetching the SoC information.
+// TODO(b/144536839): Look in sharing the code with Hexagon SDK if possible.
+
+int get_soc_id(int* soc_id) {
+  int fd;
+  if (!access("/sys/devices/soc0/soc_id", F_OK)) {
+    fd = open("/sys/devices/soc0/soc_id", O_RDONLY);
+  } else {
+    fd = open("/sys/devices/system/soc/soc0/id", O_RDONLY);
+  }
+  if (fd == -1) {
+    return -1;
+  }
+
+  char raw_buf[SOC_ID_BUFFER_LENGTH];
+  const int bytes_read = read(fd, raw_buf, SOC_ID_BUFFER_LENGTH - 1);
+  // read returns -1 on failure, so check and return if failed.
+  if (bytes_read == -1) {
+    return -1;  // failure
+  }
+  raw_buf[SOC_ID_BUFFER_LENGTH - 1] = 0;
+  *soc_id = atoi(raw_buf);
+  close(fd);
+
+  return 0;
+}
+
+SocSkelTable getsoc_model() {
+  int soc_id;
+  get_soc_id(&soc_id);
+
+  int i = 0;
+  for (i = 0; socSkelInfo[i].soc_id != 0; i++) {
+    if (socSkelInfo[i].soc_id == soc_id) {
+      return socSkelInfo[i];
+    }
+  }
+
+  return socSkelInfo[i];
+}
+}  // namespace delegates
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/hexagon/hexagon_nn/soc_model.h b/tensorflow/lite/delegates/hexagon/hexagon_nn/soc_model.h
new file mode 100644
index 00000000000000..5486dad94da257
--- /dev/null
+++ b/tensorflow/lite/delegates/hexagon/hexagon_nn/soc_model.h
@@ -0,0 +1,43 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_DELEGATES_HEXAGON_HEXAGON_NN_SOC_MODEL_H_
+#define TENSORFLOW_LITE_DELEGATES_HEXAGON_HEXAGON_NN_SOC_MODEL_H_
+
+#include <dlfcn.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <cstdio>
+
+#include "hexagon/remote.h"
+#include "hexagon/remote64.h"
+#include "hexagon/hexnn_soc_defines.h"
+
+namespace tflite {
+namespace delegates {
+#define SOC_ID_BUFFER_LENGTH 5
+#define URI_BUFFER_LENGTH 100
+
+// Returns QC SoC ID of the device.
+int get_soc_id(int* soc_id);
+
+// Returns structure that has SoC information.
+SocSkelTable getsoc_model();
+}  // namespace delegates
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_HEXAGON_HEXAGON_NN_SOC_MODEL_H_
diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
index 66b439ce819c5d..aeff8a9996d4d4 100644
--- a/tensorflow/lite/delegates/nnapi/BUILD
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -28,6 +28,7 @@ cc_library(
     hdrs = [
         "nnapi_delegate.h",
         "nnapi_delegate_kernel.h",
+        "nnapi_delegate_plugin.h",
     ],
     compatible_with = get_compatible_with_portable(),
     deps = [
@@ -56,6 +57,7 @@ cc_library(
     hdrs = [
         "nnapi_delegate.h",
         "nnapi_delegate_kernel.h",
+        "nnapi_delegate_plugin.h",
     ],
     compatible_with = get_compatible_with_portable(),
     deps = [
@@ -86,6 +88,7 @@ cc_library(
     hdrs = [
         "nnapi_delegate.h",
         "nnapi_delegate_kernel.h",
+        "nnapi_delegate_plugin.h",
     ],
     copts = ["-DNNAPI_VERBOSE_VALIDATION"],
     deps = [
@@ -165,6 +168,7 @@ cc_test(
         ":nnapi_delegate_mock_test",
         "//tensorflow/lite:framework",
         "//tensorflow/lite/c:common",
+        "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/kernels:deprecated_backends",
         "//tensorflow/lite/kernels:test_util",
         "//tensorflow/lite/nnapi:nnapi_implementation",
diff --git a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc
index 1d42e9ed1a1657..3db24c73548c03 100644
--- a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc
+++ b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc
@@ -281,6 +281,9 @@ Parameterized/LstmOpTest.+/7,29
 MaxMinOpTest/.+nt8Test,29
 MaximumOpTest/.+,29
 
+# mirror_pad_test
+MirrorPadTest/.+,1000007
+
 # mul_test
 FloatMulOpTest/.+
 
@@ -391,6 +394,11 @@ ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+HalfPixelCenters.*/0,30
 // Only models with constant size tensor are accelerated
 ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest/.+/0,29
 
+# reverse_test
+-ReverseOpTest/Int64.+
+-ReverseOpTest/Int16.+
+ReverseOpTest/.+,1000007
+
 # select_test
 -SelectOpTest/SelectBool
 -SelectOpTest.SelectInt16
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index ad2ff883a35a6d..5c31a27a129d33 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -757,6 +757,21 @@ TfLiteStatus GetTargetDevices(TfLiteContext* context, TfLiteDelegate* delegate,
   return kTfLiteOk;
 }
 
+// The context to be used with NnapiMappingUtilCInterface.
+class NnapiMappingContext {
+ public:
+  // Next index of ann tensor
+  int next_ann_tensor_index_ = 0;
+  // Mapping from lite tensor index.
+  std::vector<int> lite_tensor_to_ann_tensor_;
+  // Mapping from lite index to a type which tensor must be converted to during
+  // the copying of the data to the memory allocated for NN API. kTfLiteNoType
+  // means no conversion is needed.
+  std::vector<int> index_to_type_conversion_;
+  // Mapping from lite node index.
+  std::vector<int> nnapi_to_tflite_op_mapping_;
+};
+
 }  // namespace
 
 namespace delegate {
@@ -840,19 +855,17 @@ class DequantizeMapping {
 class NNAPIOpBuilder {
  public:
   NNAPIOpBuilder(const NnApi* nnapi, TfLiteContext* context,
-                 OperandMapping* tensor_mapping,
+                 NnapiMappingUtilCInterface* mapping_util,
                  DequantizeMapping* dequantize_mapping,
                  std::map<const MMAPAllocation*, ANeuralNetworksMemory*>*
                      allocation_mapping,
-                 std::vector<int>* nnapi_to_tflite_op_mapping,
                  ANeuralNetworksModel* nn_model, int* nnapi_errno,
                  bool allow_dynamic_dimensions)
       : nnapi_(nnapi),
         context_(context),
-        operand_mapping_(tensor_mapping),
+        mapping_util_(mapping_util),
         dequantize_mapping_(dequantize_mapping),
         allocation_memory_mapping_(allocation_mapping),
-        nnapi_to_tflite_op_mapping_(nnapi_to_tflite_op_mapping),
         nn_model_(nn_model),
         nnapi_errno_(nnapi_errno),
         allow_dynamic_dimensions_(allow_dynamic_dimensions) {}
@@ -1122,7 +1135,7 @@ class NNAPIOpBuilder {
         nnapi_->ANeuralNetworksModel_addOperation(
             nn_model_, type, input_count, inputs, output_count, outputs),
         "adding operation", nnapi_errno_);
-    nnapi_to_tflite_op_mapping_->push_back(lite_node_index);
+    mapping_util_->AddNnapiToTfliteOpMapping(mapping_util_, lite_node_index);
     return kTfLiteOk;
   }
 
@@ -1132,7 +1145,7 @@ class NNAPIOpBuilder {
   TfLiteStatus AddDequantize(int nn_input_index, int lite_tensor_index,
                              TfLiteType dequantized_type, int lite_node_index) {
     const int ann_index =
-        operand_mapping_->lite_index_to_ann(lite_tensor_index);
+        mapping_util_->TfLiteIndexToNnIndex(mapping_util_, lite_tensor_index);
     int dequantized_ann_index =
         dequantize_mapping_->DequantizedAnnIndex(ann_index, dequantized_type);
 
@@ -1148,7 +1161,8 @@ class NNAPIOpBuilder {
           context_,
           nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type),
           "adding operand", nnapi_errno_);
-      dequantized_ann_index = operand_mapping_->add_new_non_tensor_operand();
+      dequantized_ann_index =
+          mapping_util_->AddNewNonTensorOperand(mapping_util_);
 
       // Add Dequantize operation.
       const uint32_t dequantize_input[1] = {static_cast<uint32_t>(ann_index)};
@@ -1473,13 +1487,15 @@ class NNAPIOpBuilder {
         context_,
         nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type),
         "adding operand", tensor, nnapi_errno_);
-    int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index);
+    int ann_tensor_index =
+        mapping_util_->TfLiteIndexToNnIndex(mapping_util_, tensor_index);
     if (ann_tensor_index != -1) {
       augmented_inputs_.push_back(ann_tensor_index);
       return kTfLiteOk;
     }
     // Allocate a new tensor index
-    ann_tensor_index = operand_mapping_->add_new_ann_tensor_index(tensor_index);
+    ann_tensor_index =
+        mapping_util_->AddNewNnTensorIndex(mapping_util_, tensor_index);
     augmented_inputs_.push_back(ann_tensor_index);
 
     const TfLiteType tensor_type = tensor->type;
@@ -1487,7 +1503,8 @@ class NNAPIOpBuilder {
     TF_LITE_ENSURE_OK(context_, GetEquivalentToANNType(context_, nn_type,
                                                        &nn_type_equivalent));
     if (tensor_type != nn_type_equivalent) {
-      operand_mapping_->add_type_conversion(tensor_index, nn_type_equivalent);
+      mapping_util_->AddTypeConversion(mapping_util_, tensor_index,
+                                       nn_type_equivalent);
     }
     return kTfLiteOk;
   }
@@ -1525,7 +1542,7 @@ class NNAPIOpBuilder {
                                             quant_params.zero_point};
 
     const int ann_tensor_index =
-        operand_mapping_->add_delegate_generated_input_ann_tensors_operand();
+        mapping_util_->AddDelegateGeneratedInputAnnTensorOperand(mapping_util_);
 
     RETURN_TFLITE_ERROR_IF_NN_ERROR(
         context_,
@@ -1623,7 +1640,7 @@ class NNAPIOpBuilder {
         context_,
         nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type),
         "adding operand", nnapi_errno_);
-    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    const int ann_index = mapping_util_->AddNewNonTensorOperand(mapping_util_);
     RETURN_TFLITE_ERROR_IF_NN_ERROR(
         context_,
         nnapi_->ANeuralNetworksModel_setOperandValue(nn_model_, ann_index,
@@ -1648,7 +1665,7 @@ class NNAPIOpBuilder {
         nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type),
         "adding operand", nnapi_errno_);
 
-    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    const int ann_index = mapping_util_->AddNewNonTensorOperand(mapping_util_);
     RETURN_TFLITE_ERROR_IF_NN_ERROR(
         context_,
         nnapi_->ANeuralNetworksModel_setOperandValue(
@@ -1689,7 +1706,7 @@ class NNAPIOpBuilder {
         context_,
         nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type),
         "adding operand", nnapi_errno_);
-    const int ann_index = operand_mapping_->add_new_non_tensor_operand();
+    const int ann_index = mapping_util_->AddNewNonTensorOperand(mapping_util_);
     augmented_outputs_.push_back(ann_index);
     if (ann_index_out) *ann_index_out = ann_index;
     return kTfLiteOk;
@@ -1712,13 +1729,15 @@ class NNAPIOpBuilder {
     const bool need_half2float_conversion =
         tensor_flags & NN_TENSOR_FLAG_HALF_TO_FLOAT_CONVERSION;
 
-    int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index);
+    int ann_tensor_index =
+        mapping_util_->TfLiteIndexToNnIndex(mapping_util_, tensor_index);
     if (ann_tensor_index != -1) {
       indices->push_back(ann_tensor_index);
       return kTfLiteOk;
     }
     // Allocate a new tensor index
-    ann_tensor_index = operand_mapping_->add_new_ann_tensor_index(tensor_index);
+    ann_tensor_index =
+        mapping_util_->AddNewNnTensorIndex(mapping_util_, tensor_index);
 
     // Parameters needed for new type.
     int32_t nn_type = 0;
@@ -1745,7 +1764,8 @@ class NNAPIOpBuilder {
         nn_type = ANEURALNETWORKS_TENSOR_FLOAT16;
         if (need_half2float_conversion) {
           nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
-          operand_mapping_->add_type_conversion(tensor_index, kTfLiteFloat32);
+          mapping_util_->AddTypeConversion(mapping_util_, tensor_index,
+                                           kTfLiteFloat32);
         }
         break;
       case kTfLiteUInt8:
@@ -1794,7 +1814,8 @@ class NNAPIOpBuilder {
         if (nn_type != ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL) {
           if (need_int8_conversion) {
             zeroPoint += 128;
-            operand_mapping_->add_type_conversion(tensor_index, kTfLiteUInt8);
+            mapping_util_->AddTypeConversion(mapping_util_, tensor_index,
+                                             kTfLiteUInt8);
           }
           if (scale == 0) {
             // QUANT8 tensors with zero scale are not valid in NNAPI.
@@ -1966,7 +1987,7 @@ class NNAPIOpBuilder {
   TfLiteContext* const context_;
 
   // Tracks relationship between indices.
-  OperandMapping* const operand_mapping_;
+  NnapiMappingUtilCInterface* const mapping_util_;
 
   // Keeps mapping of ANN quantized tensor and float data type to equivalent
   // dequantized ANN tensor. For example, tensor #4 (UINT8) + FLOAT32 could map
@@ -1977,10 +1998,6 @@ class NNAPIOpBuilder {
   std::map<const MMAPAllocation*, ANeuralNetworksMemory*>* const
       allocation_memory_mapping_;
 
-  // Tracks for every operation in the NNAPI model the source TfLite model
-  // node index.
-  std::vector<int>* const nnapi_to_tflite_op_mapping_;
-
   // The NNAPI model.
   ANeuralNetworksModel* const nn_model_;
 
@@ -2159,11 +2176,18 @@ bool ExpectIsRestrictedScalesCompliant(const TfLiteContext* context,
 // when called. You can use this function to see if a node is supported
 // (i.e. if the returned MappingFn is null, then the node is not supported).
 bool NNAPIDelegateKernel::Validate(
-    const TfLiteContext* context, int builtin_code, int version,
+    const TfLiteContext* context, const TfLiteRegistration* registration,
     int android_sdk_version, const TfLiteNode* node,
-    bool is_accelerator_specified,
+    bool is_accelerator_specified, NnapiDelegateVendorPlugin* vendor_plugin,
     std::vector<NNAPIValidationFailure>* map_failures) {
   OpValidationContext val_ctx{true, map_failures};
+  if (vendor_plugin) {
+    if (vendor_plugin->ValidateNode(context, registration, node)) {
+      return true;
+    }
+  }
+  auto builtin_code = registration->builtin_code;
+  auto version = registration->version;
   switch (builtin_code) {
     case kTfLiteBuiltinAdd: {
       ExpectMaxOpVersion(version, 2, &val_ctx);
@@ -2735,13 +2759,22 @@ bool NNAPIDelegateKernel::Validate(
     case kTfLiteBuiltinAbs:
     case kTfLiteBuiltinExp:
     case kTfLiteBuiltinLog:
-    case kTfLiteBuiltinRsqrt:
     case kTfLiteBuiltinPow: {
       ExpectOpVersion(version, 1, &val_ctx);
       ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI12,
                                  &val_ctx);
       ExpectIsFloatOperator(context, node, &val_ctx);
     } break;
+    case kTfLiteBuiltinRsqrt: {
+      ExpectOpVersion(version, 2, &val_ctx);
+      ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI12,
+                                 &val_ctx);
+      if (android_sdk_version < kNNAPIRuntimeFeatureLevel7) {
+        ExpectIsFloatOperator(context, node, &val_ctx);
+      } else {
+        ExpectIsFloatOrQuant8Operator(context, node, &val_ctx);
+      }
+    } break;
     case kTfLiteBuiltinSlice: {
       ExpectMaxOpVersion(version, 2, &val_ctx);
       ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI12,
@@ -3365,6 +3398,31 @@ bool NNAPIDelegateKernel::Validate(
              NNAPIValidationFailureType::kUnsupportedInputType,
              "NNAPI does not support broadcast batch matmul", &val_ctx);
     } break;
+    case kTfLiteBuiltinMirrorPad: {
+      ExpectMaxOpVersion(version, 2, &val_ctx);
+      ExpectMinAndroidSdkVersion(android_sdk_version,
+                                 kNNAPIRuntimeFeatureLevel7, &val_ctx);
+      ExpectIsFloatQuant8OrInt32Operator(context, node, &val_ctx);
+
+      const TfLiteIntArrayView input_shape(
+          context->tensors[node->inputs->data[0]].dims);
+      Expect(!HasZeroes(input_shape),
+             NNAPIValidationFailureType::kUnsupportedOperandValue,
+             "NN API pad ops do not support input tensors with no elements",
+             &val_ctx);
+      Expect(node->inputs->size == 2,
+             NNAPIValidationFailureType::kUnsupportedOperatorVariant,
+             "Expecting 2 inputs", &val_ctx);
+    } break;
+    case kTfLiteBuiltinReverseV2: {
+      ExpectMaxOpVersion(version, 3, &val_ctx);
+      ExpectMinAndroidSdkVersion(android_sdk_version,
+                                 kNNAPIRuntimeFeatureLevel7, &val_ctx);
+      ExpectIsFloatQuant8OrInt32Operator(context, node, &val_ctx);
+      Expect(node->inputs->size == 2,
+             NNAPIValidationFailureType::kUnsupportedOperatorVariant,
+             "Expecting 2 inputs", &val_ctx);
+    } break;
     default:
       // All other operators are not mapped.
       AddValidationFailure(NNAPIValidationFailureType::kUnsupportedOperator,
@@ -3376,7 +3434,8 @@ bool NNAPIDelegateKernel::Validate(
 TfLiteStatus NNAPIDelegateKernel::Map(
     TfLiteContext* context, int builtin_code, int version,
     int android_sdk_version, const NNAPIOpMappingArgs& mapping_args,
-    ANeuralNetworksOperationType* nn_op_type) {
+    ANeuralNetworksOperationType* nn_op_type,
+    NnapiDelegateVendorPlugin* vendor_plugin) {
   auto add_zero_bias = [mapping_args](int input_id, int filter_id,
                                       int num_elements) -> void {
     // NNAPI requires a bias tensor, so we allocate a new tensor to fill
@@ -4240,6 +4299,15 @@ TfLiteStatus NNAPIDelegateKernel::Map(
     case kTfLiteBuiltinPack: {
       *nn_op_type = ANEURALNETWORKS_PACK;
     } break;
+    case kTfLiteBuiltinMirrorPad: {
+      auto builtin = reinterpret_cast<TfLiteMirrorPaddingParams*>(
+          mapping_args.node->builtin_data);
+      mapping_args.builder->AddScalarInt32Operand(builtin->mode);
+      *nn_op_type = ANEURALNETWORKS_MIRROR_PAD;
+    } break;
+    case kTfLiteBuiltinReverseV2: {
+      *nn_op_type = ANEURALNETWORKS_REVERSE;
+    } break;
     default:
       // All other operators are not mapped.
       return kTfLiteError;
@@ -4401,6 +4469,10 @@ TfLiteStatus NNAPIDelegateKernel::Prepare(TfLiteContext* context,
             compilation, delegate_options.execution_priority),
         "setting compilation priority", nnapi_errno);
   }
+  if (delegate_options.vendor_compilation_hints && vendor_plugin_) {
+    TF_LITE_ENSURE_STATUS(vendor_plugin_->ConfigureCompilationHints(
+        delegate_options.vendor_compilation_hints, compilation));
+  }
   const int finish_result =
       nnapi_->ANeuralNetworksCompilation_finish(compilation);
   if (finish_result != ANEURALNETWORKS_NO_ERROR) {
@@ -4444,7 +4516,11 @@ TfLiteStatus NNAPIDelegateKernel::GetOperationsSupportedByTargetNnApiDevices(
     return kTfLiteError;
   }
 
-  const auto nnapi_model_size = nnapi_to_tflite_op_mapping_.size();
+  // Get the number of NNAPI operations mapped.
+  NnapiMappingContext* mapping_context =
+      reinterpret_cast<NnapiMappingContext*>(mapping_util_->context);
+  const int nnapi_model_size =
+      mapping_context->nnapi_to_tflite_op_mapping_.size();
 
   // Determine the list of operations the device actually supports
   std::unique_ptr<bool[]> nnapi_ops_support_flags(new bool[nnapi_model_size]);
@@ -4464,7 +4540,8 @@ TfLiteStatus NNAPIDelegateKernel::GetOperationsSupportedByTargetNnApiDevices(
                 });
   for (int nnapi_op_index = 0; nnapi_op_index < nnapi_model_size;
        nnapi_op_index++) {
-    const auto tflite_op_index = nnapi_to_tflite_op_mapping_[nnapi_op_index];
+    const auto tflite_op_index =
+        mapping_context->nnapi_to_tflite_op_mapping_[nnapi_op_index];
     tflite_ops_support_status[tflite_op_index] &=
         nnapi_ops_support_flags[nnapi_op_index];
     if (!tflite_ops_support_status[tflite_op_index]) {
@@ -4531,6 +4608,10 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context,
                                                        /*reusable=*/true),
           "making execution reusable", nnapi_errno);
     }
+    if (delegate_options.vendor_execution_hints && vendor_plugin_) {
+      TF_LITE_ENSURE_STATUS(vendor_plugin_->ConfigureExecutionHints(
+          delegate_options.vendor_execution_hints, execution));
+    }
     nn_execution_.reset(execution);
 
     // Allow padding bytes for execution inputs & outputs if applicable.
@@ -4572,12 +4653,13 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context,
             context->tensors[i].allocation_type != kTfLiteMmapRo &&
             // The delegate might not have mapped this input (this can
             // happen if one tensor is split in several ones)
-            operand_mapping_.lite_index_to_ann(i) != -1) {
+            mapping_util_->TfLiteIndexToNnIndex(mapping_util_.get(), i) != -1) {
           if (context->tensors[i].buffer_handle != kTfLiteNullBufferHandle) {
             continue;
           }
           const TfLiteType nn_type_conversion =
-              operand_mapping_.lite_index_to_ann_type_conversion(i);
+              mapping_util_->TfLiteIndexToNnTypeConversion(mapping_util_.get(),
+                                                           i);
           int tensor_size = 0;
           if (nn_type_conversion == kTfLiteNoType) {
             tensor_size = context->tensors[i].bytes;
@@ -4627,8 +4709,8 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context,
     ANeuralNetworksOperandType* input_nn_operand_type_ptr = nullptr;
     TfLiteTensor* tensor = &context->tensors[absolute_input_index];
     TfLiteType ann_type_equivalent =
-        operand_mapping_.lite_index_to_ann_type_conversion(
-            absolute_input_index);
+        mapping_util_->TfLiteIndexToNnTypeConversion(mapping_util_.get(),
+                                                     absolute_input_index);
     if (delegate_options.allow_dynamic_dimensions &&
         HasUnspecifiedDimension(tensor)) {
       input_nn_operand_type = ConvertTensorTypeToNNType(
@@ -4718,8 +4800,8 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context,
               "associating NNAPI execution input with a memory object", tensor,
               nnapi_errno);
         }
-      } else if (operand_mapping_.lite_index_to_ann(absolute_input_index) !=
-                 -1) {
+      } else if (mapping_util_->TfLiteIndexToNnIndex(
+                     mapping_util_.get(), absolute_input_index) != -1) {
         // copy data to pre-allocated shared memory.
         memcpy(nn_input_memory_->get_data_ptr() + input_offset,
                tensor->data.raw, tensor->bytes);
@@ -4747,7 +4829,8 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context,
   for (auto output_index : TfLiteIntArrayView(node->outputs)) {
     // If the NNAPI implementation doesn't have some of the outputs
     // they are left unmapped and we should not try to read their value here
-    if (operand_mapping_.lite_index_to_ann(output_index) == -1) {
+    if (mapping_util_->TfLiteIndexToNnIndex(mapping_util_.get(),
+                                            output_index) == -1) {
       continue;
     }
     ANeuralNetworksOperandType output_nn_operand_type;
@@ -4756,7 +4839,8 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context,
     if (delegate_options.allow_dynamic_dimensions &&
         HasUnspecifiedDimension(tensor)) {
       TfLiteType ann_type_equivalent =
-          operand_mapping_.lite_index_to_ann_type_conversion(output_index);
+          mapping_util_->TfLiteIndexToNnTypeConversion(mapping_util_.get(),
+                                                       output_index);
       output_nn_operand_type = ConvertTensorTypeToNNType(
           tensor, ann_type_equivalent, use_int8_asymm_signed);
       output_nn_operand_type_ptr = &output_nn_operand_type;
@@ -4848,7 +4932,8 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context,
       continue;
     }
     TfLiteType ann_type_equivalent =
-        operand_mapping_.lite_index_to_ann_type_conversion(output_index);
+        mapping_util_->TfLiteIndexToNnTypeConversion(mapping_util_.get(),
+                                                     output_index);
     if (tensor->type == kTfLiteInt8 && ann_type_equivalent == kTfLiteUInt8) {
       // Explicitly convert uint8 values to int8 values.
       uint8_t* output_ptr = reinterpret_cast<uint8_t*>(
@@ -5025,15 +5110,137 @@ TfLiteStatus NNAPIDelegateKernel::DensifyAndDequantizeConstTensor(
   return kTfLiteOk;
 }
 
+TfLiteIntArray* ResizeTfLiteIntArray(TfLiteIntArray* old_array, int new_size,
+                                     int init_value) {
+  TfLiteIntArray* ret = TfLiteIntArrayCreate(new_size);
+  if (ret) {
+    int size_to_copy = 0;
+    if (old_array) {
+      size_to_copy = new_size > old_array->size ? old_array->size : new_size;
+      memcpy(ret->data, old_array->data, size_to_copy * sizeof(int));
+    }
+    for (int i = size_to_copy; i < ret->size; i++) {
+      ret->data[i] = init_value;
+    }
+  }
+  TfLiteIntArrayFree(old_array);
+  return ret;
+}
+
+void NNFreeMappingUtil::operator()(NnapiMappingUtilCInterface* mapping_util) {
+  NnapiMappingContext* mapping_context =
+      reinterpret_cast<NnapiMappingContext*>(mapping_util->context);
+  delete (mapping_context);
+  mapping_util->context = nullptr;
+  free(mapping_util);
+}
+
+class NnapiMappingUtilCInterfaceImpl {
+ public:
+  static int TfLiteIndexToNnIndex(NnapiMappingUtilCInterface* mapping,
+                                  int index) {
+    NnapiMappingContext* mapping_context =
+        reinterpret_cast<NnapiMappingContext*>(mapping->context);
+    const size_t max_size = mapping_context->lite_tensor_to_ann_tensor_.size();
+    if (index >= 0 && index < max_size)
+      return mapping_context->lite_tensor_to_ann_tensor_[index];
+    else
+      return -1;
+  }
+
+  static int AddNewNonTensorOperand(NnapiMappingUtilCInterface* mapping) {
+    NnapiMappingContext* mapping_context =
+        reinterpret_cast<NnapiMappingContext*>(mapping->context);
+    return mapping_context->next_ann_tensor_index_++;
+  }
+
+  static int AddDelegateGeneratedInputAnnTensorOperand(
+      NnapiMappingUtilCInterface* mapping) {
+    NnapiMappingContext* mapping_context =
+        reinterpret_cast<NnapiMappingContext*>(mapping->context);
+    return mapping_context->next_ann_tensor_index_++;
+  }
+
+  static int AddNewNnTensorIndex(NnapiMappingUtilCInterface* mapping,
+                                 int tflite_index) {
+    NnapiMappingContext* mapping_context =
+        reinterpret_cast<NnapiMappingContext*>(mapping->context);
+    const size_t current_size =
+        mapping_context->lite_tensor_to_ann_tensor_.size();
+    if (tflite_index >= current_size) {
+      mapping_context->lite_tensor_to_ann_tensor_.resize(tflite_index + 1, -1);
+    }
+    const int new_tensor_index = mapping_context->next_ann_tensor_index_++;
+    mapping_context->lite_tensor_to_ann_tensor_[tflite_index] =
+        new_tensor_index;
+    return new_tensor_index;
+  }
+
+  static TfLiteType TfLiteIndexToNnTypeConversion(
+      NnapiMappingUtilCInterface* mapping, int index) {
+    NnapiMappingContext* mapping_context =
+        reinterpret_cast<NnapiMappingContext*>(mapping->context);
+    const size_t max_size = mapping_context->index_to_type_conversion_.size();
+    if (index >= 0 && index < max_size)
+      return static_cast<TfLiteType>(
+          mapping_context->index_to_type_conversion_[index]);
+    else
+      return kTfLiteNoType;
+  }
+
+  static void AddTypeConversion(NnapiMappingUtilCInterface* mapping,
+                                int tflite_index, TfLiteType tflite_type) {
+    NnapiMappingContext* mapping_context =
+        reinterpret_cast<NnapiMappingContext*>(mapping->context);
+    const size_t current_size =
+        mapping_context->index_to_type_conversion_.size();
+    if (tflite_index >= current_size) {
+      mapping_context->index_to_type_conversion_.resize(tflite_index + 1,
+                                                        kTfLiteNoType);
+    }
+    mapping_context->index_to_type_conversion_[tflite_index] = tflite_type;
+  }
+
+  static void AddNnapiToTfliteOpMapping(NnapiMappingUtilCInterface* mapping,
+                                        int tflite_node_index) {
+    NnapiMappingContext* mapping_context =
+        reinterpret_cast<NnapiMappingContext*>(mapping->context);
+    mapping_context->nnapi_to_tflite_op_mapping_.push_back(tflite_node_index);
+  }
+};
+
+NnapiMappingUtilCInterface*
+NNAPIDelegateKernel::NnapiMappingUtilCInterfaceCreate() {
+  NnapiMappingUtilCInterface* mapping =
+      static_cast<NnapiMappingUtilCInterface*>(
+          malloc(sizeof(NnapiMappingUtilCInterface)));
+  mapping->context = new NnapiMappingContext();
+  mapping->TfLiteIndexToNnIndex =
+      NnapiMappingUtilCInterfaceImpl::TfLiteIndexToNnIndex;
+  mapping->AddNewNonTensorOperand =
+      NnapiMappingUtilCInterfaceImpl::AddNewNonTensorOperand;
+  mapping->AddDelegateGeneratedInputAnnTensorOperand =
+      NnapiMappingUtilCInterfaceImpl::AddDelegateGeneratedInputAnnTensorOperand;
+  mapping->AddNewNnTensorIndex =
+      NnapiMappingUtilCInterfaceImpl::AddNewNnTensorIndex;
+  mapping->TfLiteIndexToNnTypeConversion =
+      NnapiMappingUtilCInterfaceImpl::TfLiteIndexToNnTypeConversion;
+  mapping->AddTypeConversion =
+      NnapiMappingUtilCInterfaceImpl::AddTypeConversion;
+  mapping->AddNnapiToTfliteOpMapping =
+      NnapiMappingUtilCInterfaceImpl::AddNnapiToTfliteOpMapping;
+  return mapping;
+}
+
 TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(
     TfLiteContext* context, int* nnapi_errno, bool allow_dynamic_dimensions) {
   DequantizeMapping dequantize_mapping;
   // The operand builder allows creating a single op. It is created outside
   // the for loop to avoid reallocating the vectors.
-  NNAPIOpBuilder builder(nnapi_, context, &operand_mapping_,
+  NNAPIOpBuilder builder(nnapi_, context, mapping_util_.get(),
                          &dequantize_mapping, &allocation_memory_mapping_,
-                         &nnapi_to_tflite_op_mapping_, nn_model_.get(),
-                         nnapi_errno, allow_dynamic_dimensions);
+                         nn_model_.get(), nnapi_errno,
+                         allow_dynamic_dimensions);
   // If we have target accelerators the target SDK version might be
   // different than the current android version.
   target_feature_level_ = nnapi_->nnapi_runtime_feature_level;
@@ -5075,6 +5282,12 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(
       continue;
     }
 
+    // Use vendor plugin to map the node if needed.
+    if (vendor_plugin_ && vendor_plugin_->ValidateNode(context, reg, node)) {
+      TF_LITE_ENSURE_STATUS(vendor_plugin_->MapNode(
+          context, node, node_index, mapping_util_.get(), nn_model_.get()));
+      continue;
+    }
     // Delegate PACK by lowering it into CONCAT + RESHAPE.
     if (reg->builtin_code == kTfLiteBuiltinPack &&
         target_feature_level_ < kNNAPIRuntimeFeatureLevel6) {
@@ -5696,13 +5909,14 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph(
         context->tensors[i].allocation_type != kTfLiteMmapRo &&
         // The delegate might not have mapped this input (this can
         // happen if one tensor is split in several ones)
-        operand_mapping_.lite_index_to_ann(i) != -1) {
-      inputs.push_back(operand_mapping_.lite_index_to_ann(i));
+        mapping_util_->TfLiteIndexToNnIndex(mapping_util_.get(), i) != -1) {
+      inputs.push_back(
+          mapping_util_->TfLiteIndexToNnIndex(mapping_util_.get(), i));
       if (context->tensors[i].buffer_handle != kTfLiteNullBufferHandle) {
         continue;
       }
       const TfLiteType nn_type_conversion =
-          operand_mapping_.lite_index_to_ann_type_conversion(i);
+          mapping_util_->TfLiteIndexToNnTypeConversion(mapping_util_.get(), i);
       int tensor_size = 0;
       if (nn_type_conversion == kTfLiteNoType) {
         tensor_size = context->tensors[i].bytes;
@@ -5719,7 +5933,8 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph(
 
   size_t total_output_byte_size = 0;
   for (int i : TfLiteIntArrayView(output_tensors)) {
-    const int output_tensor_ann_index = operand_mapping_.lite_index_to_ann(i);
+    const int output_tensor_ann_index =
+        mapping_util_->TfLiteIndexToNnIndex(mapping_util_.get(), i);
     // Unmapped outputs are not added
     if (output_tensor_ann_index != -1) {
       outputs.push_back(output_tensor_ann_index);
@@ -5838,6 +6053,10 @@ void StatefulNnApiDelegate::StatefulNnApiDelegateConstructorImpl(
     delegate_data_.allow_dynamic_dimensions = options.allow_dynamic_dimensions;
   }
   delegate_data_.use_burst_computation = options.use_burst_computation;
+  delegate_data_.vendor_compilation_hints = options.vendor_compilation_hints;
+  delegate_data_.vendor_execution_hints = options.vendor_execution_hints;
+  delegate_data_.vendor_plugin = options.vendor_plugin;
+
   TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO,
                        "Created TensorFlow Lite delegate for NNAPI.");
   Prepare = DoPrepare;
@@ -5901,6 +6120,9 @@ const StatefulNnApiDelegate::Options StatefulNnApiDelegate::GetOptions(
       delegate_data->max_execution_loop_timeout_duration_ns;
   options.allow_dynamic_dimensions = delegate_data->allow_dynamic_dimensions;
   options.use_burst_computation = delegate_data->use_burst_computation;
+  options.vendor_compilation_hints = delegate_data->vendor_compilation_hints;
+  options.vendor_execution_hints = delegate_data->vendor_execution_hints;
+  options.vendor_plugin = delegate_data->vendor_plugin;
   return options;
 }
 
@@ -5988,7 +6210,7 @@ TfLiteStatus StatefulNnApiDelegate::GetNodesSupportedByAccelerator(
   for (int idx = 0; idx < *num_partitions; idx++) {
     const auto& partition_params = (*params_array)[idx];
     std::unique_ptr<NNAPIDelegateKernel> kernel_state(
-        new NNAPIDelegateKernel(nnapi));
+        new NNAPIDelegateKernel(nnapi, delegate_data->vendor_plugin));
     TfLiteDelegateParams params_with_delegate = partition_params;
     params_with_delegate.delegate = delegate;
     TF_LITE_ENSURE_STATUS(
@@ -6074,8 +6296,8 @@ static std::vector<int> GetSupportedOpsWithFp16WeightRemapping(
           std::string* unsupported_details) -> bool {
     std::vector<delegate::nnapi::NNAPIValidationFailure> map_failures;
     const auto is_supported = NNAPIDelegateKernel::Validate(
-        context, registration->builtin_code, registration->version,
-        target_feature_level, node, is_accelerator_specified, &map_failures);
+        context, registration, target_feature_level, node,
+        is_accelerator_specified, nullptr, &map_failures);
     if (!is_supported) {
       if (unsupported_details) {
         for (auto& failure : map_failures) {
@@ -6195,8 +6417,8 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context,
       TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
           context, node_index, &node, &registration));
       if (NNAPIDelegateKernel::Validate(
-              context, registration->builtin_code, registration->version,
-              target_feature_level, node, is_accelerator_specified,
+              context, registration, target_feature_level, node,
+              is_accelerator_specified, delegate_options.vendor_plugin,
               &map_failures)) {
         supported_nodes.push_back(node_index);
       }
@@ -6233,7 +6455,8 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context,
         NNAPIDelegateKernel* kernel_state =
             delegate_data->MaybeGetCachedDelegateKernel(params);
         if (!kernel_state) {
-          kernel_state = new NNAPIDelegateKernel(delegate_data->nnapi);
+          kernel_state = new NNAPIDelegateKernel(delegate_data->nnapi,
+                                                 delegate_data->vendor_plugin);
           kernel_state->Init(context, params, nnapi_errno);
         }
 
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h
index 6d3a7239574b6c..8fe985eb6c54fc 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/lite/nnapi/nnapi_implementation.h"
 
 struct NnApiSLDriverImplFL5;
+struct NnapiDelegateVendorPlugin;
 typedef struct ANeuralNetworksMemory ANeuralNetworksMemory;
 
 namespace tflite {
@@ -137,6 +138,22 @@ class StatefulNnApiDelegate : public TfLiteDelegate {
     // performance.
     // Default: Disabled for devices with NNAPI feature level 4 or lower.
     bool use_burst_computation = false;
+
+    // The optional null-terminated vendor specific compilation hints string.
+    // It is the vendor_plugin's responsibility to parse the hint string and
+    // decide whether the hints should be respected or not. If no vendor_plugin
+    // provided, the hints will be ignored.
+    const char* vendor_compilation_hints = nullptr;
+
+    // The optional null-terminated vendor specific execution hints string.
+    // It is the vendor_plugin's responsibility to parse the hint string and
+    // decide whether the hints should be respected or not. If no vendor_plugin
+    // provided, the hints will be ignored.
+    const char* vendor_execution_hints = nullptr;
+
+    // It is the users responsibility to make sure that
+    // vendor_plugin outlives the delegate instance.
+    NnapiDelegateVendorPlugin* vendor_plugin = nullptr;
   };
 
   // Uses default options.
@@ -296,6 +313,14 @@ class StatefulNnApiDelegate : public TfLiteDelegate {
     bool allow_dynamic_dimensions = false;
     // Whether to use NNAPI Burst mode.
     bool use_burst_computation = false;
+    // The null-terminated vendor specific compilation hints string
+    const char* vendor_compilation_hints = nullptr;
+    // The null-terminated vendor specific execution hints string.
+    const char* vendor_execution_hints = nullptr;
+
+    // It is the users responsibility to make sure that
+    // vendor_plugin outlives the delegate instance.
+    NnapiDelegateVendorPlugin* vendor_plugin = nullptr;
 
     // Smart pointer for automatically cleaning up NnApi structure in case the
     // delegate was constructed from an NNAPI support library
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h
index 96f4f3ba23c6a9..af0ffcdd0bd634 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate_plugin.h"
 #include "tensorflow/lite/nnapi/nnapi_implementation.h"
 
 namespace tflite {
@@ -36,77 +37,7 @@ constexpr int32_t kMinSdkVersionForNNAPI13 = 30;
 // kNNAPIRuntimeFeatureLevel*.
 constexpr int32_t kNNAPIRuntimeFeatureLevel5 = 31;
 constexpr int32_t kNNAPIRuntimeFeatureLevel6 = 1000006;
-
-// Track tensor indices to NN API tensor indices mapping.
-class OperandMapping {
- public:
-  // Given a TFLite index return the ANN index. If it doesn't exist
-  // return -1.
-  int lite_index_to_ann(int index) const {
-    const int64_t max_size = lite_tensor_to_ann_tensor_.size();
-    if (index >= 0 && index < max_size)
-      return lite_tensor_to_ann_tensor_[index];
-    else
-      return -1;
-  }
-
-  // NN API uses non tensor operands instead of structs. This creates one
-  // and returns the index. It uses a std::vector and resizes it as needed
-  // keeping -1 to unmapped values. Intermediate tensors likely will not
-  // be mapped.
-  int add_new_non_tensor_operand() { return next_ann_tensor_index_++; }
-
-  // This call is necessary for input operands generated by the delegate
-  // to map constant inputs not present in TFLite but required by NNAPI,
-  // for example when splitting one input in several ones.
-  int add_delegate_generated_input_ann_tensors_operand() {
-    return next_ann_tensor_index_++;
-  }
-
-  // Add a new mapping from `tflite_index` and return the NN API tensor index.
-  int add_new_ann_tensor_index(int tflite_index) {
-    const int64_t current_size = lite_tensor_to_ann_tensor_.size();
-    if (tflite_index >= current_size) {
-      lite_tensor_to_ann_tensor_.resize(tflite_index + 1, -1);
-    }
-    const int new_tensor_index = next_ann_tensor_index_++;
-    lite_tensor_to_ann_tensor_[tflite_index] = new_tensor_index;
-    return new_tensor_index;
-  }
-
-  // Given a TFLite index returns a TFLite type to which a tensor must be
-  // converted during copying the data to the memory allocated for NN API.
-  // kTfLiteNoType means no conversion is needed.
-  TfLiteType lite_index_to_ann_type_conversion(int index) const {
-    const int64_t max_size = index_to_type_conversion_.size();
-    if (index >= 0 && index < max_size)
-      return index_to_type_conversion_[index];
-    else
-      return kTfLiteNoType;
-  }
-
-  // Add a new mapping from TFLite index to a type conversion.
-  void add_type_conversion(int tflite_index, TfLiteType tflite_type) {
-    const int64_t current_size = index_to_type_conversion_.size();
-    if (tflite_index >= current_size) {
-      index_to_type_conversion_.resize(tflite_index + 1, kTfLiteNoType);
-    }
-    index_to_type_conversion_[tflite_index] = tflite_type;
-  }
-
- private:
-  // Next index of ann tensor
-  int next_ann_tensor_index_ = 0;
-
-  // Mapping from lite index. Use a std::vector for speed and code size
-  // rather than a map.
-  std::vector<int> lite_tensor_to_ann_tensor_;
-  // Mapping from lite index to a type which tensor must be converted to during
-  // the copying of the data to the memory allocated for NN API. kTfLiteNoType
-  // means no conversion is needed. Use an std::vector for speed and code size
-  // rather than a map.
-  std::vector<TfLiteType> index_to_type_conversion_;
-};
+constexpr int32_t kNNAPIRuntimeFeatureLevel7 = 1000007;
 
 class NNAPIOpBuilder;
 
@@ -171,6 +102,12 @@ class NNFreeBurst {
   const NnApi* nnapi_;
 };
 
+// RAII NN API MappingUtil Destructor for use with std::unique_ptr
+class NNFreeMappingUtil {
+ public:
+  void operator()(NnapiMappingUtilCInterface* mapping_util);
+};
+
 // Manage NNAPI shared memory handle
 class NNMemory {
  public:
@@ -264,13 +201,16 @@ struct NNAPIValidationFailure {
 // The kernel that represents the node sub set of TF Lite being run on NN API.
 class NNAPIDelegateKernel {
  public:
-  explicit NNAPIDelegateKernel(const NnApi* nnapi)
+  explicit NNAPIDelegateKernel(
+      const NnApi* nnapi, NnapiDelegateVendorPlugin* vendor_plugin = nullptr)
       : initialised_(false),
         nnapi_(nnapi),
         nn_model_(nullptr, NNFreeModel(nnapi_)),
         nn_compilation_(nullptr, NNFreeCompilation(nnapi_)),
         nn_burst_(nullptr, NNFreeBurst(nnapi_)),
-        nn_execution_(nullptr, NNFreeExecution(nnapi_)) {}
+        nn_execution_(nullptr, NNFreeExecution(nnapi_)),
+        mapping_util_(NnapiMappingUtilCInterfaceCreate(), NNFreeMappingUtil()),
+        vendor_plugin_(vendor_plugin) {}
   NNAPIDelegateKernel() : NNAPIDelegateKernel(NnApiImplementation()) {}
   ~NNAPIDelegateKernel() {
     for (auto content : allocation_memory_mapping_) {
@@ -278,6 +218,8 @@ class NNAPIDelegateKernel {
     }
   }
 
+  static NnapiMappingUtilCInterface* NnapiMappingUtilCInterfaceCreate();
+
   // Translate a node into its operands
   // It assumes that the call to Validate for has been successful for
   // the operation.
@@ -287,13 +229,15 @@ class NNAPIDelegateKernel {
   static TfLiteStatus Map(TfLiteContext* context, int builtin_code, int version,
                           int android_sdk_version,
                           const NNAPIOpMappingArgs& mapping_args,
-                          ANeuralNetworksOperationType* nn_op_type);
+                          ANeuralNetworksOperationType* nn_op_type,
+                          NnapiDelegateVendorPlugin* vendor_plugin = nullptr);
 
   // Returns true if the node can be accelerated with NNAPI.
   static bool Validate(
-      const TfLiteContext* context, int builtin_code, int version,
+      const TfLiteContext* context, const TfLiteRegistration* registration,
       int android_sdk_version, const TfLiteNode* node,
       bool is_accelerator_specified,
+      NnapiDelegateVendorPlugin* vendor_plugin = nullptr,
       // Collects lists of failures collected during
       // the validation of the possibility of accelerating
       // the given node
@@ -353,7 +297,8 @@ class NNAPIDelegateKernel {
   // indexes into the nodes array in the TfLiteContext.
   std::vector<int> nodes_;
   // Track indices we use
-  OperandMapping operand_mapping_;
+  std::unique_ptr<NnapiMappingUtilCInterface, NNFreeMappingUtil> mapping_util_;
+
   std::map<const MMAPAllocation*, ANeuralNetworksMemory*>
       allocation_memory_mapping_;
   // Track memory map
@@ -371,13 +316,14 @@ class NNAPIDelegateKernel {
 
   std::vector<uint8_t> nn_compilation_cache_token_;
 
-  std::vector<int> nnapi_to_tflite_op_mapping_;
   // Map of DENSIFY output tensor id to node id.
   std::vector<int> densify_output_to_node_mapping_;
   // Map of DEQUANTIZE output tensor id to node id.
   // Only contains DEQUANTIZE nodes with non-const input.
   std::vector<int> non_const_dequantize_output_to_node_mapping_;
 
+  NnapiDelegateVendorPlugin* vendor_plugin_ = nullptr;
+
   // Fully initialized in NNAPIDelegateKernel::AddOpsAndTensors
   int target_feature_level_ = 27;  // kMinSdkVersionForNNAPI10
 
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_plugin.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_plugin.h
new file mode 100644
index 00000000000000..7b3b0f7f404ffc
--- /dev/null
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_plugin.h
@@ -0,0 +1,104 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_PLUGIN_H_
+#define TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_PLUGIN_H_
+
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// The mapping utils intended for vendor plugin to track:
+//   - TFLite tensor indices to NN API tensor indices mapping.
+//   - TFLite node indices to NN API operation indices mapping.
+// WARNING: This is an experimental interface that is subject to change.
+typedef struct NnapiMappingUtilCInterface {
+  // Given a TFLite index, return the ANN index. If it doesn't exist
+  // return -1.
+  int (*TfLiteIndexToNnIndex)(NnapiMappingUtilCInterface* mapping, int index);
+
+  // When adding a non-tensor TFLite node parameter to NNAPI as an
+  // ANeuralNetworksOperand, notify NNAPI delegate to increment the operand
+  // count.
+  int (*AddNewNonTensorOperand)(NnapiMappingUtilCInterface* mapping);
+
+  // When adding a TFLite tensor to NNAPI as an ANeuralNetworksOperand, notify
+  // NNAPI delegate to add a new mapping from `tflite_index` and return the NN
+  // API tensor index.
+  int (*AddNewNnTensorIndex)(NnapiMappingUtilCInterface* mapping,
+                             int tflite_index);
+
+  // When adding a TFLite tensor to NNAPI as multiple ANeuralNetworksOperand
+  // objects, for example when splitting one input into several ones, notify
+  // NNAPI delegate to increment the operand count.
+  int (*AddDelegateGeneratedInputAnnTensorOperand)(
+      NnapiMappingUtilCInterface* mapping);
+
+  // Given a TFLite index returns a TFLite type to which a tensor must be
+  // converted during copying the data to the memory allocated for NN API.
+  // kTfLiteNoType means no conversion is needed.
+  TfLiteType (*TfLiteIndexToNnTypeConversion)(
+      NnapiMappingUtilCInterface* mapping, int index);
+
+  // Add a new mapping from TFLite tensor index to a type conversion.
+  void (*AddTypeConversion)(NnapiMappingUtilCInterface* mapping,
+                            int tflite_index, TfLiteType tflite_type);
+
+  // Add a new mapping from TFLite node index to NNAPI op index.
+  void (*AddNnapiToTfliteOpMapping)(NnapiMappingUtilCInterface* mapping,
+                                    int tflite_node_index);
+
+  // opaque handle for the mapping context. Only intended for the NNAPI Delegate
+  // to use.
+  void* context;
+} NnapiMappingUtilCInterface;
+
+// The interface for NNAPI Vendor Plugin.
+// The interface exposes necessary functionalities for NNAPI delegate to
+// interact with the vendor plugin.
+// WARNING: This is an experimental interface that is subject to change.
+typedef struct NnapiDelegateVendorPlugin {
+  // Validate whether the given TFLite node is supported by the plugin.
+  bool (*ValidateNode)(const TfLiteContext* context,
+                       const TfLiteRegistration* registration,
+                       const TfLiteNode* node);
+
+  // Translate a TFLite node into corresponding NNAPI operands and operation.
+  // It assumes that the call to Validate for has been successful for
+  // the operation. In case of success it returns kTfLiteOk and stores the
+  // corresponding NNAPI operand indices and operation code through the mapping
+  // utility interface. Returns kTfLiteError in case of failures during mapping.
+  TfLiteStatus (*MapNode)(const TfLiteContext* context, const TfLiteNode* node,
+                          int node_index, NnapiMappingUtilCInterface* mapping,
+                          ANeuralNetworksModel* model);
+
+  // Parse the provided compilation_hints string and configure it for the given
+  // ANeuralNetworksCompilation handle.
+  TfLiteStatus (*ConfigureCompilationHints)(
+      const char* compilation_hints, ANeuralNetworksCompilation* compilation);
+
+  // Parse the provided execution_hints string and configure it for the given
+  // ANeuralNetworksExecution handle.
+  TfLiteStatus (*ConfigureExecutionHints)(const char* execution_hints,
+                                          ANeuralNetworksExecution* execution);
+} NnapiDelegateVendorPlugin;
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // TENSORFLOW_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_PLUGIN_H_
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
index 6834d10e26b96c..fce957924a9ae0 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -20,6 +20,8 @@ limitations under the License.
 
 #include <gtest/gtest.h>
 #include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h"
+#include "tensorflow/lite/delegates/nnapi/nnapi_delegate_plugin.h"
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/kernels/test_util.h"
 #include "tensorflow/lite/model.h"
@@ -5405,6 +5407,155 @@ TEST(NNAPIDelegate, LeakyReluQuantized) {
                                  },
                                  kQuantizedTolerance)));
 }
+}  // namespace
+
+namespace ops {
+namespace builtin {
+TfLiteRegistration* Register_FLOOR();
+}  // namespace builtin
+}  // namespace ops
+
+namespace {
+// The "nnapi-custom-op" is just float32 floor.
+static const char kTestCustomOp[] = "nnapi-custom-op";
+class NnapiTestVendorPlugin : public NnapiDelegateVendorPlugin {
+ public:
+  NnapiTestVendorPlugin() {
+    ValidateNode = DoValidateNode;
+    MapNode = DoMapNode;
+    ConfigureCompilationHints = DoConfigureCompilationHints;
+    ConfigureExecutionHints = DoConfigureExecutionHints;
+  }
+
+  static bool DoValidateNode(const TfLiteContext* context,
+                             const TfLiteRegistration* registration,
+                             const TfLiteNode* node) {
+    if (strcmp(kTestCustomOp, registration->custom_name) != 0) {
+      return false;
+    }
+    if (node->inputs->size != 1 || node->outputs->size != 1) {
+      return false;
+    }
+    if (context->tensors[node->inputs->data[(0)]].type != kTfLiteFloat32 ||
+        context->tensors[node->outputs->data[(0)]].type != kTfLiteFloat32) {
+      return false;
+    }
+    return true;
+  }
+
+  static TfLiteStatus AddFloat32Tensor(const TfLiteContext* context,
+                                       int tensor_index,
+                                       NnapiMappingUtilCInterface* mapping,
+                                       std::vector<uint32_t>* indices,
+                                       ANeuralNetworksModel* model) {
+    int ann_tensor_index = mapping->TfLiteIndexToNnIndex(mapping, tensor_index);
+    if (ann_tensor_index != -1) {
+      indices->push_back(ann_tensor_index);
+      return kTfLiteOk;
+    }
+    // Allocate a new tensor index
+    ann_tensor_index = mapping->AddNewNnTensorIndex(mapping, tensor_index);
+    TfLiteTensor* tensor = &context->tensors[tensor_index];
+    ANeuralNetworksOperandType operand_type{
+        .type = ANEURALNETWORKS_TENSOR_FLOAT32,
+        .dimensionCount = static_cast<uint32_t>(tensor->dims->size),
+        .dimensions = reinterpret_cast<uint32_t*>(tensor->dims->data),
+        .scale = 0.0f,
+        .zeroPoint = 0,
+    };
+    EXPECT_EQ(NnApiImplementation()->ANeuralNetworksModel_addOperand(
+                  model, &operand_type),
+              ANEURALNETWORKS_NO_ERROR);
+    if (tensor->allocation_type == kTfLiteMmapRo) {
+      EXPECT_EQ(NnApiImplementation()->ANeuralNetworksModel_setOperandValue(
+                    model, ann_tensor_index, tensor->data.data, tensor->bytes),
+                ANEURALNETWORKS_NO_ERROR);
+    }
+    indices->push_back(ann_tensor_index);
+    return kTfLiteOk;
+  }
+
+  static TfLiteStatus DoMapNode(const TfLiteContext* context,
+                                const TfLiteNode* node, int node_index,
+                                NnapiMappingUtilCInterface* mapping,
+                                ANeuralNetworksModel* model) {
+    std::vector<uint32_t> input_indices;
+    std::vector<uint32_t> output_indices;
+    for (int input_pos = 0; input_pos < node->inputs->size; ++input_pos) {
+      const auto input_index = node->inputs->data[input_pos];
+      EXPECT_EQ(AddFloat32Tensor(context, input_index, mapping, &input_indices,
+                                 model),
+                kTfLiteOk);
+    }
+    for (int output_pos = 0; output_pos < node->outputs->size; ++output_pos) {
+      const auto output_index = node->outputs->data[output_pos];
+      EXPECT_EQ(AddFloat32Tensor(context, output_index, mapping,
+                                 &output_indices, model),
+                kTfLiteOk);
+    }
+    EXPECT_EQ(
+        NnApiImplementation()->ANeuralNetworksModel_addOperation(
+            model, ANEURALNETWORKS_FLOOR,
+            static_cast<uint32_t>(input_indices.size()), input_indices.data(),
+            static_cast<uint32_t>(output_indices.size()),
+            output_indices.data()),
+        ANEURALNETWORKS_NO_ERROR);
+    mapping->AddNnapiToTfliteOpMapping(mapping, node_index);
+    return kTfLiteOk;
+  }
+
+  static TfLiteStatus DoConfigureCompilationHints(
+      const char* compilation_hints, ANeuralNetworksCompilation* compilation) {
+    return kTfLiteOk;
+  }
+
+  static TfLiteStatus DoConfigureExecutionHints(
+      const char* execution_hints, ANeuralNetworksExecution* execution) {
+    return kTfLiteOk;
+  }
+};
+
+class CustomFloorOpModel : public SingleOpModelWithNNAPI {
+ public:
+  CustomFloorOpModel(const StatefulNnApiDelegate::Options& options,
+                     const TensorData& input, const TensorData& output,
+                     bool allow_fp32_relax_to_fp16 = false)
+      : SingleOpModelWithNNAPI(options) {
+    Init(input, output, allow_fp32_relax_to_fp16);
+  }
+
+  int input() { return input_; }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input_;
+  int output_;
+
+ private:
+  // Performs initialization logic shared across all constructors.
+  void Init(const TensorData& input, const TensorData& output,
+            bool allow_fp32_relax_to_fp16 = false) {
+    input_ = AddInput(input);
+    output_ = AddOutput(output);
+    SetCustomOp(kTestCustomOp, {}, tflite::ops::builtin::Register_FLOOR);
+    BuildInterpreterWithNNAPI({GetShape(input_)}, allow_fp32_relax_to_fp16);
+  }
+};
+
+TEST(NNAPIDelegate, CustomFloorVendorExtension) {
+  NnapiTestVendorPlugin* vendor_plugin = new NnapiTestVendorPlugin();
+  StatefulNnApiDelegate::Options options;
+  options.accelerator_name = "nnapi-reference";
+  options.vendor_plugin = vendor_plugin;
+
+  CustomFloorOpModel m(options, {TensorType_FLOAT32, {1, 2, 2, 1}},
+                       {TensorType_FLOAT32, {1, 2, 2, 1}});
+  m.PopulateTensor<float>(m.input(), {0, 0.2, 1.7, 2.8});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0.0, 0.0, 1.0, 2.0}));
+  delete vendor_plugin;
+}
 
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/BUILD b/tensorflow/lite/delegates/xnnpack/BUILD
index 4bba02a2fba8a2..bd2900a06a2be3 100644
--- a/tensorflow/lite/delegates/xnnpack/BUILD
+++ b/tensorflow/lite/delegates/xnnpack/BUILD
@@ -359,6 +359,40 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "quantized_reduce_tester",
+    testonly = 1,
+    srcs = ["quantized_reduce_tester.cc"],
+    hdrs = ["quantized_reduce_tester.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/c:common",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/schema:schema_conversion_utils",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
+cc_library(
+    name = "quantized_resize_bilinear_tester",
+    testonly = 1,
+    srcs = ["quantized_resize_bilinear_tester.cc"],
+    hdrs = ["quantized_resize_bilinear_tester.h"],
+    deps = [
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/c:common",
+        "//tensorflow/lite/kernels:builtin_ops",
+        "//tensorflow/lite/schema:schema_conversion_utils",
+        "//tensorflow/lite/schema:schema_fbs",
+        "@com_google_googletest//:gtest",
+        "@flatbuffers",
+    ],
+)
+
 cc_library(
     name = "quantized_unary_elementwise_tester",
     testonly = 1,
@@ -1096,6 +1130,21 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "signed_quantized_mean_test",
+    srcs = ["signed_quantized_mean_test.cc"],
+    linkopts = select({
+        "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":quantized_reduce_tester",
+        ":test_main",
+        ":xnnpack_delegate_test_mode",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_test(
     name = "signed_quantized_mul_test",
     srcs = ["signed_quantized_mul_test.cc"],
@@ -1126,6 +1175,21 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "signed_quantized_resize_bilinear_test",
+    srcs = ["signed_quantized_resize_bilinear_test.cc"],
+    linkopts = select({
+        "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":quantized_resize_bilinear_tester",
+        ":test_main",
+        "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate_test_mode",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_test(
     name = "signed_quantized_sub_test",
     srcs = ["signed_quantized_sub_test.cc"],
@@ -1141,6 +1205,21 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "signed_quantized_transpose_conv_test",
+    srcs = ["signed_quantized_transpose_conv_test.cc"],
+    linkopts = select({
+        "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":quantized_transpose_conv_tester",
+        ":test_main",
+        "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate_test_mode",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_test(
     name = "softmax_test",
     srcs = ["softmax_test.cc"],
@@ -1216,6 +1295,21 @@ cc_test(
     ],
 )
 
+cc_test(
+    name = "transpose_conv_test",
+    srcs = ["transpose_conv_test.cc"],
+    linkopts = select({
+        "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":test_main",
+        ":transpose_conv_tester",
+        "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate_test_mode",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 cc_test(
     name = "unsigned_quantized_add_test",
     srcs = ["unsigned_quantized_add_test.cc"],
@@ -1307,14 +1401,14 @@ cc_test(
 )
 
 cc_test(
-    name = "unsigned_quantized_mul_test",
-    srcs = ["unsigned_quantized_mul_test.cc"],
+    name = "unsigned_quantized_mean_test",
+    srcs = ["unsigned_quantized_mean_test.cc"],
     linkopts = select({
         "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
         "//conditions:default": [],
     }),
     deps = [
-        ":quantized_binary_elementwise_tester",
+        ":quantized_reduce_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "@com_google_googletest//:gtest",
@@ -1322,14 +1416,14 @@ cc_test(
 )
 
 cc_test(
-    name = "unsigned_quantized_pad_test",
-    srcs = ["unsigned_quantized_pad_test.cc"],
+    name = "unsigned_quantized_mul_test",
+    srcs = ["unsigned_quantized_mul_test.cc"],
     linkopts = select({
         "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
         "//conditions:default": [],
     }),
     deps = [
-        ":quantized_pad_tester",
+        ":quantized_binary_elementwise_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "@com_google_googletest//:gtest",
@@ -1337,14 +1431,14 @@ cc_test(
 )
 
 cc_test(
-    name = "unsigned_quantized_sub_test",
-    srcs = ["unsigned_quantized_sub_test.cc"],
+    name = "unsigned_quantized_pad_test",
+    srcs = ["unsigned_quantized_pad_test.cc"],
     linkopts = select({
         "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
         "//conditions:default": [],
     }),
     deps = [
-        ":quantized_binary_elementwise_tester",
+        ":quantized_pad_tester",
         ":test_main",
         ":xnnpack_delegate_test_mode",
         "@com_google_googletest//:gtest",
@@ -1352,31 +1446,31 @@ cc_test(
 )
 
 cc_test(
-    name = "transpose_conv_test",
-    srcs = ["transpose_conv_test.cc"],
+    name = "unsigned_quantized_resize_bilinear_test",
+    srcs = ["unsigned_quantized_resize_bilinear_test.cc"],
     linkopts = select({
         "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
         "//conditions:default": [],
     }),
     deps = [
+        ":quantized_resize_bilinear_tester",
         ":test_main",
-        ":transpose_conv_tester",
         "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate_test_mode",
         "@com_google_googletest//:gtest",
     ],
 )
 
 cc_test(
-    name = "signed_quantized_transpose_conv_test",
-    srcs = ["signed_quantized_transpose_conv_test.cc"],
+    name = "unsigned_quantized_sub_test",
+    srcs = ["unsigned_quantized_sub_test.cc"],
     linkopts = select({
         "//tensorflow:emscripten": EMSCRIPTEN_LINKOPTS,
         "//conditions:default": [],
     }),
     deps = [
-        ":quantized_transpose_conv_tester",
+        ":quantized_binary_elementwise_tester",
         ":test_main",
-        "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate_test_mode",
+        ":xnnpack_delegate_test_mode",
         "@com_google_googletest//:gtest",
     ],
 )
diff --git a/tensorflow/lite/delegates/xnnpack/README.md b/tensorflow/lite/delegates/xnnpack/README.md
index f42617d0911283..116c74c06c967a 100644
--- a/tensorflow/lite/delegates/xnnpack/README.md
+++ b/tensorflow/lite/delegates/xnnpack/README.md
@@ -236,7 +236,7 @@ Below is the list of currently supported floating-point operators:
 
 #### `MEAN`
 
-* The first input and the output must be a 4D tensors in 32-bit
+* The first input and the output must be 4D tensors in 32-bit
   floating-point format.
 * The second input (the input with the axes specification) must be static
   (use `kTfLiteMmapRo` allocation type).
@@ -401,6 +401,15 @@ Below is the list of currently supported quantized operators:
 * Fused `NONE`, `RELU`, `RELU_N1_TO_1`, and `RELU6` activations are supported,
   but fused `TANH` and `SIGN_BIT` activations are not.
 
+#### `MEAN`
+
+* The first input and the output must be 4D tensors in 8-bit quantized format.
+* The second input (the input with the axes specification) must be static
+  (use `kTfLiteMmapRo` allocation type).
+* Only [1, 2] or [2, 1] axes specification (i.e. reduction across spatial
+  dimensions) is supported.
+* Only `keep_dims = True` parameter value is supported.
+
 #### `MUL`
 
 * Inputs and outputs must be in 8-bit quantized format.
@@ -414,6 +423,12 @@ Below is the list of currently supported quantized operators:
   (use `kTfLiteMmapRo` allocation type).
 * The numbers of padding elements must be non-negative.
 
+#### `RESIZE_BILINEAR`
+
+* The first input and the output must be 4D tensors in 8-bit quantized format.
+* The second input (the input with the new shape specification) must be
+  static (use `kTfLiteMmapRo` allocation type).
+
 #### `SUB`
 
 * Inputs and outputs must be in 8-bit quantized format.
diff --git a/tensorflow/lite/delegates/xnnpack/quantized_reduce_tester.cc b/tensorflow/lite/delegates/xnnpack/quantized_reduce_tester.cc
new file mode 100644
index 00000000000000..1966113b15ff05
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/quantized_reduce_tester.cc
@@ -0,0 +1,195 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/xnnpack/quantized_reduce_tester.h"
+
+#include <array>
+#include <cstdint>
+#include <functional>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "flatbuffers/flatbuffers.h"  // from @flatbuffers
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_conversion_utils.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
+
+namespace tflite {
+namespace xnnpack {
+
+template <class T>
+void QuantizedReduceTester::Test(Interpreter* delegate_interpreter,
+                                 Interpreter* default_interpreter) const {
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto input_rng = std::bind(
+      std::uniform_int_distribution<int32_t>(std::numeric_limits<T>::min(),
+                                             std::numeric_limits<T>::max()),
+      std::ref(rng));
+
+  T* default_input_data = default_interpreter->typed_input_tensor<T>(0);
+  std::generate(default_input_data, default_input_data + InputSize(),
+                std::ref(input_rng));
+
+  T* delegate_input_data = delegate_interpreter->typed_input_tensor<T>(0);
+  std::copy(default_input_data, default_input_data + InputSize(),
+            delegate_input_data);
+
+  ASSERT_EQ(default_interpreter->Invoke(), kTfLiteOk);
+  ASSERT_EQ(delegate_interpreter->Invoke(), kTfLiteOk);
+
+  T* default_output_data = default_interpreter->typed_output_tensor<T>(0);
+  T* delegate_output_data = delegate_interpreter->typed_output_tensor<T>(0);
+
+  const int32_t output_size = OutputSize();
+  for (size_t i = 0; i < output_size; i++) {
+    ASSERT_LE(std::abs(static_cast<int32_t>(default_output_data[i]) -
+                       static_cast<int32_t>(delegate_output_data[i])),
+              1)
+        << "default " << static_cast<int32_t>(default_output_data[i])
+        << ", delegate " << static_cast<int32_t>(delegate_output_data[i])
+        << " at index " << i << " / " << output_size;
+  }
+}
+
+void QuantizedReduceTester::Test(tflite::BuiltinOperator reduce_op,
+                                 TfLiteDelegate* delegate) const {
+  std::vector<char> buffer = CreateTfLiteModel(reduce_op);
+  const Model* model = GetModel(buffer.data());
+
+  std::unique_ptr<Interpreter> delegate_interpreter;
+  ASSERT_EQ(
+      InterpreterBuilder(
+          model,
+          ::tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates())(
+          &delegate_interpreter),
+      kTfLiteOk);
+  std::unique_ptr<Interpreter> default_interpreter;
+  ASSERT_EQ(
+      InterpreterBuilder(
+          model,
+          ::tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates())(
+          &default_interpreter),
+      kTfLiteOk);
+
+  ASSERT_TRUE(delegate_interpreter);
+  ASSERT_TRUE(default_interpreter);
+
+  ASSERT_EQ(delegate_interpreter->inputs().size(), 1);
+  ASSERT_EQ(default_interpreter->inputs().size(), 1);
+
+  ASSERT_EQ(delegate_interpreter->outputs().size(), 1);
+  ASSERT_EQ(default_interpreter->outputs().size(), 1);
+
+  ASSERT_EQ(delegate_interpreter->AllocateTensors(), kTfLiteOk);
+  ASSERT_EQ(default_interpreter->AllocateTensors(), kTfLiteOk);
+
+  ASSERT_EQ(delegate_interpreter->ModifyGraphWithDelegate(delegate), kTfLiteOk);
+
+  if (Unsigned()) {
+    Test<uint8_t>(delegate_interpreter.get(), default_interpreter.get());
+  } else {
+    Test<int8_t>(delegate_interpreter.get(), default_interpreter.get());
+  }
+}
+
+std::vector<char> QuantizedReduceTester::CreateTfLiteModel(
+    tflite::BuiltinOperator reduce_op) const {
+  flatbuffers::FlatBufferBuilder builder;
+  flatbuffers::Offset<OperatorCode> operator_code =
+      CreateOperatorCode(builder, reduce_op);
+
+  const std::array<flatbuffers::Offset<Buffer>, 2> buffers{{
+      CreateBuffer(builder, builder.CreateVector({})),
+      CreateBuffer(builder, builder.CreateVector(
+                                reinterpret_cast<const uint8_t*>(Axes().data()),
+                                sizeof(int32_t) * Axes().size())),
+  }};
+
+  const std::vector<int32_t> output_shape = OutputShape();
+  const std::array<int32_t, 1> axes_shape{
+      {static_cast<int32_t>(Axes().size())}};
+  const std::array<flatbuffers::Offset<Tensor>, 3> tensors{{
+      CreateTensor(builder,
+                   builder.CreateVector<int32_t>(InputShape().data(),
+                                                 InputShape().size()),
+                   Unsigned() ? TensorType_UINT8 : TensorType_INT8,
+                   /*buffer=*/0, /*name=*/0,
+                   CreateQuantizationParameters(
+                       builder, /*min=*/0, /*max=*/0,
+                       builder.CreateVector<float>({InputScale()}),
+                       builder.CreateVector<int64_t>({InputZeroPoint()}))),
+      CreateTensor(
+          builder,
+          builder.CreateVector<int32_t>(axes_shape.data(), axes_shape.size()),
+          TensorType_INT32, /*buffer=*/1),
+      CreateTensor(builder,
+                   builder.CreateVector<int32_t>(output_shape.data(),
+                                                 output_shape.size()),
+                   Unsigned() ? TensorType_UINT8 : TensorType_INT8,
+                   /*buffer=*/0, /*name=*/0,
+                   CreateQuantizationParameters(
+                       builder, /*min=*/0, /*max=*/0,
+                       builder.CreateVector<float>({OutputScale()}),
+                       builder.CreateVector<int64_t>({OutputZeroPoint()}))),
+  }};
+
+  const flatbuffers::Offset<ReducerOptions> reducer_options =
+      CreateReducerOptions(builder, KeepDims());
+
+  const std::array<int32_t, 2> op_inputs{{0, 1}};
+  const std::array<int32_t, 1> op_outputs{{2}};
+  flatbuffers::Offset<Operator> op = CreateOperator(
+      builder, /*opcode_index=*/0,
+      builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
+      builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()),
+      tflite::BuiltinOptions_ReducerOptions, reducer_options.Union());
+
+  const std::array<int32_t, 1> subgraph_inputs{{0}};
+  const std::array<int32_t, 1> subgraph_outputs{{2}};
+  flatbuffers::Offset<SubGraph> subgraph = CreateSubGraph(
+      builder, builder.CreateVector(tensors.data(), tensors.size()),
+      builder.CreateVector<int32_t>(subgraph_inputs.data(),
+                                    subgraph_inputs.size()),
+      builder.CreateVector<int32_t>(subgraph_outputs.data(),
+                                    subgraph_outputs.size()),
+      builder.CreateVector(&op, 1));
+
+  flatbuffers::Offset<flatbuffers::String> description =
+      builder.CreateString("Quantized Reduce model");
+
+  flatbuffers::Offset<Model> model_buffer = CreateModel(
+      builder, TFLITE_SCHEMA_VERSION, builder.CreateVector(&operator_code, 1),
+      builder.CreateVector(&subgraph, 1), description,
+      builder.CreateVector(buffers.data(), buffers.size()));
+
+  builder.Finish(model_buffer);
+
+  return std::vector<char>(builder.GetBufferPointer(),
+                           builder.GetBufferPointer() + builder.GetSize());
+}
+
+int32_t QuantizedReduceTester::ComputeSize(const std::vector<int32_t>& shape) {
+  return std::accumulate(shape.cbegin(), shape.cend(), 1,
+                         std::multiplies<int32_t>());
+}
+
+}  // namespace xnnpack
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/quantized_reduce_tester.h b/tensorflow/lite/delegates/xnnpack/quantized_reduce_tester.h
new file mode 100644
index 00000000000000..8eb7f50f3d4537
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/quantized_reduce_tester.h
@@ -0,0 +1,155 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_QUANTIZED_REDUCE_TESTER_H_
+#define TENSORFLOW_LITE_DELEGATES_XNNPACK_QUANTIZED_REDUCE_TESTER_H_
+
+#include <cstdint>
+#include <unordered_set>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace xnnpack {
+
+class QuantizedReduceTester {
+ public:
+  QuantizedReduceTester() = default;
+  QuantizedReduceTester(const QuantizedReduceTester&) = delete;
+  QuantizedReduceTester& operator=(const QuantizedReduceTester&) = delete;
+
+  inline QuantizedReduceTester& InputShape(
+      std::initializer_list<int32_t> shape) {
+    for (auto it = shape.begin(); it != shape.end(); ++it) {
+      EXPECT_GT(*it, 0);
+    }
+    input_shape_ = std::vector<int32_t>(shape.begin(), shape.end());
+    input_size_ = QuantizedReduceTester::ComputeSize(input_shape_);
+    return *this;
+  }
+
+  inline const std::vector<int32_t>& InputShape() const { return input_shape_; }
+
+  inline int32_t InputSize() const { return input_size_; }
+
+  inline QuantizedReduceTester& Axes(std::initializer_list<int32_t> axes) {
+    for (auto it = axes.begin(); it != axes.end(); ++it) {
+      EXPECT_GE(*it, 0);
+    }
+    axes_ = std::vector<int32_t>(axes.begin(), axes.end());
+    return *this;
+  }
+
+  inline const std::vector<int32_t>& Axes() const { return axes_; }
+
+  inline QuantizedReduceTester& KeepDims(bool keep_dims) {
+    keep_dims_ = keep_dims;
+    return *this;
+  }
+
+  inline bool KeepDims() const { return keep_dims_; }
+
+  inline std::vector<int32_t> OutputShape() const {
+    std::vector<int32_t> output_shape;
+    output_shape.reserve(InputShape().size());
+    std::unordered_set<int32_t> axes_set(Axes().cbegin(), Axes().cend());
+    for (int32_t i = 0; i < InputShape().size(); i++) {
+      if (axes_set.count(i) != 0) {
+        if (KeepDims()) {
+          output_shape.push_back(1);
+        }
+      } else {
+        output_shape.push_back(InputShape()[i]);
+      }
+    }
+    return output_shape;
+  }
+
+  inline int32_t OutputSize() const {
+    int32_t output_size = 1;
+    std::unordered_set<int32_t> axes_set(Axes().cbegin(), Axes().cend());
+    for (int32_t i = 0; i < InputShape().size(); i++) {
+      if (axes_set.count(i) == 0) {
+        output_size *= InputShape()[i];
+      }
+    }
+    return output_size;
+  }
+
+  inline QuantizedReduceTester& InputZeroPoint(int32_t input_zero_point) {
+    input_zero_point_ = input_zero_point;
+    return *this;
+  }
+
+  inline int32_t InputZeroPoint() const { return input_zero_point_; }
+
+  inline QuantizedReduceTester& OutputZeroPoint(int32_t output_zero_point) {
+    output_zero_point_ = output_zero_point;
+    return *this;
+  }
+
+  inline int32_t OutputZeroPoint() const { return output_zero_point_; }
+
+  inline QuantizedReduceTester& InputScale(float input_scale) {
+    input_scale_ = input_scale;
+    return *this;
+  }
+
+  inline float InputScale() const { return input_scale_; }
+
+  inline QuantizedReduceTester& OutputScale(float output_scale) {
+    output_scale_ = output_scale;
+    return *this;
+  }
+
+  inline float OutputScale() const { return output_scale_; }
+
+  inline QuantizedReduceTester& Unsigned(bool is_unsigned) {
+    unsigned_ = is_unsigned;
+    return *this;
+  }
+
+  inline bool Unsigned() const { return unsigned_; }
+
+  template <class T>
+  void Test(Interpreter* delegate_interpreter,
+            Interpreter* default_interpreter) const;
+
+  void Test(tflite::BuiltinOperator reduce_op, TfLiteDelegate* delegate) const;
+
+ private:
+  std::vector<char> CreateTfLiteModel(tflite::BuiltinOperator reduce_op) const;
+
+  static int32_t ComputeSize(const std::vector<int32_t>& shape);
+
+  std::vector<int32_t> input_shape_;
+  std::vector<int32_t> axes_;
+  int32_t input_size_;
+  bool keep_dims_ = true;
+  int32_t input_zero_point_ = 1;
+  int32_t output_zero_point_ = 2;
+  float input_scale_ = 1.25f;
+  float output_scale_ = 0.75f;
+  bool unsigned_ = false;
+};
+
+}  // namespace xnnpack
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_XNNPACK_QUANTIZED_REDUCE_TESTER_H_
diff --git a/tensorflow/lite/delegates/xnnpack/quantized_resize_bilinear_tester.cc b/tensorflow/lite/delegates/xnnpack/quantized_resize_bilinear_tester.cc
new file mode 100644
index 00000000000000..bf1ad7cc637ded
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/quantized_resize_bilinear_tester.cc
@@ -0,0 +1,206 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/delegates/xnnpack/quantized_resize_bilinear_tester.h"
+
+#include <array>
+#include <cstdint>
+#include <functional>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "flatbuffers/flatbuffers.h"  // from @flatbuffers
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/model.h"
+#include "tensorflow/lite/schema/schema_conversion_utils.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
+
+namespace tflite {
+namespace xnnpack {
+
+template <class T>
+void QuantizedResizeBilinearTester::Test(
+    Interpreter* delegate_interpreter, Interpreter* default_interpreter) const {
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto input_rng = std::bind(
+      std::uniform_int_distribution<int32_t>(std::numeric_limits<T>::min(),
+                                             std::numeric_limits<T>::max()),
+      std::ref(rng));
+
+  T* default_input_data = default_interpreter->typed_input_tensor<T>(0);
+  std::generate(default_input_data,
+                default_input_data +
+                    BatchSize() * InputHeight() * InputWidth() * Channels(),
+                std::ref(input_rng));
+
+  T* delegate_input_data = delegate_interpreter->typed_input_tensor<T>(0);
+  std::copy(default_input_data,
+            default_input_data +
+                BatchSize() * InputHeight() * InputWidth() * Channels(),
+            delegate_input_data);
+
+  ASSERT_EQ(default_interpreter->Invoke(), kTfLiteOk);
+  ASSERT_EQ(delegate_interpreter->Invoke(), kTfLiteOk);
+
+  T* default_output_data = default_interpreter->typed_output_tensor<T>(0);
+  T* delegate_output_data = delegate_interpreter->typed_output_tensor<T>(0);
+
+  for (int i = 0; i < BatchSize(); i++) {
+    for (int y = 0; y < OutputHeight(); y++) {
+      for (int x = 0; x < OutputWidth(); x++) {
+        for (int c = 0; c < Channels(); c++) {
+          const int index =
+              ((i * OutputHeight() + y) * OutputWidth() + x) * Channels() + c;
+          ASSERT_LE(std::abs(static_cast<int32_t>(default_output_data[index]) -
+                             static_cast<int32_t>(delegate_output_data[index])),
+                    1)
+              << "batch " << i << " / " << BatchSize() << ", y position " << y
+              << " / " << OutputHeight() << ", x position " << x << " / "
+              << OutputWidth() << ", channel " << c << " / " << Channels();
+        }
+      }
+    }
+  }
+}
+
+void QuantizedResizeBilinearTester::Test(TfLiteDelegate* delegate) const {
+  std::vector<char> buffer = CreateTfLiteModel();
+  const Model* model = GetModel(buffer.data());
+
+  std::unique_ptr<Interpreter> delegate_interpreter;
+  ASSERT_EQ(
+      InterpreterBuilder(
+          model,
+          ::tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates())(
+          &delegate_interpreter),
+      kTfLiteOk);
+  std::unique_ptr<Interpreter> default_interpreter;
+  ASSERT_EQ(
+      InterpreterBuilder(
+          model,
+          ::tflite::ops::builtin::BuiltinOpResolverWithoutDefaultDelegates())(
+          &default_interpreter),
+      kTfLiteOk);
+
+  ASSERT_TRUE(delegate_interpreter);
+  ASSERT_TRUE(default_interpreter);
+
+  ASSERT_EQ(delegate_interpreter->inputs().size(), 1);
+  ASSERT_EQ(default_interpreter->inputs().size(), 1);
+
+  ASSERT_EQ(delegate_interpreter->outputs().size(), 1);
+  ASSERT_EQ(default_interpreter->outputs().size(), 1);
+
+  ASSERT_EQ(delegate_interpreter->AllocateTensors(), kTfLiteOk);
+  ASSERT_EQ(default_interpreter->AllocateTensors(), kTfLiteOk);
+
+  ASSERT_EQ(delegate_interpreter->ModifyGraphWithDelegate(delegate), kTfLiteOk);
+
+  if (Unsigned()) {
+    Test<uint8_t>(delegate_interpreter.get(), default_interpreter.get());
+  } else {
+    Test<int8_t>(delegate_interpreter.get(), default_interpreter.get());
+  }
+}
+
+std::vector<char> QuantizedResizeBilinearTester::CreateTfLiteModel() const {
+  flatbuffers::FlatBufferBuilder builder;
+  flatbuffers::Offset<OperatorCode> operator_code =
+      CreateOperatorCode(builder, BuiltinOperator_RESIZE_BILINEAR);
+
+  flatbuffers::Offset<tflite::ResizeBilinearOptions> resize_bilinear_options =
+      CreateResizeBilinearOptions(builder, AlignCorners(), HalfPixelCenters());
+
+  const std::array<int32_t, 2> size_data{{OutputHeight(), OutputWidth()}};
+
+  const std::array<flatbuffers::Offset<Buffer>, 2> buffers{{
+      CreateBuffer(builder, builder.CreateVector({})),
+      CreateBuffer(builder,
+                   builder.CreateVector(
+                       reinterpret_cast<const uint8_t*>(size_data.data()),
+                       size_data.size() * sizeof(int32_t))),
+  }};
+
+  const std::array<int32_t, 4> input_shape{
+      {BatchSize(), InputHeight(), InputWidth(), Channels()}};
+  const std::array<int32_t, 4> output_shape{
+      {BatchSize(), OutputHeight(), OutputWidth(), Channels()}};
+  const std::array<int32_t, 1> size_shape{
+      {static_cast<int32_t>(size_data.size())}};
+
+  const std::array<flatbuffers::Offset<Tensor>, 3> tensors{{
+      CreateTensor(
+          builder,
+          builder.CreateVector<int32_t>(input_shape.data(), input_shape.size()),
+          Unsigned() ? TensorType_UINT8 : TensorType_INT8,
+          /*buffer=*/0, /*name=*/0,
+          CreateQuantizationParameters(
+              builder, /*min=*/0, /*max=*/0,
+              builder.CreateVector<float>({Scale()}),
+              builder.CreateVector<int64_t>({ZeroPoint()}))),
+      CreateTensor(
+          builder,
+          builder.CreateVector<int32_t>(size_shape.data(), size_shape.size()),
+          TensorType_INT32, /*buffer=*/1),
+      CreateTensor(builder,
+                   builder.CreateVector<int32_t>(output_shape.data(),
+                                                 output_shape.size()),
+                   Unsigned() ? TensorType_UINT8 : TensorType_INT8,
+                   /*buffer=*/0, /*name=*/0,
+                   CreateQuantizationParameters(
+                       builder, /*min=*/0, /*max=*/0,
+                       builder.CreateVector<float>({Scale()}),
+                       builder.CreateVector<int64_t>({ZeroPoint()}))),
+  }};
+
+  const std::array<int32_t, 2> op_inputs{{0, 1}};
+  const std::array<int32_t, 1> op_outputs{{2}};
+  flatbuffers::Offset<Operator> op = CreateOperator(
+      builder, /*opcode_index=*/0,
+      builder.CreateVector<int32_t>(op_inputs.data(), op_inputs.size()),
+      builder.CreateVector<int32_t>(op_outputs.data(), op_outputs.size()),
+      BuiltinOptions_ResizeBilinearOptions, resize_bilinear_options.Union());
+
+  const std::array<int32_t, 1> subgraph_inputs{{0}};
+  const std::array<int32_t, 1> subgraph_outputs{{2}};
+  flatbuffers::Offset<SubGraph> subgraph = CreateSubGraph(
+      builder, builder.CreateVector(tensors.data(), tensors.size()),
+      builder.CreateVector<int32_t>(subgraph_inputs.data(),
+                                    subgraph_inputs.size()),
+      builder.CreateVector<int32_t>(subgraph_outputs.data(),
+                                    subgraph_outputs.size()),
+      builder.CreateVector(&op, 1));
+
+  flatbuffers::Offset<flatbuffers::String> description =
+      builder.CreateString("Quantized Resize Bilinear model");
+
+  flatbuffers::Offset<Model> model_buffer = CreateModel(
+      builder, TFLITE_SCHEMA_VERSION, builder.CreateVector(&operator_code, 1),
+      builder.CreateVector(&subgraph, 1), description,
+      builder.CreateVector(buffers.data(), buffers.size()));
+
+  builder.Finish(model_buffer);
+
+  return std::vector<char>(builder.GetBufferPointer(),
+                           builder.GetBufferPointer() + builder.GetSize());
+}
+
+}  // namespace xnnpack
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/quantized_resize_bilinear_tester.h b/tensorflow/lite/delegates/xnnpack/quantized_resize_bilinear_tester.h
new file mode 100644
index 00000000000000..6ac294dd130c3d
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/quantized_resize_bilinear_tester.h
@@ -0,0 +1,145 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_QUANTIZED_RESIZE_BILINEAR_TESTER_H_
+#define TENSORFLOW_LITE_DELEGATES_XNNPACK_QUANTIZED_RESIZE_BILINEAR_TESTER_H_
+
+#include <cstdint>
+#include <vector>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/c/common.h"
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+namespace xnnpack {
+
+class QuantizedResizeBilinearTester {
+ public:
+  QuantizedResizeBilinearTester() = default;
+  QuantizedResizeBilinearTester(const QuantizedResizeBilinearTester&) = delete;
+  QuantizedResizeBilinearTester& operator=(
+      const QuantizedResizeBilinearTester&) = delete;
+
+  inline QuantizedResizeBilinearTester& BatchSize(int32_t batch_size) {
+    EXPECT_GT(batch_size, 0);
+    batch_size_ = batch_size;
+    return *this;
+  }
+
+  inline int32_t BatchSize() const { return batch_size_; }
+
+  inline QuantizedResizeBilinearTester& Channels(int32_t channels) {
+    EXPECT_GT(channels, 0);
+    channels_ = channels;
+    return *this;
+  }
+
+  inline int32_t Channels() const { return channels_; }
+
+  inline QuantizedResizeBilinearTester& InputHeight(int32_t input_height) {
+    EXPECT_GT(input_height, 0);
+    input_height_ = input_height;
+    return *this;
+  }
+
+  inline int32_t InputHeight() const { return input_height_; }
+
+  inline QuantizedResizeBilinearTester& InputWidth(int32_t input_width) {
+    EXPECT_GT(input_width, 0);
+    input_width_ = input_width;
+    return *this;
+  }
+
+  inline int32_t InputWidth() const { return input_width_; }
+
+  inline QuantizedResizeBilinearTester& OutputHeight(int32_t output_height) {
+    EXPECT_GT(output_height, 0);
+    output_height_ = output_height;
+    return *this;
+  }
+
+  inline int32_t OutputHeight() const { return output_height_; }
+
+  inline QuantizedResizeBilinearTester& OutputWidth(int32_t output_width) {
+    EXPECT_GT(output_width, 0);
+    output_width_ = output_width;
+    return *this;
+  }
+
+  inline int32_t OutputWidth() const { return output_width_; }
+
+  QuantizedResizeBilinearTester& AlignCorners(bool align_corners) {
+    align_corners_ = align_corners;
+    return *this;
+  }
+
+  bool AlignCorners() const { return align_corners_; }
+
+  QuantizedResizeBilinearTester& HalfPixelCenters(bool half_pixel_centers) {
+    half_pixel_centers_ = half_pixel_centers;
+    return *this;
+  }
+
+  bool HalfPixelCenters() const { return half_pixel_centers_; }
+
+  inline QuantizedResizeBilinearTester& ZeroPoint(int32_t zero_point) {
+    zero_point_ = zero_point;
+    return *this;
+  }
+
+  inline int32_t ZeroPoint() const { return zero_point_; }
+
+  inline QuantizedResizeBilinearTester& Scale(float scale) {
+    scale_ = scale;
+    return *this;
+  }
+
+  inline float Scale() const { return scale_; }
+
+  inline QuantizedResizeBilinearTester& Unsigned(bool is_unsigned) {
+    unsigned_ = is_unsigned;
+    return *this;
+  }
+
+  inline bool Unsigned() const { return unsigned_; }
+
+  template <class T>
+  void Test(Interpreter* delegate_interpreter,
+            Interpreter* default_interpreter) const;
+
+  void Test(TfLiteDelegate* delegate) const;
+
+ private:
+  std::vector<char> CreateTfLiteModel() const;
+
+  int32_t batch_size_ = 1;
+  int32_t channels_ = 1;
+  int32_t input_height_ = 1;
+  int32_t input_width_ = 1;
+  int32_t output_height_ = 1;
+  int32_t output_width_ = 1;
+  bool align_corners_ = false;
+  bool half_pixel_centers_ = false;
+  int32_t zero_point_ = 2;
+  float scale_ = 0.75f;
+  bool unsigned_ = false;
+};
+
+}  // namespace xnnpack
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_DELEGATES_XNNPACK_QUANTIZED_RESIZE_BILINEAR_TESTER_H_
diff --git a/tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.cc b/tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.cc
index 7ad2c68ef5c3d6..66dd503cf7ee4f 100644
--- a/tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.cc
+++ b/tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.cc
@@ -259,7 +259,7 @@ std::vector<char> QuantizedTransposeConvTester::CreateTfLiteModel() const {
       builder, &tensors, &subgraph_inputs, &subgraph_outputs, &operators);
 
   flatbuffers::Offset<flatbuffers::String> description =
-      builder.CreateString("TransposeConv model");
+      builder.CreateString("Quantized TransposeConv model");
 
   flatbuffers::Offset<Model> model_buffer = CreateModel(
       builder, TFLITE_SCHEMA_VERSION,
diff --git a/tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h b/tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h
index af852a916d91b0..2146ba6f6b46a8 100644
--- a/tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h
+++ b/tensorflow/lite/delegates/xnnpack/quantized_transpose_conv_tester.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_TRANSPOSE_CONV_TESTER_H_
-#define TENSORFLOW_LITE_DELEGATES_XNNPACK_TRANSPOSE_CONV_TESTER_H_
+#ifndef TENSORFLOW_LITE_DELEGATES_XNNPACK_QUANTIZED_TRANSPOSE_CONV_TESTER_H_
+#define TENSORFLOW_LITE_DELEGATES_XNNPACK_QUANTIZED_TRANSPOSE_CONV_TESTER_H_
 
 #include <cstdint>
 #include <functional>
@@ -221,4 +221,4 @@ class QuantizedTransposeConvTester {
 }  // namespace xnnpack
 }  // namespace tflite
 
-#endif  // TENSORFLOW_LITE_DELEGATES_XNNPACK_TRANSPOSE_CONV_TESTER_H_
+#endif  // TENSORFLOW_LITE_DELEGATES_XNNPACK_QUANTIZED_TRANSPOSE_CONV_TESTER_H_
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_mean_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_mean_test.cc
new file mode 100644
index 00000000000000..6c5c85ac253a93
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_mean_test.cc
@@ -0,0 +1,501 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <random>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/quantized_reduce_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
+
+namespace tflite {
+namespace xnnpack {
+
+TEST(SignedQuantizedMean, DISABLED_4DReduceBatchSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({0})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_4DReduceBatchKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({0})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_4DReduceHeightSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({1})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_4DReduceHeightKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({1})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_4DReduceWidthSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({2})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_4DReduceWidthKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({2})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, 4DReduceHeightWidthSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({1, 2})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({2, 1})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, 4DReduceHeightWidthKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({1, 2})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({2, 1})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_4DReduceChannelsSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({3})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_4DReduceChannelsKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({3})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_3DReduceBatchSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, width, channels})
+      .Axes({0})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_3DReduceBatchKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, width, channels})
+      .Axes({0})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_3DReduceWidthSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, width, channels})
+      .Axes({1})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_3DReduceWidthKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, width, channels})
+      .Axes({1})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_3DReduceChannelsSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, width, channels})
+      .Axes({2})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_3DReduceChannelsKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, width, channels})
+      .Axes({2})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_2DReduceBatchSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, channels})
+      .Axes({0})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_2DReduceBatchKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, channels})
+      .Axes({0})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_2DReduceChannelsSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, channels})
+      .Axes({1})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_2DReduceChannelsKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, channels})
+      .Axes({1})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_1DSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+
+  QuantizedReduceTester().InputShape({batch}).Axes({0}).KeepDims(false).Test(
+      BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, DISABLED_1DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+
+  QuantizedReduceTester().InputShape({batch}).Axes({0}).KeepDims(true).Test(
+      BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedMean, MultiThreading) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.num_threads = 2;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .InputShape({batch, height, width, channels})
+      .Axes({1, 2})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+}  // namespace xnnpack
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/signed_quantized_resize_bilinear_test.cc b/tensorflow/lite/delegates/xnnpack/signed_quantized_resize_bilinear_test.cc
new file mode 100644
index 00000000000000..8c77ba185f552a
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/signed_quantized_resize_bilinear_test.cc
@@ -0,0 +1,119 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <random>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/quantized_resize_bilinear_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
+
+namespace tflite {
+namespace xnnpack {
+
+TEST(SignedQuantizedResizeBilinear, AlignCenters) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto size_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 10), std::ref(rng));
+  auto channel_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
+
+  QuantizedResizeBilinearTester()
+      .HalfPixelCenters(true)
+      .InputHeight(size_rng())
+      .InputWidth(size_rng())
+      .OutputHeight(size_rng())
+      .OutputWidth(size_rng())
+      .Channels(channel_rng())
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedResizeBilinear, AlignCentersTF1X) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto size_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 10), std::ref(rng));
+  auto channel_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
+
+  QuantizedResizeBilinearTester()
+      .InputHeight(size_rng())
+      .InputWidth(size_rng())
+      .OutputHeight(size_rng())
+      .OutputWidth(size_rng())
+      .Channels(channel_rng())
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedResizeBilinear, AlignCorners) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto size_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 10), std::ref(rng));
+  auto channel_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
+
+  QuantizedResizeBilinearTester()
+      .AlignCorners(true)
+      .InputHeight(size_rng())
+      .InputWidth(size_rng())
+      .OutputHeight(size_rng())
+      .OutputWidth(size_rng())
+      .Channels(channel_rng())
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(SignedQuantizedResizeBilinear, MultiThreading) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.num_threads = 2;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto size_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 10), std::ref(rng));
+  auto channel_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
+
+  QuantizedResizeBilinearTester()
+      .InputHeight(size_rng())
+      .InputWidth(size_rng())
+      .OutputHeight(size_rng())
+      .OutputWidth(size_rng())
+      .Channels(channel_rng())
+      .Test(xnnpack_delegate.get());
+}
+
+}  // namespace xnnpack
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_mean_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_mean_test.cc
new file mode 100644
index 00000000000000..01fc70756e106c
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_mean_test.cc
@@ -0,0 +1,528 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <random>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/quantized_reduce_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
+
+namespace tflite {
+namespace xnnpack {
+
+TEST(UnsignedQuantizedMean, DISABLED_4DReduceBatchSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({0})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_4DReduceBatchKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({0})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_4DReduceHeightSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({1})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_4DReduceHeightKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({1})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_4DReduceWidthSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({2})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_4DReduceWidthKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({2})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, 4DReduceHeightWidthSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({1, 2})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({2, 1})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, 4DReduceHeightWidthKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({1, 2})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({2, 1})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_4DReduceChannelsSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({3})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_4DReduceChannelsKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({3})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_3DReduceBatchSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, width, channels})
+      .Axes({0})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_3DReduceBatchKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, width, channels})
+      .Axes({0})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_3DReduceWidthSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, width, channels})
+      .Axes({1})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_3DReduceWidthKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, width, channels})
+      .Axes({1})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_3DReduceChannelsSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, width, channels})
+      .Axes({2})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_3DReduceChannelsKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, width, channels})
+      .Axes({2})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_2DReduceBatchSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, channels})
+      .Axes({0})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_2DReduceBatchKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, channels})
+      .Axes({0})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_2DReduceChannelsSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, channels})
+      .Axes({1})
+      .KeepDims(false)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_2DReduceChannelsKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, channels})
+      .Axes({1})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_1DSqueezeDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+
+  QuantizedReduceTester().InputShape({batch}).Axes({0}).KeepDims(false).Test(
+      BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, DISABLED_1DKeepDims) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch})
+      .Axes({0})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedMean, MultiThreading) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.num_threads = 2;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto shape_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 5), std::ref(rng));
+  const auto batch = shape_rng();
+  const auto height = shape_rng();
+  const auto width = shape_rng();
+  const auto channels = shape_rng();
+
+  QuantizedReduceTester()
+      .Unsigned(true)
+      .InputShape({batch, height, width, channels})
+      .Axes({1, 2})
+      .KeepDims(true)
+      .Test(BuiltinOperator_MEAN, xnnpack_delegate.get());
+}
+
+}  // namespace xnnpack
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/unsigned_quantized_resize_bilinear_test.cc b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_resize_bilinear_test.cc
new file mode 100644
index 00000000000000..aea92fd94bd892
--- /dev/null
+++ b/tensorflow/lite/delegates/xnnpack/unsigned_quantized_resize_bilinear_test.cc
@@ -0,0 +1,123 @@
+/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+#include <cstdint>
+#include <functional>
+#include <memory>
+#include <random>
+
+#include <gtest/gtest.h>
+#include "tensorflow/lite/delegates/xnnpack/quantized_resize_bilinear_tester.h"
+#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h"
+
+namespace tflite {
+namespace xnnpack {
+
+TEST(UnsignedQuantizedResizeBilinear, AlignCenters) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto size_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 10), std::ref(rng));
+  auto channel_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
+
+  QuantizedResizeBilinearTester()
+      .Unsigned(true)
+      .HalfPixelCenters(true)
+      .InputHeight(size_rng())
+      .InputWidth(size_rng())
+      .OutputHeight(size_rng())
+      .OutputWidth(size_rng())
+      .Channels(channel_rng())
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedResizeBilinear, AlignCentersTF1X) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto size_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 10), std::ref(rng));
+  auto channel_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
+
+  QuantizedResizeBilinearTester()
+      .Unsigned(true)
+      .InputHeight(size_rng())
+      .InputWidth(size_rng())
+      .OutputHeight(size_rng())
+      .OutputWidth(size_rng())
+      .Channels(channel_rng())
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedResizeBilinear, AlignCorners) {
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(nullptr),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto size_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 10), std::ref(rng));
+  auto channel_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
+
+  QuantizedResizeBilinearTester()
+      .Unsigned(true)
+      .AlignCorners(true)
+      .InputHeight(size_rng())
+      .InputWidth(size_rng())
+      .OutputHeight(size_rng())
+      .OutputWidth(size_rng())
+      .Channels(channel_rng())
+      .Test(xnnpack_delegate.get());
+}
+
+TEST(UnsignedQuantizedResizeBilinear, MultiThreading) {
+  TfLiteXNNPackDelegateOptions delegate_options =
+      TfLiteXNNPackDelegateOptionsDefault();
+  delegate_options.num_threads = 2;
+  std::unique_ptr<TfLiteDelegate, decltype(&TfLiteXNNPackDelegateDelete)>
+      xnnpack_delegate(TfLiteXNNPackDelegateCreate(&delegate_options),
+                       TfLiteXNNPackDelegateDelete);
+
+  std::random_device random_device;
+  auto rng = std::mt19937(random_device());
+  auto size_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 10), std::ref(rng));
+  auto channel_rng =
+      std::bind(std::uniform_int_distribution<int32_t>(2, 16), std::ref(rng));
+
+  QuantizedResizeBilinearTester()
+      .Unsigned(true)
+      .InputHeight(size_rng())
+      .InputWidth(size_rng())
+      .OutputHeight(size_rng())
+      .OutputWidth(size_rng())
+      .Channels(channel_rng())
+      .Test(xnnpack_delegate.get());
+}
+
+}  // namespace xnnpack
+}  // namespace tflite
diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
index edef1964a7f83c..547a138a554847 100644
--- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
+++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc
@@ -2587,7 +2587,7 @@ class Subgraph {
         CheckNumInputsAndOutputs(logging_context, node, 2, 1, node_index));
 
     const TfLiteTensor& input_tensor = tensors[node->inputs->data[0]];
-    TF_LITE_ENSURE_STATUS(CheckTensorFloat32Type(
+    TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrQUInt8Type(
         logging_context, input_tensor, node->inputs->data[0], node_index));
     TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, input_tensor, 4,
                                            node->inputs->data[0]));
@@ -2625,7 +2625,7 @@ class Subgraph {
     }
 
     const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]];
-    TF_LITE_ENSURE_STATUS(CheckTensorFloat32Type(
+    TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrQUInt8Type(
         logging_context, output_tensor, node->outputs->data[0], node_index));
     const int expected_output_dims = reducer_params->keep_dims ? 4 : 2;
     TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, output_tensor,
@@ -3266,7 +3266,7 @@ class Subgraph {
         CheckNumInputsAndOutputs(logging_context, node, 2, 1, node_index));
 
     const TfLiteTensor& input_tensor = tensors[node->inputs->data[0]];
-    TF_LITE_ENSURE_STATUS(CheckTensorFloat32Type(
+    TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrQUInt8Type(
         logging_context, input_tensor, node->inputs->data[0], node_index));
     TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, input_tensor, 4,
                                            node->inputs->data[0]));
@@ -3289,7 +3289,7 @@ class Subgraph {
         logging_context, shape_tensor, node->inputs->data[1], node_index));
 
     const TfLiteTensor& output_tensor = tensors[node->outputs->data[0]];
-    TF_LITE_ENSURE_STATUS(CheckTensorFloat32Type(
+    TF_LITE_ENSURE_STATUS(CheckTensorFloat32OrQUInt8Type(
         logging_context, output_tensor, node->outputs->data[0], node_index));
     TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, output_tensor, 4,
                                            node->outputs->data[0]));
diff --git a/tensorflow/lite/experimental/acceleration/configuration/configuration.proto b/tensorflow/lite/experimental/acceleration/configuration/configuration.proto
index ff8afc75a26271..ac81be792e615c 100644
--- a/tensorflow/lite/experimental/acceleration/configuration/configuration.proto
+++ b/tensorflow/lite/experimental/acceleration/configuration/configuration.proto
@@ -332,6 +332,12 @@ message EdgeTpuSettings {
     HALF = 3;
   }
 
+  enum QosClass {
+    QOS_UNDEFINED = 0;
+    BEST_EFFORT = 1;
+    REALTIME = 2;
+  }
+
   // Target inference power state for running the model.
   optional EdgeTpuPowerState inference_power_state = 1;
 
@@ -349,6 +355,9 @@ message EdgeTpuSettings {
 
   // Float truncation type for EdgeTPU.
   optional FloatTruncationType float_truncation_type = 6;
+
+  // QoS class to determine chunking size for PRO onward.
+  optional QosClass qos_class = 7 [default = QOS_UNDEFINED];
 }
 
 // Coral Dev Board / USB accelerator delegate settings.
diff --git a/tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h b/tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h
index f2921955d3c4c7..c5340592d132ee 100644
--- a/tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h
+++ b/tensorflow/lite/experimental/acceleration/configuration/configuration_generated.h
@@ -516,6 +516,39 @@ inline const char *EnumNameFloatTruncationType(FloatTruncationType e) {
   return EnumNamesFloatTruncationType()[index];
 }
 
+enum QosClass {
+  QosClass_QOS_UNDEFINED = 0,
+  QosClass_BEST_EFFORT = 1,
+  QosClass_REALTIME = 2,
+  QosClass_MIN = QosClass_QOS_UNDEFINED,
+  QosClass_MAX = QosClass_REALTIME
+};
+
+inline const QosClass (&EnumValuesQosClass())[3] {
+  static const QosClass values[] = {
+    QosClass_QOS_UNDEFINED,
+    QosClass_BEST_EFFORT,
+    QosClass_REALTIME
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesQosClass() {
+  static const char * const names[4] = {
+    "QOS_UNDEFINED",
+    "BEST_EFFORT",
+    "REALTIME",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameQosClass(QosClass e) {
+  if (flatbuffers::IsOutRange(e, QosClass_QOS_UNDEFINED, QosClass_REALTIME)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesQosClass()[index];
+}
+
 }  // namespace EdgeTpuSettings_
 
 namespace CoralSettings_ {
@@ -774,12 +807,12 @@ struct NNAPISettingsT : public flatbuffers::NativeTable {
       : execution_preference(tflite::NNAPIExecutionPreference_UNDEFINED),
         no_of_nnapi_instances_to_cache(0),
         allow_nnapi_cpu_on_android_10_plus(false),
-        execution_priority(
-            tflite::NNAPIExecutionPriority_NNAPI_PRIORITY_UNDEFINED),
+        execution_priority(tflite::NNAPIExecutionPriority_NNAPI_PRIORITY_UNDEFINED),
         allow_dynamic_dimensions(false),
         allow_fp16_precision_for_fp32(false),
         use_burst_computation(false),
-        support_library_handle(0) {}
+        support_library_handle(0) {
+  }
 };
 
 struct NNAPISettings FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
@@ -846,8 +879,7 @@ struct NNAPISettings FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyField<int32_t>(verifier, VT_NO_OF_NNAPI_INSTANCES_TO_CACHE) &&
            VerifyOffset(verifier, VT_FALLBACK_SETTINGS) &&
            verifier.VerifyTable(fallback_settings()) &&
-           VerifyField<uint8_t>(verifier,
-                                VT_ALLOW_NNAPI_CPU_ON_ANDROID_10_PLUS) &&
+           VerifyField<uint8_t>(verifier, VT_ALLOW_NNAPI_CPU_ON_ANDROID_10_PLUS) &&
            VerifyField<int32_t>(verifier, VT_EXECUTION_PRIORITY) &&
            VerifyField<uint8_t>(verifier, VT_ALLOW_DYNAMIC_DIMENSIONS) &&
            VerifyField<uint8_t>(verifier, VT_ALLOW_FP16_PRECISION_FOR_FP32) &&
@@ -897,8 +929,7 @@ struct NNAPISettingsBuilder {
     fbb_.AddElement<uint8_t>(NNAPISettings::VT_USE_BURST_COMPUTATION, static_cast<uint8_t>(use_burst_computation), 0);
   }
   void add_support_library_handle(int64_t support_library_handle) {
-    fbb_.AddElement<int64_t>(NNAPISettings::VT_SUPPORT_LIBRARY_HANDLE,
-                             support_library_handle, 0);
+    fbb_.AddElement<int64_t>(NNAPISettings::VT_SUPPORT_LIBRARY_HANDLE, support_library_handle, 0);
   }
   explicit NNAPISettingsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
@@ -917,16 +948,15 @@ inline flatbuffers::Offset<NNAPISettings> CreateNNAPISettings(
     flatbuffers::Offset<flatbuffers::String> accelerator_name = 0,
     flatbuffers::Offset<flatbuffers::String> cache_directory = 0,
     flatbuffers::Offset<flatbuffers::String> model_token = 0,
-    tflite::NNAPIExecutionPreference execution_preference =
-        tflite::NNAPIExecutionPreference_UNDEFINED,
+    tflite::NNAPIExecutionPreference execution_preference = tflite::NNAPIExecutionPreference_UNDEFINED,
     int32_t no_of_nnapi_instances_to_cache = 0,
     flatbuffers::Offset<tflite::FallbackSettings> fallback_settings = 0,
     bool allow_nnapi_cpu_on_android_10_plus = false,
-    tflite::NNAPIExecutionPriority execution_priority =
-        tflite::NNAPIExecutionPriority_NNAPI_PRIORITY_UNDEFINED,
+    tflite::NNAPIExecutionPriority execution_priority = tflite::NNAPIExecutionPriority_NNAPI_PRIORITY_UNDEFINED,
     bool allow_dynamic_dimensions = false,
     bool allow_fp16_precision_for_fp32 = false,
-    bool use_burst_computation = false, int64_t support_library_handle = 0) {
+    bool use_burst_computation = false,
+    int64_t support_library_handle = 0) {
   NNAPISettingsBuilder builder_(_fbb);
   builder_.add_support_library_handle(support_library_handle);
   builder_.add_execution_priority(execution_priority);
@@ -946,26 +976,34 @@ inline flatbuffers::Offset<NNAPISettings> CreateNNAPISettings(
 inline flatbuffers::Offset<NNAPISettings> CreateNNAPISettingsDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
     const char *accelerator_name = nullptr,
-    const char *cache_directory = nullptr, const char *model_token = nullptr,
-    tflite::NNAPIExecutionPreference execution_preference =
-        tflite::NNAPIExecutionPreference_UNDEFINED,
+    const char *cache_directory = nullptr,
+    const char *model_token = nullptr,
+    tflite::NNAPIExecutionPreference execution_preference = tflite::NNAPIExecutionPreference_UNDEFINED,
     int32_t no_of_nnapi_instances_to_cache = 0,
     flatbuffers::Offset<tflite::FallbackSettings> fallback_settings = 0,
     bool allow_nnapi_cpu_on_android_10_plus = false,
-    tflite::NNAPIExecutionPriority execution_priority =
-        tflite::NNAPIExecutionPriority_NNAPI_PRIORITY_UNDEFINED,
+    tflite::NNAPIExecutionPriority execution_priority = tflite::NNAPIExecutionPriority_NNAPI_PRIORITY_UNDEFINED,
     bool allow_dynamic_dimensions = false,
     bool allow_fp16_precision_for_fp32 = false,
-    bool use_burst_computation = false, int64_t support_library_handle = 0) {
+    bool use_burst_computation = false,
+    int64_t support_library_handle = 0) {
   auto accelerator_name__ = accelerator_name ? _fbb.CreateString(accelerator_name) : 0;
   auto cache_directory__ = cache_directory ? _fbb.CreateString(cache_directory) : 0;
   auto model_token__ = model_token ? _fbb.CreateString(model_token) : 0;
   return tflite::CreateNNAPISettings(
-      _fbb, accelerator_name__, cache_directory__, model_token__,
-      execution_preference, no_of_nnapi_instances_to_cache, fallback_settings,
-      allow_nnapi_cpu_on_android_10_plus, execution_priority,
-      allow_dynamic_dimensions, allow_fp16_precision_for_fp32,
-      use_burst_computation, support_library_handle);
+      _fbb,
+      accelerator_name__,
+      cache_directory__,
+      model_token__,
+      execution_preference,
+      no_of_nnapi_instances_to_cache,
+      fallback_settings,
+      allow_nnapi_cpu_on_android_10_plus,
+      execution_priority,
+      allow_dynamic_dimensions,
+      allow_fp16_precision_for_fp32,
+      use_burst_computation,
+      support_library_handle);
 }
 
 flatbuffers::Offset<NNAPISettings> CreateNNAPISettings(flatbuffers::FlatBufferBuilder &_fbb, const NNAPISettingsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -1470,10 +1508,12 @@ struct EdgeTpuSettingsT : public flatbuffers::NativeTable {
   std::unique_ptr<tflite::EdgeTpuDeviceSpecT> edgetpu_device_spec;
   std::string model_token;
   tflite::EdgeTpuSettings_::FloatTruncationType float_truncation_type;
+  tflite::EdgeTpuSettings_::QosClass qos_class;
   EdgeTpuSettingsT()
       : inference_power_state(tflite::EdgeTpuPowerState_UNDEFINED_POWERSTATE),
         inference_priority(-1),
-        float_truncation_type(tflite::EdgeTpuSettings_::FloatTruncationType_UNSPECIFIED) {
+        float_truncation_type(tflite::EdgeTpuSettings_::FloatTruncationType_UNSPECIFIED),
+        qos_class(tflite::EdgeTpuSettings_::QosClass_QOS_UNDEFINED) {
   }
 };
 
@@ -1485,7 +1525,8 @@ struct EdgeTpuSettings FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
     VT_INFERENCE_PRIORITY = 8,
     VT_EDGETPU_DEVICE_SPEC = 10,
     VT_MODEL_TOKEN = 12,
-    VT_FLOAT_TRUNCATION_TYPE = 14
+    VT_FLOAT_TRUNCATION_TYPE = 14,
+    VT_QOS_CLASS = 16
   };
   tflite::EdgeTpuPowerState inference_power_state() const {
     return static_cast<tflite::EdgeTpuPowerState>(GetField<int32_t>(VT_INFERENCE_POWER_STATE, 0));
@@ -1505,6 +1546,9 @@ struct EdgeTpuSettings FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   tflite::EdgeTpuSettings_::FloatTruncationType float_truncation_type() const {
     return static_cast<tflite::EdgeTpuSettings_::FloatTruncationType>(GetField<int32_t>(VT_FLOAT_TRUNCATION_TYPE, 0));
   }
+  tflite::EdgeTpuSettings_::QosClass qos_class() const {
+    return static_cast<tflite::EdgeTpuSettings_::QosClass>(GetField<int32_t>(VT_QOS_CLASS, 0));
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int32_t>(verifier, VT_INFERENCE_POWER_STATE) &&
@@ -1517,6 +1561,7 @@ struct EdgeTpuSettings FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
            VerifyOffset(verifier, VT_MODEL_TOKEN) &&
            verifier.VerifyString(model_token()) &&
            VerifyField<int32_t>(verifier, VT_FLOAT_TRUNCATION_TYPE) &&
+           VerifyField<int32_t>(verifier, VT_QOS_CLASS) &&
            verifier.EndTable();
   }
   EdgeTpuSettingsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -1545,6 +1590,9 @@ struct EdgeTpuSettingsBuilder {
   void add_float_truncation_type(tflite::EdgeTpuSettings_::FloatTruncationType float_truncation_type) {
     fbb_.AddElement<int32_t>(EdgeTpuSettings::VT_FLOAT_TRUNCATION_TYPE, static_cast<int32_t>(float_truncation_type), 0);
   }
+  void add_qos_class(tflite::EdgeTpuSettings_::QosClass qos_class) {
+    fbb_.AddElement<int32_t>(EdgeTpuSettings::VT_QOS_CLASS, static_cast<int32_t>(qos_class), 0);
+  }
   explicit EdgeTpuSettingsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -1564,8 +1612,10 @@ inline flatbuffers::Offset<EdgeTpuSettings> CreateEdgeTpuSettings(
     int32_t inference_priority = -1,
     flatbuffers::Offset<tflite::EdgeTpuDeviceSpec> edgetpu_device_spec = 0,
     flatbuffers::Offset<flatbuffers::String> model_token = 0,
-    tflite::EdgeTpuSettings_::FloatTruncationType float_truncation_type = tflite::EdgeTpuSettings_::FloatTruncationType_UNSPECIFIED) {
+    tflite::EdgeTpuSettings_::FloatTruncationType float_truncation_type = tflite::EdgeTpuSettings_::FloatTruncationType_UNSPECIFIED,
+    tflite::EdgeTpuSettings_::QosClass qos_class = tflite::EdgeTpuSettings_::QosClass_QOS_UNDEFINED) {
   EdgeTpuSettingsBuilder builder_(_fbb);
+  builder_.add_qos_class(qos_class);
   builder_.add_float_truncation_type(float_truncation_type);
   builder_.add_model_token(model_token);
   builder_.add_edgetpu_device_spec(edgetpu_device_spec);
@@ -1582,7 +1632,8 @@ inline flatbuffers::Offset<EdgeTpuSettings> CreateEdgeTpuSettingsDirect(
     int32_t inference_priority = -1,
     flatbuffers::Offset<tflite::EdgeTpuDeviceSpec> edgetpu_device_spec = 0,
     const char *model_token = nullptr,
-    tflite::EdgeTpuSettings_::FloatTruncationType float_truncation_type = tflite::EdgeTpuSettings_::FloatTruncationType_UNSPECIFIED) {
+    tflite::EdgeTpuSettings_::FloatTruncationType float_truncation_type = tflite::EdgeTpuSettings_::FloatTruncationType_UNSPECIFIED,
+    tflite::EdgeTpuSettings_::QosClass qos_class = tflite::EdgeTpuSettings_::QosClass_QOS_UNDEFINED) {
   auto inactive_power_configs__ = inactive_power_configs ? _fbb.CreateVector<flatbuffers::Offset<tflite::EdgeTpuInactivePowerConfig>>(*inactive_power_configs) : 0;
   auto model_token__ = model_token ? _fbb.CreateString(model_token) : 0;
   return tflite::CreateEdgeTpuSettings(
@@ -1592,7 +1643,8 @@ inline flatbuffers::Offset<EdgeTpuSettings> CreateEdgeTpuSettingsDirect(
       inference_priority,
       edgetpu_device_spec,
       model_token__,
-      float_truncation_type);
+      float_truncation_type,
+      qos_class);
 }
 
 flatbuffers::Offset<EdgeTpuSettings> CreateEdgeTpuSettings(flatbuffers::FlatBufferBuilder &_fbb, const EdgeTpuSettingsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -3048,23 +3100,19 @@ inline flatbuffers::Offset<ComputeSettings> CreateComputeSettings(flatbuffers::F
 
 
 inline bool operator==(const NNAPISettingsT &lhs, const NNAPISettingsT &rhs) {
-  return (lhs.accelerator_name == rhs.accelerator_name) &&
-         (lhs.cache_directory == rhs.cache_directory) &&
-         (lhs.model_token == rhs.model_token) &&
-         (lhs.execution_preference == rhs.execution_preference) &&
-         (lhs.no_of_nnapi_instances_to_cache ==
-          rhs.no_of_nnapi_instances_to_cache) &&
-         ((lhs.fallback_settings == rhs.fallback_settings) ||
-          (lhs.fallback_settings && rhs.fallback_settings &&
-           *lhs.fallback_settings == *rhs.fallback_settings)) &&
-         (lhs.allow_nnapi_cpu_on_android_10_plus ==
-          rhs.allow_nnapi_cpu_on_android_10_plus) &&
-         (lhs.execution_priority == rhs.execution_priority) &&
-         (lhs.allow_dynamic_dimensions == rhs.allow_dynamic_dimensions) &&
-         (lhs.allow_fp16_precision_for_fp32 ==
-          rhs.allow_fp16_precision_for_fp32) &&
-         (lhs.use_burst_computation == rhs.use_burst_computation) &&
-         (lhs.support_library_handle == rhs.support_library_handle);
+  return
+      (lhs.accelerator_name == rhs.accelerator_name) &&
+      (lhs.cache_directory == rhs.cache_directory) &&
+      (lhs.model_token == rhs.model_token) &&
+      (lhs.execution_preference == rhs.execution_preference) &&
+      (lhs.no_of_nnapi_instances_to_cache == rhs.no_of_nnapi_instances_to_cache) &&
+      ((lhs.fallback_settings == rhs.fallback_settings) || (lhs.fallback_settings && rhs.fallback_settings && *lhs.fallback_settings == *rhs.fallback_settings)) &&
+      (lhs.allow_nnapi_cpu_on_android_10_plus == rhs.allow_nnapi_cpu_on_android_10_plus) &&
+      (lhs.execution_priority == rhs.execution_priority) &&
+      (lhs.allow_dynamic_dimensions == rhs.allow_dynamic_dimensions) &&
+      (lhs.allow_fp16_precision_for_fp32 == rhs.allow_fp16_precision_for_fp32) &&
+      (lhs.use_burst_computation == rhs.use_burst_computation) &&
+      (lhs.support_library_handle == rhs.support_library_handle);
 }
 
 inline bool operator!=(const NNAPISettingsT &lhs, const NNAPISettingsT &rhs) {
@@ -3092,10 +3140,7 @@ inline void NNAPISettings::UnPackTo(NNAPISettingsT *_o, const flatbuffers::resol
   { auto _e = allow_dynamic_dimensions(); _o->allow_dynamic_dimensions = _e; }
   { auto _e = allow_fp16_precision_for_fp32(); _o->allow_fp16_precision_for_fp32 = _e; }
   { auto _e = use_burst_computation(); _o->use_burst_computation = _e; }
-  {
-    auto _e = support_library_handle();
-    _o->support_library_handle = _e;
-  }
+  { auto _e = support_library_handle(); _o->support_library_handle = _e; }
 }
 
 inline flatbuffers::Offset<NNAPISettings> NNAPISettings::Pack(flatbuffers::FlatBufferBuilder &_fbb, const NNAPISettingsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -3119,11 +3164,18 @@ inline flatbuffers::Offset<NNAPISettings> CreateNNAPISettings(flatbuffers::FlatB
   auto _use_burst_computation = _o->use_burst_computation;
   auto _support_library_handle = _o->support_library_handle;
   return tflite::CreateNNAPISettings(
-      _fbb, _accelerator_name, _cache_directory, _model_token,
-      _execution_preference, _no_of_nnapi_instances_to_cache,
-      _fallback_settings, _allow_nnapi_cpu_on_android_10_plus,
-      _execution_priority, _allow_dynamic_dimensions,
-      _allow_fp16_precision_for_fp32, _use_burst_computation,
+      _fbb,
+      _accelerator_name,
+      _cache_directory,
+      _model_token,
+      _execution_preference,
+      _no_of_nnapi_instances_to_cache,
+      _fallback_settings,
+      _allow_nnapi_cpu_on_android_10_plus,
+      _execution_priority,
+      _allow_dynamic_dimensions,
+      _allow_fp16_precision_for_fp32,
+      _use_burst_computation,
       _support_library_handle);
 }
 
@@ -3380,7 +3432,8 @@ inline bool operator==(const EdgeTpuSettingsT &lhs, const EdgeTpuSettingsT &rhs)
       (lhs.inference_priority == rhs.inference_priority) &&
       ((lhs.edgetpu_device_spec == rhs.edgetpu_device_spec) || (lhs.edgetpu_device_spec && rhs.edgetpu_device_spec && *lhs.edgetpu_device_spec == *rhs.edgetpu_device_spec)) &&
       (lhs.model_token == rhs.model_token) &&
-      (lhs.float_truncation_type == rhs.float_truncation_type);
+      (lhs.float_truncation_type == rhs.float_truncation_type) &&
+      (lhs.qos_class == rhs.qos_class);
 }
 
 inline bool operator!=(const EdgeTpuSettingsT &lhs, const EdgeTpuSettingsT &rhs) {
@@ -3403,6 +3456,7 @@ inline void EdgeTpuSettings::UnPackTo(EdgeTpuSettingsT *_o, const flatbuffers::r
   { auto _e = edgetpu_device_spec(); if (_e) _o->edgetpu_device_spec = std::unique_ptr<tflite::EdgeTpuDeviceSpecT>(_e->UnPack(_resolver)); }
   { auto _e = model_token(); if (_e) _o->model_token = _e->str(); }
   { auto _e = float_truncation_type(); _o->float_truncation_type = _e; }
+  { auto _e = qos_class(); _o->qos_class = _e; }
 }
 
 inline flatbuffers::Offset<EdgeTpuSettings> EdgeTpuSettings::Pack(flatbuffers::FlatBufferBuilder &_fbb, const EdgeTpuSettingsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -3419,6 +3473,7 @@ inline flatbuffers::Offset<EdgeTpuSettings> CreateEdgeTpuSettings(flatbuffers::F
   auto _edgetpu_device_spec = _o->edgetpu_device_spec ? CreateEdgeTpuDeviceSpec(_fbb, _o->edgetpu_device_spec.get(), _rehasher) : 0;
   auto _model_token = _o->model_token.empty() ? 0 : _fbb.CreateString(_o->model_token);
   auto _float_truncation_type = _o->float_truncation_type;
+  auto _qos_class = _o->qos_class;
   return tflite::CreateEdgeTpuSettings(
       _fbb,
       _inference_power_state,
@@ -3426,7 +3481,8 @@ inline flatbuffers::Offset<EdgeTpuSettings> CreateEdgeTpuSettings(flatbuffers::F
       _inference_priority,
       _edgetpu_device_spec,
       _model_token,
-      _float_truncation_type);
+      _float_truncation_type,
+      _qos_class);
 }
 
 
diff --git a/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc
index 9a22ad7ebcb28c..2d9020eab9d889 100644
--- a/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc
+++ b/tensorflow/lite/experimental/acceleration/configuration/proto_to_flatbuffer.cc
@@ -298,7 +298,8 @@ Offset<EdgeTpuSettings> ConvertEdgeTpuSettings(
       inactive_power_configs, settings.inference_priority(),
       edgetpu_device_spec, model_token,
       static_cast<tflite::EdgeTpuSettings_::FloatTruncationType>(
-          settings.float_truncation_type()));
+          settings.float_truncation_type()),
+      static_cast<tflite::EdgeTpuSettings_::QosClass>(settings.qos_class()));
 }
 
 Offset<CoralSettings> ConvertCoralSettings(const proto::CoralSettings& settings,
diff --git a/tensorflow/lite/experimental/acceleration/mini_benchmark/set_big_core_affinity.cc b/tensorflow/lite/experimental/acceleration/mini_benchmark/set_big_core_affinity.cc
index 83904fbbfb70d8..3930245233cd03 100644
--- a/tensorflow/lite/experimental/acceleration/mini_benchmark/set_big_core_affinity.cc
+++ b/tensorflow/lite/experimental/acceleration/mini_benchmark/set_big_core_affinity.cc
@@ -25,6 +25,7 @@ namespace tflite {
 namespace acceleration {
 
 int32_t SetBigCoresAffinity() {
+#ifdef __ANDROID__
   ::tflite::acceleration::BigLittleAffinity affinity =
       ::tflite::acceleration::GetAffinity();
 
@@ -35,11 +36,14 @@ int32_t SetBigCoresAffinity() {
       CPU_SET(i, &set);
     }
   }
-  if (sched_setaffinity(getpid(), sizeof(set), &set) == -1) {
+  if (sched_setaffinity(getpid(), sizeof(set), &set) != -1) {
     return 0;
   } else {
     return errno;
   }
+#else  // !__ANDROID__
+  return 0;
+#endif
 }
 
 }  // namespace acceleration
diff --git a/tensorflow/lite/g3doc/examples/on_device_training/overview.ipynb b/tensorflow/lite/g3doc/examples/on_device_training/overview.ipynb
index f86c7ac34623d9..dca71fdfa038a9 100644
--- a/tensorflow/lite/g3doc/examples/on_device_training/overview.ipynb
+++ b/tensorflow/lite/g3doc/examples/on_device_training/overview.ipynb
@@ -92,8 +92,7 @@
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "9j4MGqyKQEo4",
-        "outputId": "efe14d2f-ed36-483a-aee6-f3d8d3d5edee"
+        "id": "9j4MGqyKQEo4"
       },
       "outputs": [
         {
@@ -174,15 +173,12 @@
         "    self.model = tf.keras.Sequential([\n",
         "        tf.keras.layers.Flatten(input_shape=(IMG_SIZE, IMG_SIZE), name='flatten'),\n",
         "        tf.keras.layers.Dense(128, activation='relu', name='dense_1'),\n",
-        "        tf.keras.layers.Dense(10, activation='softmax', name='dense_2')\n",
+        "        tf.keras.layers.Dense(10, name='dense_2')\n",
         "    ])\n",
         "\n",
         "    self.model.compile(\n",
         "        optimizer='sgd',\n",
-        "        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),\n",
-        "        metrics=['accuracy'])\n",
-        "    self._LOSS_FN = tf.keras.losses.CategoricalCrossentropy()\n",
-        "    self._OPTIM = tf.optimizers.SGD()\n",
+        "        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True))\n",
         "\n",
         "  # The `train` function takes a batch of input images and labels.\n",
         "  @tf.function(input_signature=[\n",
@@ -192,21 +188,22 @@
         "  def train(self, x, y):\n",
         "    with tf.GradientTape() as tape:\n",
         "      prediction = self.model(x)\n",
-        "      loss = self._LOSS_FN(prediction, y)\n",
+        "      loss = self.model.loss(y, prediction)\n",
         "    gradients = tape.gradient(loss, self.model.trainable_variables)\n",
-        "    self._OPTIM.apply_gradients(\n",
+        "    self.model.optimizer.apply_gradients(\n",
         "        zip(gradients, self.model.trainable_variables))\n",
         "    result = {\"loss\": loss}\n",
-        "    for grad in gradients:\n",
-        "      result[grad.name] = grad\n",
         "    return result\n",
         "\n",
         "  @tf.function(input_signature=[\n",
         "      tf.TensorSpec([None, IMG_SIZE, IMG_SIZE], tf.float32),\n",
         "  ])\n",
         "  def infer(self, x):\n",
+        "    logits = self.model(x)\n",
+        "    probabilities = tf.nn.softmax(logits, axis=-1)\n",
         "    return {\n",
-        "        \"output\": self.model(x)\n",
+        "        \"output\": probabilities,\n",
+        "        \"logits\": logits\n",
         "    }\n",
         "\n",
         "  @tf.function(input_signature=[tf.TensorSpec(shape=[], dtype=tf.string)])\n",
@@ -240,7 +237,9 @@
       "source": [
         "The `train` function in the code above uses the [GradientTape](https://www.tensorflow.org/api_docs/python/tf/GradientTape) class to record operations for automatic differentiation. For more information on how to use this class, see the [Introduction to gradients and automatic differentiation](https://www.tensorflow.org/guide/autodiff).\n",
         "\n",
-        "Note: The weights generated by this model are serialized as a TensorFlow version one checkpoint file format."
+        "You could use the `Model.train_step` method of the keras model here instead of a from-scratch implementation. Just note that the loss (and metrics) returned by `Model.train_step` is the running average, and should be reset regularly (typically each epoch). See [Customize Model.fit](https://www.tensorflow.org/guide/keras/customizing_what_happens_in_fit) for details.\n",
+        "\n",
+        "Note: The weights generated by this model are serialized into a TensorFlow 1 format checkpoint file."
       ]
     },
     {
@@ -285,8 +284,8 @@
       },
       "outputs": [],
       "source": [
-        "train_images = train_images / 255.0\n",
-        "test_images = test_images / 255.0"
+        "train_images = (train_images / 255.0).astype(np.float32)\n",
+        "test_images = (test_images / 255.0).astype(np.float32)"
       ]
     },
     {
@@ -334,35 +333,44 @@
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "Diwn1MmkNVeX",
-        "outputId": "b478ccac-2daf-45c8-d2f0-5b4c90ee60c4"
+        "id": "Diwn1MmkNVeX"
       },
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "Finished 10 epochs, current loss: 6.009137153625488\n",
-            "Finished 20 epochs, current loss: 5.951833724975586\n",
-            "Finished 30 epochs, current loss: 5.899169921875\n",
-            "Finished 40 epochs, current loss: 5.837289810180664\n",
-            "Finished 50 epochs, current loss: 5.809184551239014\n",
-            "Finished 60 epochs, current loss: 5.765789031982422\n",
-            "Finished 70 epochs, current loss: 5.694704055786133\n",
-            "Finished 80 epochs, current loss: 5.667092800140381\n",
-            "Finished 90 epochs, current loss: 5.664350509643555\n",
-            "Finished 100 epochs, current loss: 5.654796123504639\n"
+            "Finished 10 epochs\n",
+            "  loss: 0.428\n",
+            "Finished 20 epochs\n",
+            "  loss: 0.378\n",
+            "Finished 30 epochs\n",
+            "  loss: 0.344\n",
+            "Finished 40 epochs\n",
+            "  loss: 0.317\n",
+            "Finished 50 epochs\n",
+            "  loss: 0.299\n",
+            "Finished 60 epochs\n",
+            "  loss: 0.283\n",
+            "Finished 70 epochs\n",
+            "  loss: 0.266\n",
+            "Finished 80 epochs\n",
+            "  loss: 0.252\n",
+            "Finished 90 epochs\n",
+            "  loss: 0.240\n",
+            "Finished 100 epochs\n",
+            "  loss: 0.230\n"
           ]
         },
         {
           "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90\nbGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsT\nAAALEwEAmpwYAAAe8UlEQVR4nO3deXRV9b338ff3nJNzcjInQICQBAgqiAMgOCBVsXZyaG17bevt\nYMdrba3t7e1knz63t71dXb29tb21T2uttVVbvfr0qnWordanDlhBFAQHEByAQJgShgyEzPk+f5wD\nJpFAhISTvc/ntRYrOXvvnHx/K/Dhl+/+7b3N3RERkeCLZLoAEREZHgp0EZGQUKCLiISEAl1EJCQU\n6CIiIaFAFxEJiSEFupmVmNmdZrbGzF4ys/kHOGahma00s1Vm9vjwlyoiIgdjQ1mHbma3AE+4+41m\nFgfy3L2xz/4SYDHwLnffaGbl7l4/QjWLiMgBHDLQzawIeA6o8UEONrPPAxXu/r+Hv0QRERmK2BCO\nqQEagJvMbBawHPiSu7f2OeY4IMfMHgMKgWvd/XcD38jMLgcuB8jPz587Y8aMIyxfRCS7LF++fIe7\njzvQvqHM0OcBTwEL3H2pmV0LNLv7v/Y55ufAPOA8IAksAS5095cHe9958+b5smXL3vRgRESymZkt\nd/d5B9o3lJOidUCduy9Nv74TOOUAxzzo7q3uvgNYBMw63IJFROTNO2Sgu/s2YJOZTU9vOg9YPeCw\ne4GzzCxmZnnA6cBLw1qpiIgc1FB66ABXAbelV7isAz5pZlcAuPv17v6SmT0IPA/0Aje6+4sjUrGI\niBzQkJYtjgT10EVE3rwj7aGLiEgAKNBFREJCgS4iEhKBC/S121q45qG17GrtzHQpIiKjSuACfV3D\nHn7+6Ktsb27PdCkiIqNK4AI9GY8CsLezJ8OViIiMLoEL9Lx4aul8mwJdRKSfAAb6vhl6d4YrEREZ\nXQIb6G1dmqGLiPQVwEBPtVxaOxToIiJ9BS7Qk2q5iIgcUOACfX/LRSdFRUT6CVyg50QjxKMR9qqH\nLiLST+ACHVJtl70darmIiPQVyEDPi0d1YZGIyACBDPRkPKqWi4jIAIEM9Px4TCdFRUQGCGSgJ+NR\nWtVDFxHpJ5CBnheP6kpREZEBAhvoOikqItJfQANdPXQRkYECGuhRXfovIjJAIAM9GY/Sqhm6iEg/\ngQz0vJwYnd299PR6pksRERk1ghnouuOiiMgbBDPQE7rjoojIQMEM9PQMXX10EZHXBTLQkzmppxap\n5SIi8rpABroeciEi8kaBDPT8xL6Togp0EZF9Ahnor7dcFOgiIvsEMtC1bFFE5I0CHuiaoYuI7BPI\nQE/qpKiIyBsEMtDz4uqhi4gMNKRAN7MSM7vTzNaY2UtmNn+Q4041sx4zu2R4y+wvGjESsYh66CIi\nfcSGeNy1wIPufomZxYG8gQeYWRT4IfDQMNY3KD3kQkSkv0PO0M2sCDgb+A2Au3e6e+MBDr0KuAuo\nH84CB5MXjynQRUT6GErLpQZoAG4ysxVmdqOZ5fc9wMwmAe8Drj/YG5nZ5Wa2zMyWNTQ0HHbRsO+5\nomq5iIjsM5RAjwGnAL909zlAK3D1gGN+CnzD3Q86ZXb3G9x9nrvPGzdu3OHUu19ePEprh2boIiL7\nDKWHXgfUufvS9Os7eWOgzwPuMDOAscAFZtbt7vcMV6EDJeNRLVsUEenjkIHu7tvMbJOZTXf3tcB5\nwOoBx0zd97mZ3Qz8aSTDHFI99PqW9pH8FiIigTLUVS5XAbelV7isAz5pZlcAuPtB++YjJalVLiIi\n/Qwp0N19Jam2Sl8HDHJ3/8SRlTQ0+Wq5iIj0E8grRSHVcmnt0CoXEZF9AhvoyXiUti7N0EVE9gls\noOflROnqcbp6ejNdiojIqBDcQE/oBl0iIn0FN9D1kAsRkX5CEOiaoYuIQIADPZmjh1yIiPQV2EDX\nQy5ERPoLbqAn1EMXEekruIGuHrqISD/BDfQctVxERPoKbKAn4/tOiqrlIiICAQ70/IRaLiIifQU2\n0HNjqUBvVaCLiAABDvRIxEjmRNVyERFJC2ygQ2qli1ouIiIpgQ50PVdUROR1gQ70/HiMVrVcRESA\ngAe6nisqIvK6QAd6nlouIiL7BT7QNUMXEUkJeKDH9FxREZG0gAd6lNYOnRQVEYGAB7qWLYqIvC7Q\ngZ4Xj7K3qwd3z3QpIiIZF/BAj9HT63T29Ga6FBGRjAt4oKfvuNihtouISDgCXStdRESCHejJ9IOi\ndcdFEZGAB3pejh5yISKyT6ADvSA3NUNvadcMXUQk0IE+tiABQENLR4YrERHJvEAHenlRKtDrW9oz\nXImISOYFOtALEzFycyLUN2uGLiIS6EA3M8YX5VKvlouIyNAC3cxKzOxOM1tjZi+Z2fwB+z9iZs+n\n/yw2s1kjU+4blRcm1HIREQFiQzzuWuBBd7/EzOJA3oD964Fz3H23mZ0P3ACcPox1Dqq8MJeXtjYf\njW8lIjKqHXKGbmZFwNnAbwDcvdPdG/se4+6L3X13+uVTQOUw1zmocYUJtVxERBhay6UGaABuMrMV\nZnajmeUf5PhPA3850A4zu9zMlpnZsoaGhsMo943KixLs6ehmr64WFZEsN5RAjwGnAL909zlAK3D1\ngQ40s3NJBfo3DrTf3W9w93nuPm/cuHGHWXJ/5YW5AFrpIiJZbyiBXgfUufvS9Os7SQV8P2Z2MnAj\ncLG77xy+Eg9u/P616Ap0Ecluhwx0d98GbDKz6elN5wGr+x5jZtXA3cDH3P3lYa/yIPbP0LXSRUSy\n3FBXuVwF3JZe4bIO+KSZXQHg7tcD3wbGANeZGUC3u88bgXrfoLwwPUNXy0VEstyQAt3dVwIDA/r6\nPvs/A3xm+MoaupK8HOLRCNs1QxeRLBfoK0UhdbXouMIEDZqhi0iWC3ygg9aii4hASAJdl/+LiIQk\n0HWDLhGRkAR6eWGCxr1ddHTrUXQikr3CEehFWrooIhKOQN9/cZECXUSyVygCfVzhvmeL6sSoiGSv\nUAR6ue7nIiISjkAfk58gYuqhi0h2C0WgRyOWvrhILRcRyV6hCHRInRjdrhm6iGSxEAW6Lv8XkewW\nnkAvSmiVi4hktdAE+rjCXHa2dtLd05vpUkREMiI0gV5emMAdduzpzHQpIiIZEapABz2KTkSyV2gC\nfXxR6vL/LY0KdBHJTqEJ9OkTColHIzy7cXemSxERyYjQBHpuTpTZ1SUseW1npksREcmI0AQ6wPya\nMaza0kRTW1emSxEROepCFehn1Iyh1+GZ9bsyXYqIyFEXqkCfU11CPBbhqXVqu4hI9glVoOfmRDml\nuoSn1ivQRST7hCrQIdV2WbWlmaa96qOLSHYJXaDPrxmDOzy9QX10EckuoQv02dUlJNRHF5EsFLpA\nT8SizJ1cqvXoIpJ1QhfokOqjv7Stmca9ulGXiGSP0Aa6Ozz+ckOmSxEROWpCGehzqks4pryAH/x5\nja4aFZGsEcpAz4lG+PEHZtGwp4Pv3r8q0+WIiBwVoQx0gFlVJVy5cBp3P7uZv67alulyRERGXGgD\nHeALbz2WmROL+F9/fIH6Zt0nXUTCLdSBHo9F+PEHZ9Hc3s3ZP3qUb9/7IrU7WzNdlojIiDB3P/RB\nZiXAjcCJgAOfcvclffYbcC1wAbAX+IS7P3uw95w3b54vW7bs8Ct/E17Z3sINi9Zxz8rN9PQ67z+l\nkq+/azrlhbkHPL52ZyurtjSzeXcbmxvb6HWnuiyP6rI8jp9YRFVZ3lGpW0RkIDNb7u7zDrhviIF+\nC/CEu99oZnEgz90b++y/ALiKVKCfDlzr7qcf7D2PZqDvs725nV8vWsctSzaQiEX54nnH8NYZ5TS3\nd9Pc1sXy2t38ddV21m5v2f81+fEokYjR0t69f9v8mjFceloV7zxhArk50aM6BhHJbkcU6GZWBDwH\n1PggB5vZr4DH3P329Ou1wEJ33zrY+2Yi0PdZ17CH7/1pNY+u7b9OPWJw2tQy3jFzAqdNLaOqNI+i\nZAwzo3FvJ7U79/L3V3dwxzMb2bSrjbL8OJfNn8xl86dQlh/PyFhEJLscaaDPBm4AVgOzgOXAl9y9\ntc8xfwL+w93/nn79N+Ab7r5swHtdDlwOUF1dPbe2tvZwxzQsnlq3k+3N7RTl5lCYG6NmXMGQgrm3\n13nytR3c/OQG/ramntycCB+aV8Vnz5lGRUnyKFQuItnqSAN9HvAUsMDdl5rZtUCzu/9rn2MeAH4w\nINC/7u7LB3vfTM7Qh9Mr21v41aJ13LNiM2bwwXlVfG7hNCpL1WcXkeF3sEAfyiqXOqDO3ZemX98J\nnHKAY6r6vK4EtrzZQoPo2PGFXPOBWTz2tYV8cF4Vf1i2iYU/eowv3r6C5zY1Zro8Eckihwx0d98G\nbDKz6elN55Fqv/R1H3CZpZwBNB2sfx5GlaV5fP99J/H4187l42dO4ZE19Vz8iye55JeLeeD5rXT3\n9Ga6RBEJuaGucplNatliHFgHfBL4EIC7X59etvhz4F2kli1+cmD/fKCwtFwG09Lexf8sq+OmxevZ\ntKuNiuJcPjZ/CpeeWkWpTqCKyGE64mWLIyHsgb5PT6/zyJp6bnpyPYtf20k8FuHdJ1fwsfmTmV1V\nkunyRCRgDhbosaNdTLaJRoy3zxzP22eO5+XtLfx+SS13P1vHXc/WMb9mDP/8tmM5vWZMpssUkRDQ\nDD0DWtq7+MOyOq5//DUaWjqYXzOGzy2cxlnHjiXVvRIROTC1XEap9q4eblu6cX+wHze+gE8tmMrF\nsyeRjOsKVBF5IwX6KNfR3cN9K7fwm7+vZ822FgoTMS6aNZFL5lZySnWpZu0isp8CPSDcnaXrd/GH\nZZv4ywvbaOvqYfr4Qj65YArvnTNJ940REQV6EO3p6OaB57dw8+JaXtraTGleDu+YOYETK4s5aVIx\nx40vIC/++jnt9q4e1mxrYU97N9VleVSU5BKLhvruyCJZSYEeYPtm7bcs3sCSdTtp3Pv6M1JL83Ko\nLM2js7uXVxv20NP7+s8yFjFOrizmB+8/mekTCjNRuoiMAAV6SLg7mxvbeHFzE+t2tLJ5dxt1u9uI\nRowTKoo4oaKI4mScTbv2smFnK39Ytonm9m6+dcHxXDZ/snrxIiGgQM9SO/Z08LX/eY5H1zYwc2IR\nsaixq7WTRCzCNR+YxZzq0kyXKCJv0pHenEsCamxBgt9+4lS+d/EJJONRyvLjnDaljM6eXv7x10/x\nkB6eLRIqmqFnoR17Ovj0Lct4vq6Rf7toJp9YMDXTJYnIEGmGLv2MLUhwxz+dwduOH8937l/N9x9Y\nTW9vZv5jF5Hho0DPUsl4lOs/OpfL5k/m10+s56o7VtDe1ZPpskTkCOjmXFksGjG++54TmFSS5Ad/\nWUNDSwc///AcygtzM12aiBwGzdCznJnx2XOmce2ls1m5qZF3/Nci7l25mUydWxGRw6dAFwAunj2J\nP3/xLKaOzedLd6zk8t8vp76lPdNliciboECX/Y4pL+DOK87kWxccz6KXG3jHfy3i/uey4tGwIqGg\nQJd+ohHjn86u4YEvnsXkMflcdfsKrrztWXa1dma6NBE5BAW6HNAx5QXcdcV8vvbO6fx19Tbe+dNF\nPLa2PtNlichBKNBlULFohCvPPYZ7rlxAaV4On7jpGb5974u0tHcd+otF5KjTlaIyJO1dPfzng2v5\n7ZPrSeZEOf/ECVwyr5Izpo4hEtFNv0SOFt2cS4bNC3VN3P7MRu5fuYWWjm5mVRbz7XefwNzJutGX\nyNGgQJdh19bZw/3PbeHHD69le3MH751dwdXnH8+EYl2UJDKSdC8XGXbJeJQPnlrFI19ZyBfOPYY/\nv7iNt/3kcW5+cn2/B22IyNGjQJcjkp+I8dV3TufhL5/NnOoSvnP/at7/y8Ws2Lg706WJZB0FugyL\nyWPy+d2nTuPaS2dTt2sv77tuMe+77knuXbmZzu7eTJcnkhXUQ5dh19LexV3L67hlSS3rd7RSXpjg\nY2dM5sOnVzOmIJHp8kQCTSdFJSN6e53HX27gpsUbWPRyA/FYhHeeMIGzjh3LgmPGMqkkmekSRQLn\nYIGu2+fKiIlEjHNnlHPujHJerW/hpic38NCqbfvvD1NdlsesqhJmVRZzQkUxlaVJJhTnkhNVJ1Dk\ncGiGLkeVu7N2ewtPvrqTZRt28dymRrY0vX5Xx4jBpNIkp04u4/SaMuZOLqUsP0FhbkxBL4JaLjLK\n1be0s3ZbC1sa29i8u41X6vfw9Ppd7BxwQ7DCRIzZ1SWcUTOGU6eUMbE4l9L8OPnxKGa6WlWyg1ou\nMqqVF+a+4SlJ7s6r9Xt4YXMTzW1dNLd3U9/SzrINu/nRQ2v7HZuIRThpUjFn1IxJhf3UUhKx6NEc\ngsiooBm6BM6u1k6e29RIw54Odrd2sr25g+Ubd/Pi5iZ6ep2CRIxzZ5Rz/okTmF1VwviiXKK634yE\nhGboEipl+XHOnVH+hu0t7V08vX4XD6/ezl9Xb99/8jUnalSW5nHSpGIWTh/H2ceNY6yWT0oIaYYu\nodTd08uKTY28sn0Pm3bvZcOO1v19eTM4ZlwBMyYWMWNCISdUFDGrsoTS/HimyxY5pCOeoZvZBqAF\n6AG6B76ZmRUDtwLV6fe8xt1vOpKiRY5ELBrh1CllnDqlbP+23l7nxS1NPL62gefqmlixcXe/R+xV\nlSVZMG0sHz1jMidOKs5E2SJH5M20XM519x2D7LsSWO3u7zazccBaM7vN3fXcMhk1IhHj5MoSTq4s\n2b+tqa2LVVuaeL6uiZUbG7l35RbueGYTc6pLeMfMCeREU733CcW5nH/iRPXiZVQbrh66A4WWWjtW\nAOwCuofpvUVGTHEyhzOnjeXMaWOBVMDftbyOW5+q5YcPrul37PTxr/LNC2ZwznHjtExSRqUh9dDN\nbD2wm1Rw/8rdbxiwvxC4D5gBFAIfcvcHDvA+lwOXA1RXV8+tra094gGIjAR3p7Wzh33/Pp54ZQc/\nfHANtTv3cuqUUs6cNpbZ1SXMqSqhJE+9dzl6jvjCIjOrcPctZlYOPAxc5e6L+uy/BFgA/AswLX3M\nLHdvHuw9dVJUgqazu5dbn6rl/z6ziZfrW3AHM5hTVcJ5x4/n3OnlHD+xULN3GVHDeqWomX0H2OPu\n1/TZ9gDwH+7+RPr1I8DV7v70YO+jQJcg29PRzfN1jSxdt4tH19bzfF0TAOMKE5x17FjOOnYsZ9SM\nYWKxbkAmw+uIVrmYWT4QcfeW9OfvAP59wGEbgfOAJ8xsPDAdWHdkZYuMXgWJ2P7e+5fffhz1ze08\n9nIDT7yyg0fX1HP3s5uB1A3ITptaxkUnT+SsY8fppKqMqEPO0M2sBvhj+mUM+G93/76ZXQHg7teb\nWQVwMzARMFKz9VsP9r6aoUtY9fY6q7c2s3T9Lp5Zv4sl63bS1NbFpJIkH5hXyXtnT2LK2PxMlykB\npZtziWRQR3cP/291PXc8s5EnXkmt/J05sYgLT57Ie2ZVUFWWl+EKJUgU6CKjxJbGNv78wlYeeGEr\nKzY2AnBGTRmXzK3iwpMmkozrpmJycAp0kVGobvde7lmxmTuX17Fh517K8uN8asEUPjZ/CsXJnEyX\nJ6OUAl1kFHN3lq7fxa8ef41H1zZQkIjxqbdM5bNn15Cf0P3zpD/dbVFkFDOz/fdyX7WliV88+io/\n+9sr3P70Rr7y9uP4wLwqrY6RIdEzvURGkRMqirnuI3P54+fPpLosj6vvfoELrn2Cx9bWk6nfpiU4\nFOgio9Cc6lLuvGI+133kFNq7e/jETc9w2W+f5qWtg158LaJAFxmtzIwLTprIw18+h29fNJMXNjdx\n4c+e4Dv3raK5vSvT5ckopEAXGeXisQifestUHv/quXzk9MncsmQDb73mce5+tk5tGOlHgS4SEMV5\nOXzvvSdy35VvobI0yb/84Tk++KslasPIfgp0kYA5qbKYuz93Jj/8h5N4tX4PF/2fv/Pv96+mRW2Y\nrKdAFwmgSMT40KnVPPrVhVx6ahU3LV7P237yOA88v1VtmCymQBcJsJK8ON9/30n88fMLGFuQ4Mr/\nfpbLfvs0z21qzHRpkgEKdJEQmF1Vwr1XLuDf3p1aDXPxL57k0zc/w0oFe1bRpf8iIdPS3sXvltRy\nw6J1NLV1MX18IRfPqeDi2ZOYVKIHbgSd7uUikoVa2ru4Z8Vm7lm5heW1u4kYvGdWBV9467EcU16Q\n6fLkMCnQRbLcxp17uXVpLb9fUkt7dw8XnjSRf5hbyZnTxpCI6Za9QaJAFxEAdu7p4NdPrOfWp2rZ\n09FNQSLGOdPHMX18IZNKklSV5XHipCLy4rpv32ilQBeRftq7elj82g4eenE7j7/cwLbm9v37cqLG\n7KoS5k8by5nTxjCnukSz+FFEgS4iB9Xe1cOWxjZqd+5l6fpdLHltBy9sbqLXIRGLcOqUMuZOLmV2\nVQknVxYzpiCR6ZKzlu6HLiIHlZsTpWZcATXjCjh3RjkATW1dPL1+F4tf28GS13bys0deYd/874SK\nIt4zq4J3z6qgQitnRg3N0EVkSPZ0dPPi5iae3bibh1Zt33/x0qzKYuZPG8uCY8Ywd3Kp+u8jTC0X\nERl2G3a0cv9zW1j0SgMrNjbS3etEI8aMCYXMqS7h+IlFVBQnmViSy6SSJIW5ek7qcFCgi8iIau3o\n5pkNu1heu5sVGxtZuamRPR3d/Y4ZW5CgZmw+FSW5JONRErEo+YkoxckcSpJxipI5FCVjFOXmUJgb\nIycaIRYxzIw9Hd00t3XR2tlNRXGSytIksWh2XuiuHrqIjKj8RIyF08tZOD3Vf+/pdbY3t7O1qZ2t\nTW1s3LWXDTta2bBjL8tqd9PR3Ut7Vw97O3vo6X3zk8qcqFFdloeZ0dbZw97ObhyImBExIzcnQn48\nRl4iigEd3b10dvfu/9jZ00t3Ty8O4BCLGgW5MQoTORQncxhTEGdMfpwJxUlmTChkZkUR5YUJzEb3\ns10V6CIy7KIRo6IkmT5hWjroce5Oa2cPu1s7aWrrorm9i5b2bva0d9Pd20t3r9Pb6xTkpmbuyZwo\ndY1trGtoZcOOViIRSObESMYjRM3ocaenFzq6emjt7Ka1oweA0rwI8ViERCz1MR6LEIukZvhm0N3j\ntKS/d1NbF6u3NLNjTwfN7a//llGYiFGUzKEgEaMwN0Z5UYLxRbmMTa/46U3/x1RelGBSSR6TSpOU\n5qWOP1q/TSjQRSRjzIyCRIyCRIyqTBdzAM3tXazZ2sLqLU2s39FKS0c3rR2p0F+zrYVFL+94Q2vp\nQPLiUZI5URKxCLk5UT58ejWfOatm2OtVoIuIDKIoN4fTppZx2tSyQY9p70r9FhCNGO6wvbmdut1t\nbG5so7kt/RtHRxdtXT20d6VaTWNHaB2/Al1E5Ajk5vS/iraqLI+qsryM1JKdp4lFREJIgS4iEhIK\ndBGRkFCgi4iEhAJdRCQkFOgiIiGhQBcRCQkFuohISGTsbotm1gDUvokvGQvsGKFyRrNsHHc2jhmy\nc9zZOGY4snFPdvdxB9qRsUB/s8xs2WC3jAyzbBx3No4ZsnPc2ThmGLlxq+UiIhISCnQRkZAIUqDf\nkOkCMiQbx52NY4bsHHc2jhlGaNyB6aGLiMjBBWmGLiIiB6FAFxEJiUAEupm9y8zWmtmrZnZ1pusZ\nCWZWZWaPmtlLZrbKzL6U3l5mZg+b2Svpj4M/oDGgzCxqZivM7E/p19kw5hIzu9PM1qR/5vOzZNxf\nTv/9ftHMbjez3LCN28x+a2b1ZvZin22DjtHMvpnOtrVm9s4j+d6jPtDNLAr8AjgfmAn8o5nNzGxV\nI6Ib+Iq7Hw+cAVyZHufVwN/c/Vjgb+nXYfMl4KU+r7NhzNcCD7r7DGAWqfGHetxmNgn4IjDP3U8E\nosClhG/cNwPvGrDtgGNM/xu/FDgh/TXXpTPvsIz6QAdOA15193Xu3gncAVyc4ZqGnbtvdfdn05+3\nkPoHPonUWG9JH3YL8N6MFDhCzKwSuBC4sc/msI+5CDgb+A2Au3e6eyMhH3daDEiaWQzIA7YQsnG7\n+yJg14DNg43xYuAOd+9w9/XAq6Qy77AEIdAnAZv6vK5LbwstM5sCzAGWAuPdfSukQh8oz2BpI+Gn\nwNeB3j7bwj7mGqABuCndarrRzPIJ+bjdfTNwDbAR2Ao0uftfCfm40wYb47DmWxAC3Q6wLbRrLc2s\nALgL+Gd3b850PSPJzC4C6t19eaZrOcpiwCnAL919DtBK8NsMh5TuG18MTAUqgHwz+2hmq8q4Yc23\nIAR6HVDV53UlqV/TQsfMckiF+W3ufnd683Yzm5jePxGoz1R9I2AB8B4z20CqlfZWM7uVcI8ZUn+n\n69x9afr1naQCPuzjfhuw3t0b3L0LuBs4k/CPGwYf47DmWxAC/RngWDObamZxUicQ7stwTcPOzIxU\nT/Uld/9Jn133AR9Pf/5x4N6jXdtIcfdvunulu08h9XN9xN0/SojHDODu24BNZjY9vek8YDUhHzep\nVssZZpaX/vt+HqlzRWEfNww+xvuAS80sYWZTgWOBpw/7u7j7qP8DXAC8DLwGfCvT9YzQGN9C6let\n54GV6T8XAGNInRV/Jf2xLNO1jtD4FwJ/Sn8e+jEDs4Fl6Z/3PUBploz7u8Aa4EXg90AibOMGbid1\njqCL1Az80wcbI/CtdLatBc4/ku+tS/9FREIiCC0XEREZAgW6iEhIKNBFREJCgS4iEhIKdBGRkFCg\ni4iEhAJdRCQk/j9bM8w6Hia7ywAAAABJRU5ErkJggg==\n",
             "text/plain": [
-              "\u003cFigure size 600x400 with 1 Axes\u003e"
+              "{'checkpoint_path': \u003ctf.Tensor: shape=(), dtype=string, numpy=b'/tmp/model.ckpt'\u003e}"
             ]
           },
+          "execution_count": 28,
           "metadata": {},
-          "output_type": "display_data"
+          "output_type": "execute_result"
         }
       ],
       "source": [
@@ -372,23 +380,46 @@
         "losses = np.zeros([NUM_EPOCHS])\n",
         "m = Model()\n",
         "\n",
+        "train_ds = tf.data.Dataset.from_tensor_slices((train_images, train_labels))\n",
+        "train_ds = train_ds.batch(BATCH_SIZE)\n",
+        "\n",
         "for i in range(NUM_EPOCHS):\n",
-        "  for batch_idx in range(len(train_images) // BATCH_SIZE):\n",
-        "    batched_images = train_images[BATCH_SIZE*(batch_idx) : BATCH_SIZE * (batch_idx + 1)]\n",
-        "    batched_labels = train_labels[BATCH_SIZE*(batch_idx) : BATCH_SIZE * (batch_idx + 1)]\n",
-        "    result = m.train(\n",
-        "        x=tf.constant(batched_images, shape=(BATCH_SIZE, IMG_SIZE, IMG_SIZE),\n",
-        "                      dtype=tf.float32),\n",
-        "        y=tf.constant(batched_labels, shape=(BATCH_SIZE, 10), dtype=tf.float32))\n",
+        "  for x,y in train_ds:\n",
+        "    result = m.train(x, y)\n",
+        "\n",
         "  losses[i] = result['loss']\n",
         "  if (i + 1) % 10 == 0:\n",
-        "    print('Finished {0} epochs, current loss: {1}'.format(i + 1, losses[i]))\n",
+        "    print(f\"Finished {i+1} epochs\")\n",
+        "    print(f\"  loss: {losses[i]:.3f}\")\n",
         "\n",
         "# Save the trained weights to a checkpoint.\n",
-        "m.save('/tmp/model.ckpt')\n",
-        "\n",
-        "plt.plot(epochs, losses)\n",
-        "plt.show()"
+        "m.save('/tmp/model.ckpt')"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Lp2nkZj7rJXm"
+      },
+      "outputs": [
+        {
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEICAYAAABS0fM3AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90\nbGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsT\nAAALEwEAmpwYAAAqaklEQVR4nO3deXyddZn38c+VPTnZl6ZtkjYB2kIobSmlIItUZCmMlkV42ByX\nQRFHRMcZHMCZZ3AcH2fcRnFUZADBmVFGUZiKKDgooCLShbZ0ofuWpkvaJs3ebNfzxzkJaZrlNMnJ\naXJ/36/X/cq5l3Of6xfKuXL/VnN3REQkuBLiHYCIiMSXEoGISMApEYiIBJwSgYhIwCkRiIgEnBKB\niEjAxTQRmNliM9toZlvM7N5+zt9jZqsi21oz6zSz/FjGJCIix7JYjSMws0RgE3A5UAUsA25x9/UD\nXP9e4K/c/dLB7ltYWOjl5eWjHK2IyMS2YsWKg+5e1N+5pBh+7kJgi7tvAzCzJ4FrgH4TAXAL8KOh\nblpeXs7y5ctHLUgRkSAws50DnYtl1VAJsLvXflXk2HHMLANYDPw0hvGIiEg/YpkIrJ9jA9VDvRf4\ng7sf7vdGZneY2XIzW15TUzNqAYqISGwTQRVQ1mu/FKge4NqbGaRayN0fdvcF7r6gqKjfKi4RERmm\nWLYRLANmmFkFsIfwl/2tfS8ysxzgEuD9MYxFROKgvb2dqqoqWltb4x1KYKSlpVFaWkpycnLU74lZ\nInD3DjO7C3geSAQec/d1ZnZn5PxDkUuvA15w96ZYxQKw81ATL22s4br5JWSnRf8LEpHhq6qqIisr\ni/Lycsz6qy2W0eTuHDp0iKqqKioqKqJ+XyyfCHD354Dn+hx7qM/+48DjsYwDYH11Pf+wdB3nludT\nOVWJQGQstLa2KgmMITOjoKCAE21LDczI4oLMVAAON7XFORKRYFESGFvD+X0HJhHkh1IAONR0NM6R\niIicXAKTCAozI4mgUU8EIkGRmJjIvHnzmD17NjfeeCPNzc3DvtczzzzD+vUDjYcd2NKlS/nnf/7n\nQa+prq7mhhtuGG5oIxaYRJCdlkxigqlqSCRA0tPTWbVqFWvXriUlJYWHHjqmiZLOzs6o7zVYIujo\n6BjwfUuWLOHee4+bau0YU6dO5amnnoo6ltEWmESQkGDkh1JUNSQSUBdffDFbtmzhpZde4l3vehe3\n3norZ511Fp2dndxzzz2ce+65zJkzh+9973vHvffVV19l6dKl3HPPPcybN4+tW7eyaNEi7r//fi65\n5BK++c1v8vOf/5zzzjuPs88+m8suu4z9+/cD8Pjjj3PXXXcB8KEPfYi7776bCy64gFNOOaXny3/H\njh3Mnj275/rrr7+exYsXM2PGDD772c/2xPHoo48yc+ZMFi1axEc/+tGe+45UTHsNnWwKQimqGhKJ\nk8//fB3rq+tH9Z6VU7P5h/eeOeR1HR0d/PKXv2Tx4sUAvP7666xdu5aKigoefvhhcnJyWLZsGUeP\nHuXCCy/kiiuuOKb75QUXXMCSJUt4z3vec0wVTl1dHS+//DIAtbW1vPbaa5gZjzzyCF/+8pf52te+\ndlwse/fu5fe//z1vvfUWS5Ys6bdKaNWqVbzxxhukpqYya9YsPvnJT5KYmMgXvvAFVq5cSVZWFpde\neilz58494d9Zf4KVCDJTOKSqIZHAaGlpYd68eUD4ieD222/n1VdfZeHChT1f9C+88AJr1qzp+ev8\nyJEjbN68Oap++DfddFPP66qqKm666Sb27t1LW1vbgO+/9tprSUhIoLKysuepoa93v/vd5OTkAFBZ\nWcnOnTs5ePAgl1xyCfn54Zn6b7zxRjZt2hTdL2IIgUoE+aFU1u45Eu8wRAIpmr/cR1t3G0FfoVCo\n57W7861vfYsrr7zymGs+97nP8Ytf/AKg33v0vc8nP/lJPvOZz7BkyRJeeuklHnjggX7fk5qaesxn\nD3VNYmIiHR0dA147GgLTRgDhqqGDjWojEJG3XXnllXz3u9+lvb0dgE2bNtHU1MQXv/hFVq1a1ZME\nsrKyaGhoGPA+R44coaQkPMHyE088MepxLly4kJdffpna2lo6Ojr46U9Hb7LmwCWChtYO2jq64h2K\niJwkPvKRj1BZWcn8+fOZPXs2H/vYx/rtBXTzzTfzla98hbPPPputW7ced/6BBx7gxhtv5OKLL6aw\nsHDU4ywpKeH+++/nvPPO47LLLqOysrKn+mikYrZCWawsWLDAh7swzQ//tIv7n36T1+57N5Nz0kY5\nMhHpa8OGDZxxxhnxDmPCaGxsJDMzk46ODq677jr+4i/+guuuu+646/r7vZvZCndf0N99A/VEoNHF\nIjKePfDAAz0D5CoqKrj22mtH5b6BaizW6GIRGc+++tWvxuS+gXwi0OhikbEz3qqfx7vh/L4DlQi6\nZyBVzyGRsZGWlsahQ4eUDMZI93oEaWkn1gYaqKqh7LQkkhM135DIWCktLaWqquqE58eX4eteoexE\nBCoRmBl5GZpmQmSsJCcnn9BKWRIfgaoagnD1kKaZEBF5W/ASgWYgFRE5RvASQWaK2ghERHoJXCLI\n11TUIiLHCFwiKMxMpfFoB63t0a9MJCIykcU0EZjZYjPbaGZbzKzftdrMbJGZrTKzdWb2cizjAQ0q\nExHpK2bdR80sEfg2cDlQBSwzs6Xuvr7XNbnAd4DF7r7LzCbFKp5uBb0SwdTc9Fh/nIjISS+WTwQL\ngS3uvs3d24AngWv6XHMr8DN33wXg7gdiGA8QbiwGjS4WEekWy0RQAuzutV8VOdbbTCDPzF4ysxVm\n9oH+bmRmd5jZcjNbPtIRigWh8DQTqhoSEQmLZSKwfo71nXAkCTgH+DPgSuDvzWzmcW9yf9jdF7j7\ngqKiohEFlZ+pNgIRkd5iOcVEFVDWa78UqO7nmoPu3gQ0mdkrwFxgdFZk7kdWahIpiQkcVBdSEREg\ntk8Ey4AZZlZhZinAzcDSPtf8D3CxmSWZWQZwHrAhhjFhZuSHUjis0cUiIkAMnwjcvcPM7gKeBxKB\nx9x9nZndGTn/kLtvMLNfAWuALuARd18bq5i6FWRqUJmISLeYzj7q7s8Bz/U59lCf/a8AX4llHH3l\nh1I08ZyISETgRhZDeHSxJp4TEQkLZCLID6VwWFVDIiJAQBNBQWYKTW2dmm9IRISgJoLINBNqJxAR\nCWwiiCxi36B2AhGRQCaCaQUZAGytaYxzJCIi8RfIRHBqUSahlERW766LdygiInEXyESQmGDMLslh\nVdWReIciIhJ3gUwEAPPKctlQXU9bR1e8QxERiavAJoK5Zbm0dXbx1r76eIciIhJXgU0Ec0pzANRO\nICKBF9hEUJKbTmFmCqvVTiAiARfYRGBmzCnN1ROBiAReYBMBwNzSXLbUNNJ4tCPeoYiIxE2wE0FZ\nDu7wpqqHRCTAAp0I5pTmArC6qi6ucYiIxFOgE0F+KIVp+RlqJxCRQAt0IoDweII1qhoSkQBTIijN\nYU9dCwcaWuMdiohIXCgRlOUCsHxHbXwDERGJk8AngrPLcinMTOXpN/bEOxQRkbhIGuiEmQ01CY8B\ne9195uiGNLaSEhO4fn4Jj/1+Owcbj1KYmRrvkERExtRgTwRb3T17kC0LaBrs5ma22Mw2mtkWM7u3\nn/OLzOyIma2KbP93pAUajhvPKaWjy3lGTwUiEkCDJYL3RfH+Aa8xs0Tg28BVQCVwi5lV9nPp79x9\nXmT7xyg+c9TNKM5iblkuT62owt3jEYKISNwMmAjcfRuAmd1lZnmDXTOAhcAWd9/m7m3Ak8A1Iwk2\nlm44p5S39jWwdo+mpRaRYImmsXgysMzMfhyp6rEo710C7O61XxU51tc7zGy1mf3SzM7s70ZmdoeZ\nLTez5TU1NVF+/IlZMmcqKUkJPLVi99AXi4hMIEMmAnf/O2AG8CjwIWCzmf0/Mzt1iLf2lzD61rus\nBKa7+1zgW8AzA8TwsLsvcPcFRUVFQ4U8LDkZyVx55mT+Z3U1Rzs6Y/IZIiIno6i6j3q44nxfZOsA\n8oCnzOzLg7ytCijrtV8KVPe5b727N0ZePwckm1lh9OGPrhvOKaWuuZ3n3twbrxBERMbckInAzO42\nsxXAl4E/AGe5+8eBcxi8QXkZMMPMKswsBbgZWNrn3pO7q5rMbGEknkPDKskouOi0Qk6fnMXXXthE\na7ueCkQkGKJ5IigErnf3K939J+7eDuDuXcB7BnqTu3cAdwHPAxuAH7v7OjO708zujFx2A7DWzFYD\nDwI3exy77SQmGH//nkqqalt47A/b4xWGiMiYsmi+d81sPnAR4Tr+P7j7ylgHNpAFCxb48uXLY/oZ\nH3liOa9tO8Rv/uYSJmWlxfSzRETGgpmtcPcF/Z2Lpmro74EngALCTwffN7O/G90QTy73X306re2d\nfP2FTfEORUQk5qKpGroVONfd/8Hd/wE4H7gttmHF1ylFmXzgHeX89/LdrKvWFNUiMrFFkwh2AL3r\nR1KBrTGJ5iTyqXfPoCCUwt0/ekNrGovIhBZNIjgKrDOzx83s+8BaoNHMHjSzB2MbXvzkZCTzrVvm\ns+NQM/f8ZLWmnhCRCWvA2Ud7eTqydXspNqGcfN5xagH3Lj6dLz63gYdf2cbHLhlqDJ2IyPgzZCJw\n9yci4wC6p5ve2N2FNAg+cnEFq3bX8S+/eovKqdlcPCM2I5tFROIlml5Di4DNhGcS/Q6wyczeGduw\nTh5mxr/cMIcZk7K44wcrWLbjcLxDEhEZVdG0EXwNuMLdL3H3dwJXAv8a27BOLpmpSfzHRxYyJSeN\nD39/Gat218U7JBGRURNNIkh2943dO+6+CUiOXUgnp0lZafzXR88jL5TMBx79E2v3qFupiEwM0SSC\nFWb2aGQ1sUVm9u/AilgHdjKakpPODz9yPllpydz0vT/y8qbYTIktIjKWokkEdwLrgLuBTwHrI8cC\nqSw/g59+/AKmFYT4i8eX8eTru+IdkojIiAzaa8jMEoAV7j4b+PrYhHTym5yTxo8/dj6f+OEb3Puz\nN9l2sIl7rpxFcmJUs3qLiJxUBv3miswwutrMpo1RPONGVloyj35wAe8/fxoPv7KNm773R/bUtcQ7\nLBGRExbNn7BTCI8sftHMlnZvsQ5sPEhOTOCfrj2LB285m037G7n6m7/jV2v3xTssEZETEs3I4s/H\nPIpxbsncqcwpyeGTP3qDO/9zBdfMm8oD7z2TvFBKvEMTERlSNE8EV7v7y7034OpYBzbelBeG+Nlf\nXsBfXTaT597cy+X/+rKeDkRkXIgmEVzez7GrRjuQiSA5MYFPXTaDpXddRHF2Gnf+5wr++serqW8N\nzIwcIjIODZgIzOzjZvYmMMvM1vTatgNvjl2I488ZU7J55hMXcvelp/H0G1Vc9Y3f8dq2uC3FLCIy\nqAGXqjSzHCAP+BJwb69TDe4etwl3xmKpytG0clctn/nvVew41MyN55Ry39VnkK+2AxEZY8NaqtLd\nj7j7Dne/BagC2gmvWZyp7qTRmz8tj+c+dTF3XnIqT7+xh0u/9hJPvr5L6xuIyEkjmtlH7wL2A78G\nfhHZno1xXBNKRkoS9151Or/81MXMKs7i3p+9yW2P/Ildh5rjHZqISFSNxZ8GZrn7me5+VmSbE83N\nzWyxmW00sy1mdu8g151rZp1mdkOUcY9LM4qzePKO8/nS9WexpuoIV37jFR77/XY6OrviHZqIBFg0\niWA3cMJTbZpZIuE1DK4CKoFbzKxygOv+BXj+RD9jPDIzblk4jV9/5p2cf0o+//jset77b39gudY5\nEJE4iSYRbANeMrP7zOwz3VsU71sIbHH3be7eBjwJXNPPdZ8EfgociDrqCWBKTjqPfehcvnPbfOqa\n27jhoT/y1z9ezd4jmqZCRMZWNCOLd0W2lMgWrRLCTxPdqoDzel9gZiXAdcClwLkncO8Jwcy4+qwp\nLJpVxLd+s4VHf7edn6+p5gPnT+cv33WaeheJyJiIZs3i46aYMLNoEoj1d7s++98A/tbdO836u7zn\n8+4A7gCYNm3idVjKSEnibxefzm3nTeMb/7uZx/6wnR+9voubzp3Ghy8spyw/I94hisgENtg4gt+7\n+0WR1//h7n/e69xKd58/6I3N3gE84O5XRvbvA3D3L/W6ZjtvJ4xCoBm4w92fGei+420cwXBs3t/A\nv/12C79Ys5cud66onMyHLyxnYUU+gyVMEZGBDDaOYLC/7EO9Xs/ue88oPncZMMPMKoA9wM3Arb0v\ncPeKXkE+Djw7WBIIihnFWXzz5rO576oz+MEfd/DD13fxq3X7qJySzYcuLGfJ3KmkJSfGO0wRmSAG\nayz2AV73t3/8m907gLsI9wbaAPzY3deZ2Z1mFtgVzk7E5Jw0Prv4dP5477v50vVn0dHVxWefWsP5\nX3qRf3p2PdtqGuMdoohMAINVDW0D/ppwsvgK8Dfdp4Avu/upYxJhH0GoGhqIu/PHbYf4r9d28fy6\nfXR0OXNKc7iispjLKyczszhTVUci0q/BqoYGSwTfH+ym7v7hUYjthAU5EfR2oKGVn67Yw/Pr9rFq\ndx0ApxaFuHZeCdfMK2FagRqYReRtw0oEJyslguMdqG/lhfX7Wbq6mte3hwemVU7JZtGsIhbNmsT8\nabkkaT1lkUBTIgiQPXUt/Hx1Nb956wArdtbS2eUUZ6dy04Iyblo4jZLc9HiHKCJxoEQQUPWt7fxu\n00GeWrGblzbVYMDFM4q4fn4Jl1cWk5ESzXAQEZkIlAiEqtpm/nvZbn62cg976lrISElk8ZmTuWFB\nKedXFJCQoEZmkYlsRInAzG4EfuXuDWb2d8B84J/cfeXohzo0JYKR6epylu+s5ek3qnh29V4ajnZQ\nlp/O++aXct3ZJUwvCA19ExEZd0aaCNa4+xwzu4jwamVfBe539/MGfWOMKBGMnpa2Tp5ft4+frNjN\nq1sP4Q7zp+Vy3fxSlsyZSk5GcrxDFJFRMtJE8Ia7n21mXwLedPcfdh+LRbBDUSKIjeq6Fpaurubp\nlXvYuL+BlMQELj+zmBvml3LRjEKS1etIZFwbaSJ4lvAUEZcB5wAtwOvuPne0A42GEkHsras+wk+W\nV/E/q/ZQ29xOfiiFq2ZP5r1zp7Jgep66ooqMQyNNBBnAYsJPA5vNbApwlru/MPqhDk2JYOy0dXTx\n0sYDLF1dzf9u2E9rexdZaUm8c0YRl8wq4pKZRRRnp8U7TBGJwnAnnes2BfiFux81s0XAHOAHoxee\nnKxSkhK44szJXHHmZJqOdvDyphpe2niAlzbW8Is39wIwqziLS2YVceWZkzm7LFe9j0TGoWieCFYB\nC4BywhPILSW8hvHVsQ6uP3oiiD93Z8PeBl7ZXMMrm2pYtuMw7Z3OlJw0rpo9hSvOLFYVkshJZqRV\nQyvdfb6ZfRZocfdvqbFYeqtvbed/1+/nuTf38sqmg7R1dpGTnsylp0/i8spiLplZRChVg9dE4mmk\nVUPtZnYL8AHgvZFj6lcoPbLTkrl+finXzy+l8WgHv9tUw6837Oe3bx3g6Tf2kJKUwMWnFXJZZTGX\nnj5J7QoiJ5loEsGHgTuBL7r79shCM/8Z27BkvMpMTeKqs6Zw1VlT6OjsYvnOWl5Yt58X1u/jxbcO\nADC7JJvLz5jMVWdNZsYkTZ0tEm9RTTFhZinAzMjuRndvj2lUg1DV0Pjk7mza38iLb+3nxQ0HWLmr\nFnc4pSjUU320YHo+KUlqVxCJhZG2ESwCngB2EF6Upgz4oLu/MqpRRkmJYGI4UN/K8+v28cu1+3oa\nm0MpiSysyGdBeT7nTM9jbmku6SlaklNkNIw0EawAbnX3jZH9mcCP3P2cUY80CkoEE0/j0Q5e3XKQ\nlzfV8Pr2w2w+EF6CMynBqJyazfxpeSwoz2NheT6T1L4gMiyjMtfQUMfGihLBxFfX3MbKXbWs2Bne\nVu8+Qkt7JwAVhSEWTM9j3rRc5pXlMqs4S91URaIw0kTwfaAL+I/IoduAJC1VKWOlvbOL9dX1vL79\nMH/afpiVu2o53NQGQEZKImdPy+Xc8nzOLc9nblkumeqqKnKckSaCVOATwEWE2wheAb7j7kdHO9Bo\nKBGIu7PrcDOrdtexYmcty3bU8ta+etwhwWBmcRbzynKZXZLDmVOzOWNKNmnJamuQYBt2IjCzBGCN\nu8+OVXAnSolA+lPf2s7KnbWs2l3HG7vqWLW7jiMt4c5tCQZl+RmcVpTJaZMyKcvPoDQvnbL8DMoL\nQiRqWgwJgGEPKHP3LjNbbWbT3H3XMD54MfBNIBF4xN3/uc/5a4AvEK566gA+7e6/P9HPEclOS2bR\nrEksmjUJCD817KlrYV11Peuq69l6oJEtBxr53ebwyOduGSmJzJ6aw5zSHGZNzuKUokxOK8rUWgwS\nKNFUDf0GOBd4HWjqPu7uS4Z4XyKwCbgcqAKWAbe4+/pe12QCTe7uZjYH+LG7nz7YffVEICPR1eXs\nb2ilqraFHQebWFddz6rddazfW09bx9sJojg7lVmTszljchYzi8PbqZNCWudZxq2RTjHx+WF+7kJg\ni7tviwTxJHAN0JMI3L2x1/UhYHwtoCzjTkKCMSUnnSk56Zxbns+NkeMdnV3srm1h64FGttY0snF/\nA2/tbeD7Ww/1PEGYQWleOrOKs5hRnMXpk7M4c2o2FYWZql6ScW3ARGBmpwHF7v5yn+PvJLxQzVBK\ngN299quA45a3NLPrCC+BOQn4syjuKzLqkhITqCgMUVEY4jKKe453dHax83Azm/c3sGl/I5v2N7B5\nfyMvbayhoyv8d0tacgKzit9+cjhjSjZzy3LISlP1kowPgz0RfAO4v5/jzZFz7+3nXG/9/Yl03F/8\n7v408HQkwXyB8Epox97I7A7gDoBp06YN8bEioycpMYFTizI5tSiTxb26TLR1dLG1ppH1kTaIt/bV\n89uNNfxkRRUQfnqYVZzFnNIcTinKpKIwxKlFIcoLQhr3ICedAdsIzGztQL2FzOxNdz9r0BubvQN4\nwN2vjOzfB+DuXxrkPduBc9394EDXqI1ATmaHm9pYu+cIK3fVsnJXHev2HOFQZMwDQGpSArMmZzGr\nOIvpBRmU5WcwLT+DisIQuRkpcYxcJrrhthEMNpY/PYrPXQbMiMxWuge4Gbi1T2CnAVsjjcXzgRTg\nUBT3Fjkp5YdSeOfMIt45s6jnWH1rOzsONrF5fyMb9tazIfL0cLDx2KE4eRnJVBSGmBZJDmX5GZxS\nFOKUwkzyQkoSEjuDJYJlZvZRd//33gfN7HZgxVA3dvcOM7uL8KpmicBj7r7OzO6MnH8IeB/wATNr\nB1qAmzya6VBFxpHstGTmlOYypzT3mOMtbZ1U1Taz81AzOw41se1gE9trmli2o5alq6vp6vV/Qm5G\nMjMmZXLapCxmFmdy+uRsKqdmk5OudggZucGqhoqBp4E23v7iX0D4r/br3H3fmETYh6qGJAjaOrrY\nU9fC9oONbKtpYmtNE1sOhBusuwfKAZTlp3P65GxmFmf2NFafUhQiNUkjqeVYI51i4l1Ad1vBOnf/\nzSjHd0KUCCTI3J2axqM9jdTrq+vZuL+BHQebenoxJSYY5QUZzJiURUVRqKc3VHlBiMLMFC0EFFAj\nSgQnGyUCkeO1dXSx/WATm/Y3sGl/A2/ta2BbTSO7DjfT3vn2/+OhlETKC0PMmJTJjJ7urlmU5KYr\nQUxww2os7l60fogbD3mNiMReSndvpMlZxxzv6OwKj6I+1MSOg03sONTMtoNNvL79MM+squ65Ljst\niVmTsyjLz6AsL9xIfc70PErzMsa6KBIHgzUWn2FmawY5b0DOKMcjIqMoKTGB8sIQ5YUhmHXsuYbW\ndjZ192TaW8/mA428tvUQT9fvobuiYGpOGueU51NRGGJ6fgbTCzIoLwxREFIV00QyWCIYdM6fiM7R\nCkRExlZWWjLnTM/jnOl5xxw/2tHJ1gNNLN8ZWf9hZy3Prqmmdy1yVlpSODkUhCgvyGB6QYjZJdnM\nmJSl6TbGIbURiMiQjnZ0sqe2hZ2Hm9lxsIntkW3noWb21LXQGWmozkpNYm5ZLjOLsygvDCeI6fkZ\nlOSlk6wR1XE10knnRCTgUpMSOaUok1OKMo+rYmrv7GLX4WZW765j5a7wmhBPLttFc9vbFQaJCUZJ\nbjrTC8LVS9PzQz3VTNPyM7RwUJwpEYjIiCT3mo/p+vmlwNvdXHccbGbnoaaeQXO7DjezdFU19a0d\nx9xjWn4Gs0uymV2Sw+mTsyjLCz9FaNrvsTHkb9nMQkBLZJGamYTbDn7p7u1DvFVEAsrMmJSVxqSs\nNBZW5B93vq65rSc57DzUzMZ9Dby55wjPvXnsONX8UApTc9OYmpPO1Nx0JmWnUpyVxpTcNE6fnE2+\npt4YFdGk21eAi80sD3gRWA7cRHgRexGRE5abkUJuRgpzy3KPOV7X3MbWmiaqapupqm2hqraFvUfC\n3V9f3XqIxqPHPklMykpl1uQsSvPSe5JFaV46pfkZTM5OU8N1lKJJBObuzZE5hr7l7l82szdiHZiI\nBE9uRgrnTE85ridTt6ajHRxoOMruw+GniA376tlyINwF9mBj2zHXJiUYpXnpTCsIMS0/nen5IaYV\nhCf0m5qbTnZakrrARkSVCCJTSt8G3H4C7xMRGVWh1CQqUsNdV3vP8ArQ2t7J3iOtVNU2s/twC7tr\nm9l1uJldh5pZtav2uHaJUEoiU3LTKcxMoSCUSl4omfxQKkWZKRRkpjIpK5UpuelMykqd8D2eovlC\n/zRwH/B0ZPbQU4DfxjQqEZETlJac2DOvUn+ONLez83C4wbq6roW9R1rZW9fK4aY2Nu5v4HBTG7XN\nbfTtUZ9gkJeRQk5GMrnpyRRkplKal05JpBpqck46U3PSKMhMHbdVUSc0jsDMEoBMd6+PXUiD0zgC\nEYmVjs4uapvbOdh4lP31rew70kr1kVYONx2lrrmduuZ2DjS0UlXbckz3WAh3kS3MTGFSVhpFWank\npCeTnZZETnoyhVmpFGWmUpSVytTcdIrj0H4xonEEZvZD4E7Co4hXADlm9nV3/8rohikiEl9JiQkU\nZYW/sM+Ykj3gde5OXXM7e+pa2Heklb31rew70sKB+qMcaAgnkc0HGqhv6aC+tf24p4zEBGNydhqT\nssNVUEVZ4d5Qk7JTmZSdRlGkaio/lDImS5tGUzVU6e71ZnYb8Bzwt4QTghKBiASSmZEXSiEvlMLs\nksGnXOvo7OJwcxsHG9rY39BKdV1LZGvlQEMr22qaeG3b4WPWmXj7cyAnPVwllZORwo3nlPL+86eP\nenmiSQTJZpYMXAv8m7u3m9n4mpdCRCROkhITesZUVDLwU0Zreyc1DeEnipqGo9Q0hn/WNbdR29xO\nXXNbzKqTokkE3wN2AKuBV8xsOhC3NgIRkYkoLTkxPA14/thP/T1kInD3B4EHex3aGVm1TEREJoAh\nWyHMLMfMvm5myyPb14D++2eJiMi4E01z9GNAA/B/Ils98P1YBiUiImMnmjaCU939fb32P29mq2IU\nj4iIjLFonghazOyi7h0zuxBoiebmZrbYzDaa2RYzu7ef87eZ2ZrI9qqZzY0+dBERGQ3RPBHcCfzA\nzLo7y9YCHxzqTWaWCHwbuByoApaZ2VJ3X9/rsu3AJe5ea2ZXAQ8D551IAUREZGSi6TW0GphrZtmR\n/Xoz+zQw2ML2AAuBLe6+DcDMngSuAXoSgbu/2uv614DSE4peRERGLOqxy+5e32uOoc9E8ZYSYHev\n/arIsYHcDvyyvxNmdkd3r6Wampqo4hURkegMdxKLaIa39XdNvyOSI+MSbic8fcXxb3J/2N0XuPuC\noqKi/i4REZFhGu66AtFMMVEFlPXaLwWq+15kZnOAR4Cr3P3QMOMREZFhGjARmFkD/X/hG5Aexb2X\nATPMrALYA9wM3NrnM6YBPwP+3N03RRu0iIiMngETgbtnjeTG7t5hZncBzwOJwGORhW3ujJx/CPi/\nQAHwnciScR0DzZctIiKxcUIL05wMtDCNiMiJG2xhmom9EKeIiAxJiUBEJOCUCEREAk6JQEQk4JQI\nREQCTolARCTglAhERAJOiUBEJOCUCEREAk6JQEQk4JQIREQCTolARCTglAhERAJOiUBEJOCUCERE\nAk6JQEQk4JQIREQCTolARCTglAhERAJOiUBEJOCUCEREAk6JQEQk4GKaCMxssZltNLMtZnZvP+dP\nN7M/mtlRM/ubWMYiIiL9S4rVjc0sEfg2cDlQBSwzs6Xuvr7XZYeBu4FrYxWHiIgMLpZPBAuBLe6+\nzd3bgCeBa3pf4O4H3H0Z0B7DOEREZBCxTAQlwO5e+1WRYyfMzO4ws+VmtrympmZUghMRkbBYJgLr\n55gP50bu/rC7L3D3BUVFRSMMS0REeotlIqgCynrtlwLVMfw8EREZhlgmgmXADDOrMLMU4GZgaQw/\nT0REhiFmvYbcvcPM7gKeBxKBx9x9nZndGTn/kJlNBpYD2UCXmX0aqHT3+ljFJSIix4pZIgBw9+eA\n5/oce6jX632Eq4xERCRONLJYRCTglAhERAJOiUBEJOCUCEREAk6JQEQk4JQIREQCTolARCTglAhE\nRAJOiUBEJOCUCEREAk6JQEQk4JQIREQCTolARCTglAhERAJOiUBEJOCUCEREAk6JQEQk4JQIREQC\nTolARCTglAhERAJOiUBEJOBimgjMbLGZbTSzLWZ2bz/nzcwejJxfY2bzYxmPiIgcL2aJwMwSgW8D\nVwGVwC1mVtnnsquAGZHtDuC7sYpHRET6F8sngoXAFnff5u5twJPANX2uuQb4gYe9BuSa2ZQYxiQi\nIn3EMhGUALt77VdFjp3oNSIiEkNJMby39XPMh3ENZnYH4aojgEYz23gCcRQCB0/g+okiiOUOYpkh\nmOUOYplhZOWePtCJWCaCKqCs134pUD2Ma3D3h4GHhxOEmS139wXDee94FsRyB7HMEMxyB7HMELty\nx7JqaBkww8wqzCwFuBlY2ueapcAHIr2HzgeOuPveGMYkIiJ9xOyJwN07zOwu4HkgEXjM3deZ2Z2R\n8w8BzwFXA1uAZuDDsYpHRET6F8uqIdz9OcJf9r2PPdTrtQOfiGUMDLNKaQIIYrmDWGYIZrmDWGaI\nUbkt/F0sIiJBpSkmREQCbkIngqGmuJgIzKzMzH5rZhvMbJ2ZfSpyPN/Mfm1mmyM/8+Id62gzs0Qz\ne8PMno3sB6HMuWb2lJm9Fflv/o6AlPuvIv++15rZj8wsbaKV28weM7MDZra217EBy2hm90W+2zaa\n2ZUj+ewJmwiinOJiIugA/trdzwDOBz4RKee9wIvuPgN4MbI/0XwK2NBrPwhl/ibwK3c/HZhLuPwT\nutxmVgLcDSxw99mEO5/czMQr9+PA4j7H+i1j5P/xm4EzI+/5TuQ7b1gmbCIguikuxj133+vuKyOv\nGwh/MZQQLusTkcueAK6NS4AxYmalwJ8Bj/Q6PNHLnA28E3gUwN3b3L2OCV7uiCQg3cySgAzC440m\nVLnd/RXgcJ/DA5XxGuBJdz/q7tsJ97xcONzPnsiJIHDTV5hZOXA28CeguHtMRuTnpDiGFgvfAD4L\ndPU6NtHLfApQA3w/UiX2iJmFmODldvc9wFeBXcBewuONXmCClztioDKO6vfbRE4EUU1fMVGYWSbw\nU+DT7l4f73hiyczeAxxw9xXxjmWMJQHzge+6+9lAE+O/OmRIkXrxa4AKYCoQMrP3xzequBvV77eJ\nnAiimr5iIjCzZMJJ4L/c/WeRw/u7Z3KN/DwQr/hi4EJgiZntIFzld6mZ/ScTu8wQ/jdd5e5/iuw/\nRTgxTPRyXwZsd/cad28HfgZcwMQvNwxcxlH9fpvIiSCaKS7GPTMzwnXGG9z9671OLQU+GHn9QeB/\nxjq2WHH3+9y91N3LCf93/Y27v58JXGYAd98H7DazWZFD7wbWM8HLTbhK6Hwzy4j8e3834bawiV5u\nGLiMS4GbzSzVzCoIr+ny+rA/xd0n7EZ4+opNwFbgc/GOJ0ZlvIjwI+EaYFVkuxooINzLYHPkZ368\nY41R+RcBz0ZeT/gyA/OA5ZH/3s8AeQEp9+eBt4C1wH8AqROt3MCPCLeBtBP+i//2wcoIfC7y3bYR\nuGokn62RxSIiATeRq4ZERCQKSgQiIgGnRCAiEnBKBCIiAadEICIScEoEIn2YWaeZreq1jdroXTMr\n7z27pMjJIKYrlImMUy3uPi/eQYiMFT0RiETJzHaY2b+Y2euR7bTI8elm9qKZrYn8nBY5XmxmT5vZ\n6sh2QeRWiWb275H59V8ws/S4FUoEJQKR/qT3qRq6qde5endfCPwb4RlQibz+gbvPAf4LeDBy/EHg\nZXefS3hOoHWR4zOAb7v7mUAd8L6YlkZkCBpZLNKHmTW6e2Y/x3cAl7r7tshEf/vcvcDMDgJT3L09\ncnyvuxeaWQ1Q6u5He92jHPi1hxcawcz+Fkh2938ag6KJ9EtPBCInxgd4PdA1/Tna63UnaquTOFMi\nEDkxN/X6+cfI61cJz4IKcBvw+8jrF4GPQ8/6ytljFaTIidBfIiLHSzezVb32f+Xu3V1IU83sT4T/\niLolcuxu4DEzu4fwCmIfjhz/FPCwmd1O+C//jxOeXVLkpKI2ApEoRdoIFrj7wXjHIjKaVDUkIhJw\neiIQEQk4PRGIiAScEoGISMApEYiIBJwSgYhIwCkRiIgEnBKBiEjA/X/Z0l6EtGqTAQAAAABJRU5E\nrkJggg==\n",
+            "text/plain": [
+              "\u003cFigure size 600x400 with 1 Axes\u003e"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "plt.plot(epochs, losses, label='Pre-training')\n",
+        "plt.ylim([0, max(plt.ylim())])\n",
+        "plt.xlabel('Epoch')\n",
+        "plt.ylabel('Loss [Cross Entropy]')\n",
+        "plt.legend();"
       ]
     },
     {
@@ -420,7 +451,7 @@
       "outputs": [],
       "source": [
         "SAVED_MODEL_DIR = \"saved_model\"\n",
-        "m = Model()\n",
+        "\n",
         "tf.saved_model.save(\n",
         "    m,\n",
         "    SAVED_MODEL_DIR,\n",
@@ -465,11 +496,80 @@
       "outputs": [],
       "source": [
         "interpreter = tf.lite.Interpreter(model_content=tflite_model)\n",
+        "interpreter.allocate_tensors()\n",
         "\n",
-        "train = interpreter.get_signature_runner(\"train\")\n",
-        "infer = interpreter.get_signature_runner(\"infer\")\n",
-        "save = interpreter.get_signature_runner(\"save\")\n",
-        "restore = interpreter.get_signature_runner(\"restore\")"
+        "infer = interpreter.get_signature_runner(\"infer\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "vTOM4alkteTO"
+      },
+      "source": [
+        "Compare the output of the original model, and the converted lite model:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "IDdaCmPEtE7P"
+      },
+      "outputs": [],
+      "source": [
+        "logits_original = m.infer(x=train_images[:1])['logits'][0]\n",
+        "logits_lite = infer(x=train_images[:1])['logits'][0]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "IpoZ1nTMKGEZ"
+      },
+      "outputs": [
+        {
+          "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEWCAYAAABv+EDhAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90\nbGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsT\nAAALEwEAmpwYAAAclElEQVR4nO3df5xVdb3v8dcbBEadAVQ6BKIOlZnyQ8hRK+ue4ZpUmv1QK8VS\n6pRplge1zrU8XbDsHu+J0nuPx59HQhPhnot1LaU6mU1Ul7pCoYC/01FGLBUVZkSQH5/7x1rQnmGG\n2QOz95rh+34+HvvB7LW/e30/3z3Deu/13WuvpYjAzMzSM6DoAszMrBgOADOzRDkAzMwS5QAwM0uU\nA8DMLFEOADOzRDkArDCSQtJbymw7S9Lt+c+HSmqTNDC/P1LSYkmtkr6jzPckvSzp/1VyDGb9mQPA\ndpJvXLfftkl6reT+2V08p1FSSzXqi4hnIqI2Irbmi84DXgSGRsSlwLuBk4AxEXFcNWqqtDzULpf0\njKT1khZIGrqL9pMk/VrSOkktkv7r7q6rjNrqJf1S0gZJj0h6b4fH3yDpDkmv5KE8b3f7st7lALCd\n5BvX2oioBZ4BTi1Z1hf/8x4GPBR//VbjYUBzRLza0xVJ2qdXK+s95wCfAk4ARgP7Av+yi/Z3AIuB\nA4G/BS6Q9KHdXFd35gN/BA4CLgcWSnpDyeM/AP5M9nv5G2D2HvRlvSkifPOtyxvQDLw3/3kIcA2w\nJr9dky/bH3gN2Aa05bfRwHHAEuAV4DngWmBwyboDeEsX/Y4FfgW0Aj/Pn3t7/lh9/tx9gLnAZuD1\nvN/PAxuBrfn9K/LnfBBYntfyf4GJHcb4X4AHgU35et+Rt3sFeABoLGnfBHwT+G1e338AI0oef3fJ\nc1cD00tev9lkofoX4AZg3zJ/DwuBr5Tcf1c+zv26aL8BOKrk/v8GvlrOuoBhwC357+xZ4EpgYBf9\nvDV/zepKlv0aOD//eWr++nb6fN+KvXkPwHricrIN4yTgaLIN/D9G9k77A8Ca+OuewhqyjfDFwAjg\nncCJwBfK7OsOYFn+3G8C53bWKCKmA/OAf877vRE4H1iS358p6e3AHLJwOAi4EfiRpCElqzoLOAUY\nDowE7iHb8B0IfBm4s8O72mnAp8ne0Q7O2yDpUOAnZO+o35C/Vsvz5/x3sg3mJOAtwMFA6dTMK5Le\n3cXrofxWen8IcHgX7a8BzpE0SNIRZK//vWWu61ZgS17jZLKN+Ge76Gcc8GREtJYseyBfDtnfy6PA\nrZLWSrpf0t92sS6rMgeA9cTZwDci4vmIeAG4gmwqoVMRsSwifhcRWyKimWzD2+1//nwjeizw9YjY\nFBGLgR/vQd2fA26MiN9HxNaIuJXsXes7Str8z4hYHRGvAZ8EFkXEoojYFhE/B5YCJ5e0/15EPJa3\n/3eyjTpkr9G9ETE/IjZHxNqIWC5JeR0XR8RL+QbzvwFnbl9hRAyPiN90MYafAJ/N59uHke2xAOzX\nRfu7gTPI9sweAW6JiPu7W5ekkWRhPiMiXo2I54GrS+vsoBZY12HZOqAu/3kMWYD8Engj8B3gLkkj\nulifVZEDwHpiNPB0yf2n82WdkvRWSXdL+rOk9WQbvHL+448GXo72c/hPd9W4DIcBl+bvsF+R9Apw\nSIfaV3do/7EO7d8NjCpp8+eSnzeQbQjJ1/unTmp4A9nGelnJOn+aLy/HHLK59iZgFdkGFWCnD94l\nHZiv+xtATV7T+yRt3/va1boOAwYBz5XUeSPZng6SVpUcEPAesmm2jh8gDyWbGoMsgJoj4pY8EBeQ\nvdYnlDluqyAHgPXEGrINxHaH5ssgm5Pv6Hqyd5+HR8RQ4Gu0n3roynPAAZL279DX7loNfCt/h739\ntl9EzC9pEx3af79D+/0j4qoy+3pzJ8tfJNsYjitZ57DIPmjvVr4nMjMi6iNiDNmG+9n81tGbgK0R\ncVu+99UCLCDfg+lmXavJ9o5GlNQ5NCLG5c8dVzLN9+v8uW+SVFfS/9H5csg+V/Eph/soB4D1xHzg\nH/PD+kaQzV/fnj/2F+CgfEphuzpgPdAm6W3ABeV0EhFPk025XCFpcD4vfuoe1H0zcL6k4/NDIPeX\ndEqHjVap24FTJb1P0kBJNflhrmPK6Gse8F5JH5e0j6SDJE2KiG15HVdL2v5u+mBJ7ytnAJIOlPTm\nvP6jgO+STcdt66T5Y9lTNE3SAElvBD5BNje/y3VFxHNkH2p/R9LQ/Plv7mrePiIeI/uMY2b+On0U\nmAjcmTf5IVmYn5u/lmeQffbx23LGbZXlALCeuJJsw/wgsAL4Q76MiHiELCCezKcORpN9MDqNbDrg\nZuB/9aCvacDxwEvATOC23S06IpaSzb9fC7wMPAFM30X71cCHyfZYXiB7V/wVyvj/EhHPkL3TvjSv\nfTnZO2LI5tqfAH6XT4ndCxyx/bkl0yqdGQEsAl4lm8OfExE3lTz3Bkk35DWsB04j+wD+5byGlcC3\nylkX2WGig4GH8ucvpP30V0dnAg1526uAM/LPiIiIl4APkf0trAMuAz4cES/uYn1WJYrw3pmZWYq8\nB2BmligHgJlZohwAZmaJcgCYmSWqr574qlMjRoyI+vr6ivfz6quvsv/++3ffcC/ksac39lTHDemM\nfdmyZS9GxE5fOuxXAVBfX8/SpUsr3k9TUxONjY0V76cv8tgbiy6j6lIdN6QzdkmdfpPeU0BmZoly\nAJiZJcoBYGaWqH71GUBnNm/eTEtLCxs3buy1dQ4bNoyHH36419bXF9TU1DBmzBgGDRpUdClm1kf0\n+wBoaWmhrq6O+vp6slOu77nW1lbq6ro6T1j/ExGsXbuWlpYWxo4dW3Q5ZtZH9PspoI0bN3LQQQf1\n2sZ/bySJgw46qFf3ksys/+v3AQB4418Gv0Zm1tFeEQBmZtZz/f4zgI7qL7unV9fXfNUp3bZpaWnh\nwgsv5KGHHmLbtm188IMf5Nvf/jaDBw9u127NmjVcdNFFLFy4cJfrO/nkk7njjjsYPnx4j+udNWsW\ntbW1fPnLX+7xc81sz/VkG9RcM638Fc/qeOnlPec9gD0UEZx22ml85CMf4fHHH+exxx6jra2Nyy+/\nvF27LVu2MHr06G43/gCLFi3arY2/mVlPOAD20H333UdNTQ2f/vSnARg4cCBXX301c+bM4brrruNj\nH/sYp556KlOnTqW5uZnx48cDsGHDBj7+8Y8zceJEPvGJT3D88cfvOM1FfX09L774Is3NzRx55JF8\n7nOfY9y4cUydOpXXXnsNgJtvvpljjz2Wo48+mtNPP50NGzYU8wKYWb9V8QCQNEfS85JWliybJelZ\nScvz28mVrqNSVq1axTHHHNNu2dChQzn00EPZsmULS5Ys4dZbb+W+++5r1+a6667jgAMO4MEHH+Tr\nX/86y5Yt63T9jz/+OBdeeCGrVq1i+PDh3HlndqnV0047jfvvv58HHniAI488kltuuaUyAzSzvVY1\n9gDmAu/vZPnVETEpvy2qQh0VERGdHmGzfflJJ53EgQceuNPjv/nNbzjzzDMBGD9+PBMnTux0/WPH\njmXSpEkAHHPMMTQ3NwOwcuVK3vOe9zBhwgTmzZvHqlWremdAZpaMigdARCwmuzj2XmncuHE7naF0\n/fr1rF69moEDB3Z5qtlyr8U8ZMiQHT8PHDiQLVu2ADB9+nSuvfZaVqxYwcyZM32Mv5n1WJFHAX1R\n0jnAUuDSiHi5s0aSzgPOAxg5ciRNTU3tHh82bBitra0VK7K7dR933HG0tbVx4403Mm3aNLZu3cqM\nGTOYNm0aAwYM4PXXX9+xjra2NrZt20ZrayvHHnss8+bNo6GhgUceeYQVK1bw6quv0traSkTQ1tbW\nrj3Apk2b2LRpE62traxfv566ujpeeuklbrvtNkaNGkVrayubNm1i0KBBnda9cePGnV6/jtra2rpt\ns7dKdeypjhsqM/ZLJ2wpu23TgCvKX3EFfkdFBcD1wDeByP/9DvCZzhpGxE3ATQANDQ3R8dzdDz/8\ncLvTNpRz2GZ3enoqiLvuuosvfOELzJ49m23btnHyyScze/Zs5s+fz+DBg3esq7a2lgEDBlBXV8fF\nF1/MueeeywknnMDkyZOZOHEio0ePpq6uDknU1tYC7GgP2d7A5s2bqaur48orr+TEE0/ksMMOY8KE\nCTtqHjJkCEOGDOm0/pqaGiZPnrzLsaRyfvTOpDr2VMcNlRn79B4dBjqz/BWf1fuHgRYSABHxl+0/\nS7oZuLuIOnrLIYccwo9//OOdlk+fPp3p06fvuF9fX8/Kldln4TU1Ndx+++3U1NTwpz/9acfGHNgx\nzz9ixIgd7YF2x/ZfcMEFXHDBBTv1OWvWrF4YkZmloJAAkDQqIp7L734UWLmr9nujDRs2MGXKFDZv\n3kxEcP311+/0xTEzs0qqeABImg80AiMktQAzgUZJk8imgJqBz1e6jr6mrq6uKpe3NDPrSsUDICLO\n6mSxD1o3MyuYvwlsZpYoB4CZWaIcAGZmidrrTgfNrGF7vIp2R9CXcQrW2tpa2tra2i274YYb2G+/\n/TjnnHOYO3cuU6dOZfTo0Xtcm5lZb9n7AqCPOP/883f8PHfuXMaPH+8AMLM+xQFQIdsvzFJfX8/S\npUs5++yz2XfffVmyZAkPPfQQl1xyCW1tbYwYMYK5c+cyatSooks2s8T4M4AKO+OMM2hoaGDevHks\nX76cffbZhy996UssXLiQZcuW8ZnPfGani8eYmVWD9wCq7NFHH2XlypWcdNJJAGzdutXv/s2sEA6A\nKosIxo0bx5IlS4ouxcwS5ymgKqirq9txeuYjjjiCF154YUcAbN682RdzMbNC7H17AGUcttmdnp4O\nesOGDYwZM2bH/UsuuaTd49OnT+f888/f8SHwwoULueiii1i3bh1btmxhxowZjBs3bo/rNjPrib0v\nAAqwbdu2XT5++umnc/rpp++4P2nSJBYvXlzpsszMdslTQGZmiXIAmJklaq8IgHIvsJ4yv0Zm1lG/\nD4CamhrWrl3rDdwuRARr166lpqam6FLMrA/p9x8CjxkzhpaWFl544YVeW+fGjRv3uo1lTU1NuyOV\nzMz6fQAMGjSIsWPH9uo6m5qamDx5cq+u08ysr+n3U0BmZrZ7HABmZolyAJiZJcoBYGaWKAeAmVmi\nHABmZolyAJiZJcoBYGaWKAeAmVmiHABmZolyAJiZJcoBYGaWKAeAmVmiHABmZolyAJiZJariASBp\njqTnJa0sWXagpJ9Lejz/94BK12FmZu1VYw9gLvD+DssuA34REYcDv8jvm5lZFVU8ACJiMfBSh8Uf\nBm7Nf74V+Eil6zAzs/ZUjYupS6oH7o6I8fn9VyJieMnjL0dEp9NAks4DzgMYOXLkMQsWLKh4vW1t\nbdTW1la8n77IY09v7KmOGyoz9hXPriu77YQBT5W/4lGTel5MbsqUKcsioqHj8j5/TeCIuAm4CaCh\noSEaGxsr3mdTUxPV6Kcv8tgbiy6j6lIdN1Rm7NMvu6fsts01M8tf8VnlB0u5ijoK6C+SRgHk/z5f\nUB1mZskqKgB+BJyb/3wucFdBdZiZJasah4HOB5YAR0hqkfR3wFXASZIeB07K75uZWRVV/DOAiDir\ni4dOrHTfZmbWNX8T2MwsUQ4AM7NEOQDMzBLlADAzS5QDwMwsUQ4AM7NEOQDMzBLV588FZGb9T32P\nzoczrfwVz+r98+GkzHsAZmaJcgCYmSXKAWBmligHgJlZohwAZmaJcgCYmSXKAWBmlih/D8CsQnws\nvPV13gMwM0uUA8DMLFEOADOzRDkAzMwS5QAwM0uUA8DMLFEOADOzRDkAzMwS5QAwM0uUA8DMLFEO\nADOzRDkAzMwS5QAwM0uUA8DMLFEOADOzRDkAzMwS5QAwM0tUoVcEk9QMtAJbgS0R0VBkPWZmKekL\nl4ScEhEvFl2EmVlqypoCknRCOcvMzKz/UER030j6Q0S8vbtlPe5cegp4GQjgxoi4qZM25wHnAYwc\nOfKYBQsW7EmXZWlra6O2trbi/fRFHnvvjX3Fs+VfvH3CgKfKX/GoST0vZhcq8Tv32MtTrbFPmTJl\nWWdT7LsMAEnvBN4FzACuLnloKPDRiDh6tyvK1j86ItZI+hvg58CXImJxV+0bGhpi6dKle9JlWZqa\nmmhsbKx4P32Rx97Ya+urv+yests210wrf8Wzyt/AlKMSv3OPvTzVGrukTgOguymgwUAt2WcFdSW3\n9cAZu11NLiLW5P8+D/wQOG5P12lmZuXZ5YfAEfEr4FeS5kbE073ZsaT9gQER0Zr/PBX4Rm/2YWZm\nXdtlAEi6JiJmANdK2mmuKCI+tAd9jwR+KGl7HXdExE/3YH1mZtYD3R0G+v3839m93XFEPAns0WcI\nZma2+7qbAlqW//ur6pRjZmbVUtYXwSStIDtUs9Q6YClwZUSs7e3CzMysssr9JvBPyE7XcEd+/0xA\nZCEwFzi11yszM7OKKjcAToiI0m/+rpD024g4QdInK1GYmZlVVrlnA62VdPz2O5KOI/t+AMCWXq/K\nzMwqrtw9gM8CcyTVkk39rAf+Lj9+/58qVZyZmVVOWQEQEfcDEyQNIzt9xCslD/97JQozM7PKKvds\noMMkfRf4BXCvpO/kYWBmZv1UuZ8BzCG7cMvH89t64HuVKsrMzCqv3M8A3hwRp5fcv0LS8grUY2Zm\nVVLuHsBrkt69/U5+MZjXKlOSmZlVQ7l7AOcDt5XM+78MnFuZkszMrBrKPQroAeBoSUPz++slzQAe\nrGBtZmZWQeVOAQHZhj8i1ud3L6lAPWZmViU9CoAO1GtVmJlZ1e1JAHR/NXkzM+uzursiWCudb+gF\n7FuRiszMrCq6uyBMXbUKMTOz6tqTKSAzM+vHHABmZolyAJiZJarcbwKbmfUL9ZfdU3bbuUc/DrM+\nXF7jWet2s6K+y3sAZmaJcgCYmSXKAWBmligHgJlZohwAZmaJcgCYmSXKAWBmligHgJlZohwAZmaJ\ncgCYmSXKAWBmlqhCA0DS+yU9KukJSZcVWYuZWWoKCwBJA4F/BT4AHAWcJemoouoxM0tNkXsAxwFP\nRMSTEfE6sAAo87R8Zma2pxRRzLXdJZ0BvD8iPpvf/xRwfER8sUO784DzAEaOHHnMggULdqu/Fc+W\nfyrXsftuonbTmvIaj5rU6/1PGPBU2W3L6b/IsRc57p72X4nfe1FSHXdPtbW1UVtbW3QZFTdlypRl\nEdHQcXmR1wNQJ8t2SqOIuAm4CaChoSEaGxt3q7PpPTpH+FM0PjqzvMZnlfcfrSf9N9eU2XeZ/Rc5\n9iLH3dP+K/F7L0qq4+6ppqYmdnebsjcocgqoBTik5P4YoMy3IWZmtqeKDID7gcMljZU0GDgT+FGB\n9ZiZJaWwKaCI2CLpi8DPgIHAnIhYVVQ9ZmapKfSawBGxCFhUZA1mZqnyN4HNzBLlADAzS5QDwMws\nUYV+BmBmldF81Sllt21qatrrju+38iQTAP4PYWbWnqeAzMwS5QAwM0uUA8DMLFEOADOzRDkAzMwS\n5QAwM0uUA8DMLFEOADOzRDkAzMwS5QAwM0uUA8DMLFEOADOzRDkAzMwS5QAwM0uUA8DMLFEOADOz\nRDkAzMwS5QAwM0uUA8DMLFEOADOzRDkAzMwS5QAwM0uUA8DMLFEOADOzRDkAzMwS5QAwM0uUA8DM\nLFEOADOzRBUSAJJmSXpW0vL8dnIRdZiZpWyfAvu+OiJmF9i/mVnSPAVkZpYoRUT1O5VmAdOB9cBS\n4NKIeLmLtucB5wGMHDnymAULFlS8vra2Nmpra3t1nSueXVd22wkDnip/xaMm9WrfY/fdRO2mNYX0\n3dvj7mn/vT32/qISf+/9RSpjnzJlyrKIaOi4vGIBIOle4I2dPHQ58DvgRSCAbwKjIuIz3a2zoaEh\nli5d2qt1dqapqYnGxsZeXWf9ZfeU3ba5Zlr5K57V/QauJ33PPfpxGh+dWUjfvT3unvbf22PvLyrx\n995fpDJ2SZ0GQMU+A4iI95bTTtLNwN2VqsPMzDpX1FFAo0rufhRYWUQdZmYpK+oooH+WNIlsCqgZ\n+HxBdZiZJauQAIiITxXRr5mZ/ZUPAzUzS5QDwMwsUQ4AM7NEOQDMzBLlADAzS1SRJ4Mzq7jmq04p\nu21TUxOctfd8w9esOw6ABHgjaGad8RSQmVmiHABmZolyAJiZJcoBYGaWKAeAmVmiHABmZolyAJiZ\nJcoBYGaWKAeAmVmiHABmZonyqSCsonpyGgrwKSjMqskBUCXeEJpZX+MpIDOzRDkAzMwS5QAwM0uU\nA8DMLFEOADOzRDkAzMwS5QAwM0uUA8DMLFEOADOzRCkiiq6hbJJeAJ6uQlcjgBer0E9f5LGnJ9Vx\nQzpjPywi3tBxYb8KgGqRtDQiGoquowgee3pjT3XckPbYwVNAZmbJcgCYmSXKAdC5m4ouoEAee3pS\nHTekPXZ/BmBmlirvAZiZJcoBYGaWKAdAB5LeL+lRSU9IuqzoeqpB0iGSfinpYUmrJP190TVVm6SB\nkv4o6e6ia6kmScMlLZT0SP77f2fRNVWLpIvzv/eVkuZLqim6pmpzAJSQNBD4V+ADwFHAWZKOKraq\nqtgCXBoRRwLvAC5MZNyl/h54uOgiCvA/gJ9GxNuAo0nkNZB0MHAR0BAR44GBwJnFVlV9DoD2jgOe\niIgnI+J1YAHw4YJrqriIeC4i/pD/3Eq2ETi42KqqR9IY4BTg34qupZokDQX+E3ALQES8HhGvFFpU\nde0D7CtpH2A/YE3B9VSdA6C9g4HVJfdbSGhDCCCpHpgM/L7gUqrpGuAfgG0F11FtbwJeAL6XT3/9\nm6T9iy6qGiLiWWA28AzwHLAuIv6j2KqqzwHQnjpZlsxxspJqgTuBGRGxvuh6qkHSB4HnI2JZ0bUU\nYB/g7cD1ETEZeBVI5XOvA8j27scCo4H9JX2y2KqqzwHQXgtwSMn9MSSyWyhpENnGf15E/KDoeqro\nBOBDkprJpvz+s6Tbiy2palqAlojYvre3kCwQUvBe4KmIeCEiNgM/AN5VcE1V5wBo737gcEljJQ0m\n+1DoRwXXVHGSRDYP/HBEfLfoeqopIr4aEWMiop7s931fRCTxTjAi/gyslnREvuhE4KECS6qmZ4B3\nSNov//s/kUQ+AC+1T9EF9CURsUXSF4GfkR0VMCciVhVcVjWcAHwKWCFpeb7saxGxqLiSrEq+BMzL\n3/A8CXy64HqqIiJ+L2kh8Aeyo+D+SIKnhfCpIMzMEuUpIDOzRDkAzMwS5QAwM0uUA8DMLFEOADOz\nRDkALHmS3ihpgaQ/SXpI0iJJb5W0shf7mCvpjPznpvyMsw/mZ+G8VtLw3urLrFwOAEta/iWgHwJN\nEfHmiDgK+BowssJdnx0RE4GJwCbgrgr3Z7YTB4ClbgqwOSJu2L4gIpZTclJASfWSfi3pD/ntXfny\nUZIWS1qen1P+Pfl1Bebm91dIunhXnednnf0H4FBJR1dkhGZd8DeBLXXjge5OBPc8cFJEbJR0ODAf\naACmAT+LiG/l15LYD5gEHJyfY55ypnYiYqukB4C3AQ/s7kDMesoBYNa9QcC1kiYBW4G35svvB+bk\nJ9L7PxGxXNKTwJsk/QtwD1DuKYY7OxOtWUV5CshStwo4pps2FwN/IbtiVgMwGCAiFpNdUOVZ4PuS\nzomIl/N2TcCFlHGRmXzvYQIJnozMiuUAsNTdBwyR9LntCyQdCxxW0mYY8FxEbCM7ad7AvN1hZNcS\nuJnsbKpvlzQCGBARdwJfp5vTK+d7D/8ErI6IB3tvWGbd8xSQJS0iQtJHgWskXQZsBJqBGSXNrgPu\nlPQx4JdkF04BaAS+Imkz0AacQ3YFue9J2v7m6qtddD1P0iZgCHAvCVx61Poenw3UzCxRngIyM0uU\nA8DMLFEOADOzRDkAzMwS5QAwM0uUA8DMLFEOADOzRP1/Zb13QFXGF0UAAAAASUVORK5CYII=\n",
+            "text/plain": [
+              "\u003cFigure size 600x400 with 1 Axes\u003e"
+            ]
+          },
+          "metadata": {},
+          "output_type": "display_data"
+        }
+      ],
+      "source": [
+        "#@title\n",
+        "def compare_logits(logits):\n",
+        "  width = 0.35\n",
+        "  offset = width/2\n",
+        "  assert len(logits)==2\n",
+        "\n",
+        "  keys = list(logits.keys())\n",
+        "  plt.bar(x = np.arange(len(logits[keys[0]]))-offset,\n",
+        "      height=logits[keys[0]], width=0.35, label=keys[0])\n",
+        "  plt.bar(x = np.arange(len(logits[keys[1]]))+offset,\n",
+        "      height=logits[keys[1]], width=0.35, label=keys[1])\n",
+        "  plt.legend()\n",
+        "  plt.grid(True)\n",
+        "  plt.ylabel('Logit')\n",
+        "  plt.xlabel('ClassID')\n",
+        "\n",
+        "  delta = np.sum(np.abs(logits[keys[0]] - logits[keys[1]]))\n",
+        "  plt.title(f\"Total difference: {delta:.3g}\")\n",
+        "\n",
+        "compare_logits({'Original': logits_original, 'Lite': logits_lite})"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ARUb37Hqa0Az"
+      },
+      "source": [
+        "Above, you can see that the behavior of the model is not changed by the conversion to TFLite."
       ]
     },
     {
@@ -484,8 +584,15 @@
         "\n",
         "Note: Since training tasks are resource intensive, you should consider performing them when users are not actively interacting with the device, and as a background process. Consider using the [WorkManager](https://developer.android.com/topic/libraries/architecture/workmanager) API to schedule model retraining as an asynchronous task.\n",
         "\n",
-        "On Android, you can perform on-device training with TensorFlow Lite using either Java or C++ APIs. In Java, use the `Interpreter` class to load a model and drive model training tasks. The following example shows how to run the training procedure using the `runSignature` method:\n",
-        "\n",
+        "On Android, you can perform on-device training with TensorFlow Lite using either Java or C++ APIs. In Java, use the `Interpreter` class to load a model and drive model training tasks. The following example shows how to run the training procedure using the `runSignature` method:\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "qvbqRxnNs4NG"
+      },
+      "source": [
         "```Java\n",
         "try (Interpreter interpreter = new Interpreter(modelBuffer)) {\n",
         "    int NUM_EPOCHS = 100;\n",
@@ -535,102 +642,16 @@
         "\n",
         "    // ...\n",
         "}\n",
-        "```\n",
-        "\n",
-        "You can see a complete code example of model retraining inside an Android app in the [model personalization demo app](https://github.com/tensorflow/examples/blob/master/lite/examples/model_personalization/android/transfer_api/src/main/java/org/tensorflow/lite/examples/transfer/api/LiteMultipleSignatureModel.java)."
+        "```\n"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {
-        "id": "nG6tKCGZFVdi"
-      },
-      "source": [
-        "To continue training in TensorFlow Lite, let's first load back the checkpoint produced in the previous training step:"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "gjCwJNqFFg0p",
-        "outputId": "57c534ba-06cb-4e26-cd25-0f478bf582f7"
+        "id": "cPEzyHAZs7Gl"
       },
-      "outputs": [
-        {
-          "data": {
-            "text/plain": [
-              "{'dense_1/bias:0': array([-7.67073184e-02, -1.21641919e-01,  4.14139330e-02, -2.90217489e-01,\n",
-              "        -4.34946269e-01, -1.53480902e-01,  2.02373281e-01,  4.13616449e-01,\n",
-              "         2.50497848e-01,  3.43462646e-01, -3.96950319e-02,  1.52519522e-02,\n",
-              "         2.49562711e-02, -1.55837715e-01, -3.16922516e-01, -5.62852323e-02,\n",
-              "        -1.76316097e-01,  2.87876248e-01,  4.48857918e-02, -3.90259027e-02,\n",
-              "        -2.69743860e-01, -4.55486029e-01,  2.64676839e-01,  1.04185253e-01,\n",
-              "         2.97393054e-02, -3.33435744e-01, -3.89613062e-02, -2.91077465e-01,\n",
-              "         7.05280155e-02,  3.18744779e-02, -1.16621424e-02,  3.46110873e-02,\n",
-              "         2.70473305e-03, -1.98793784e-01, -1.23570330e-01,  1.16852365e-01,\n",
-              "        -2.23538131e-01,  4.12918150e-01, -3.03259581e-01, -1.99832648e-01,\n",
-              "         1.52513504e-01,  3.15144747e-01,  3.00966620e-01, -1.00083448e-01,\n",
-              "         9.55451950e-02, -1.57265306e-01,  3.52542669e-01,  1.29614398e-01,\n",
-              "         3.26297469e-02, -3.68497908e-01,  1.42719388e-01,  2.72650450e-01,\n",
-              "         2.69632284e-02,  1.37559623e-01,  2.46696085e-01,  1.71969056e-01,\n",
-              "         1.19491518e-01, -1.31593585e-01, -1.63813666e-01,  7.23824799e-02,\n",
-              "         1.61473617e-01,  5.33181906e-01, -2.75753364e-02,  4.44153160e-01,\n",
-              "         8.46541375e-02,  3.36195767e-01,  3.26056778e-02,  3.92714828e-01,\n",
-              "         3.89477015e-02, -3.28640640e-01,  9.64739323e-02, -1.01042725e-02,\n",
-              "         2.30138421e-01, -4.07512598e-02, -6.15786470e-04, -1.46631449e-02,\n",
-              "        -1.36841565e-01,  1.49732353e-02, -1.89761370e-01, -1.26753841e-02,\n",
-              "         4.88439389e-02,  4.62675467e-02, -1.90456375e-01, -5.31349368e-02,\n",
-              "         3.48409154e-02,  1.93095192e-01, -4.83356506e-01, -2.59570003e-01,\n",
-              "         7.41846859e-02,  9.38728079e-03, -3.04004701e-04, -1.36721238e-01,\n",
-              "        -1.16710924e-02, -2.00157687e-02, -7.50867277e-02, -3.15536827e-01,\n",
-              "         4.39007461e-01,  4.55190897e-01, -1.07847728e-01, -8.64643753e-02,\n",
-              "        -1.01913996e-02,  9.17089507e-02,  1.69289440e-01,  1.00701176e-01,\n",
-              "        -9.03776381e-03,  2.51386195e-01,  1.40138015e-01,  3.12402308e-01,\n",
-              "         6.00705966e-02, -7.93265551e-02, -4.15610790e-01,  1.43614545e-01,\n",
-              "         4.35782447e-02, -9.56265163e-03, -2.26040453e-01,  5.05020507e-02,\n",
-              "        -9.90978070e-03,  2.14746892e-01, -1.18627608e-01,  1.22824619e-02,\n",
-              "         1.98436931e-01,  1.31338120e-01,  9.98383909e-02, -1.14558823e-02,\n",
-              "         5.15056774e-02, -3.36984848e-03, -1.63100529e-02,  8.35391134e-02],\n",
-              "       dtype=float32),\n",
-              " 'dense_1/kernel:0': array([[ 0.02093004, -0.06344204,  0.00277872, ..., -0.00021458,\n",
-              "         -0.06049109,  0.03728996],\n",
-              "        [ 0.03705189,  0.04696848,  0.06514321, ...,  0.05242993,\n",
-              "          0.01620064,  0.02033125],\n",
-              "        [ 0.07786449, -0.030255  ,  0.07235963, ..., -0.00665735,\n",
-              "          0.02599101, -0.04836893],\n",
-              "        ...,\n",
-              "        [-0.00411804,  0.0267068 ,  0.02204224, ..., -0.07286698,\n",
-              "         -0.06099217, -0.01177335],\n",
-              "        [ 0.02460986,  0.01525712, -0.01274556, ...,  0.07480299,\n",
-              "          0.05886368, -0.03610951],\n",
-              "        [ 0.01595567, -0.03645648,  0.03281055, ..., -0.04087327,\n",
-              "         -0.06164488,  0.02983134]], dtype=float32),\n",
-              " 'dense_2/bias:0': array([ 0.17403616, -0.21789859, -0.05201611,  0.32887268, -0.34431872,\n",
-              "         0.5943497 , -0.01262981,  0.16121244, -0.05095961, -0.58063805],\n",
-              "       dtype=float32),\n",
-              " 'dense_2/kernel:0': array([[-0.5706187 ,  0.22103083, -0.20097178, ..., -0.7579967 ,\n",
-              "          0.11828423, -0.2794073 ],\n",
-              "        [ 0.02920151, -0.17417166, -0.0385471 , ..., -0.8737432 ,\n",
-              "         -0.13760662,  0.41177538],\n",
-              "        [ 0.25003332,  0.03562389, -0.10078474, ..., -0.1751328 ,\n",
-              "         -0.03559012, -0.07295281],\n",
-              "        ...,\n",
-              "        [-0.13618511, -0.13016236, -0.12449092, ...,  0.1511517 ,\n",
-              "          0.17886253, -0.04126478],\n",
-              "        [-0.19565038,  0.14290941,  0.12058044, ..., -0.10880192,\n",
-              "          0.11445104, -0.19768406],\n",
-              "        [-0.7715236 , -0.00254173, -0.0163954 , ..., -0.05197521,\n",
-              "         -0.1495111 , -0.1617497 ]], dtype=float32)}"
-            ]
-          },
-          "execution_count": 40,
-          "metadata": {},
-          "output_type": "execute_result"
-        }
-      ],
       "source": [
-        "restore(checkpoint_path=np.array(\"/tmp/model.ckpt\", dtype=np.string_))"
+        "You can see a complete code example of model retraining inside an Android app in the [model personalization demo app](https://github.com/tensorflow/examples/blob/master/lite/examples/model_personalization/android/transfer_api/src/main/java/org/tensorflow/lite/examples/transfer/api/LiteMultipleSignatureModel.java)."
       ]
     },
     {
@@ -646,29 +667,54 @@
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "pjQ5xrhyGcIQ",
-        "outputId": "cf665d5f-0808-4823-ad74-6031a7824d53"
+        "id": "pjQ5xrhyGcIQ"
       },
       "outputs": [
         {
           "name": "stdout",
           "output_type": "stream",
           "text": [
-            "Finished 1 epochs, current loss: 5.6542158126831055\n",
-            "Finished 2 epochs, current loss: 5.654355049133301\n",
-            "Finished 3 epochs, current loss: 5.653238296508789\n",
-            "Finished 4 epochs, current loss: 5.6533918380737305\n",
-            "Finished 5 epochs, current loss: 5.65273904800415\n",
-            "Finished 6 epochs, current loss: 5.651922702789307\n",
-            "Finished 7 epochs, current loss: 5.6514458656311035\n",
-            "Finished 8 epochs, current loss: 5.6505255699157715\n",
-            "Finished 9 epochs, current loss: 5.649590492248535\n",
-            "Finished 10 epochs, current loss: 5.649048328399658\n"
+            "Finished 10 epochs\n",
+            "  loss: 0.223\n",
+            "Finished 20 epochs\n",
+            "  loss: 0.216\n",
+            "Finished 30 epochs\n",
+            "  loss: 0.210\n",
+            "Finished 40 epochs\n",
+            "  loss: 0.204\n",
+            "Finished 50 epochs\n",
+            "  loss: 0.198\n"
           ]
-        },
+        }
+      ],
+      "source": [
+        "train = interpreter.get_signature_runner(\"train\")\n",
+        "\n",
+        "NUM_EPOCHS = 50\n",
+        "BATCH_SIZE = 100\n",
+        "more_epochs = np.arange(epochs[-1]+1, epochs[-1] + NUM_EPOCHS + 1, 1)\n",
+        "more_losses = np.zeros([NUM_EPOCHS])\n",
+        "\n",
+        "\n",
+        "for i in range(NUM_EPOCHS):\n",
+        "  for x,y in train_ds:\n",
+        "    result = train(x=x, y=y)\n",
+        "  more_losses[i] = result['loss']\n",
+        "  if (i + 1) % 10 == 0:\n",
+        "    print(f\"Finished {i+1} epochs\")\n",
+        "    print(f\"  loss: {more_losses[i]:.3f}\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "vX7dQXx_iPuv"
+      },
+      "outputs": [
         {
           "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAD4CAYAAADlwTGnAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90\nbGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsT\nAAALEwEAmpwYAAAktElEQVR4nO3dd3hUdd7+8fdnUggJIbQkgDQpQXqLFBEXFMW22LAAggVFXMTy\nqD/dfR636q6rrq6sAgJ2FF3bqouLuqw0aSZI702qJKEmoaR9f39kUMkGMoGQM5O5X9flNcOc8525\nZy5z7pnvzDnHnHOIiEj48XkdQEREvKECEBEJUyoAEZEwpQIQEQlTKgARkTAV6XWA8qhXr55r1qyZ\n1zFEREJKenp6lnMuseTtIVUAzZo1Iy0tzesYIiIhxcy+K+12TQGJiIQpFYCISJhSAYiIhCkVgIhI\nmFIBiIiEKRWAiEiYUgGIiISpkNoPIBQVFjm27zvEpsxcNmXl0rVJLbo0qe11LBERFUBFOXA4n02Z\nOWzMzGVTZo5/g5/DlqxD5BUW/bBetUgf7486jw6NEjxMKyKiAiiXgsIitu07/MMGfuNPNvRZOXk/\nrBfpM5rUjaV5vRr0a51Ei8QaNE+Mo3ZcNMNfXsTIN9P45J7zSYyv5uGzEZFwpwIoxf5DeWz86QY+\nM4dNWbl8tyeX/MIfz6BWJy6aFolxXHROMs0T42ju39A3qRNLVETpX6+8NKwbgybM4+4p6bx9Z0+i\nI/U1jIh4I2wLIL+wiK17D/24gT+2wc/KZW/uj+/moyKMpnXjaF4vjv5tkmnh39C3SIyjVmx0uR+3\n/VkJPD2oE2OmfstvPlnBH6/pgJlV5FMTEQlIWBTA2u+zWbptPxuPzdFn5bB1zyEKin58N1+vRjTN\n69XgkrbJP0zZNE+sQePa1Yk8wbv5U/XzTg1Zvesg42ZupG3DBIb1bFqh9y8iEoiwKIA3F2xhyoKt\nREf4aFo3lpSkeC5tV/+HKZsW9WqQEBtVqZkevKQ1a77P5nefrKRVUg16Nq9bqY8vImLOubLXChKp\nqanuVA4HvX3fIQqLHI1qxxLhC57ploNH8rnmxa/ZdyifT+7pTaPasV5HEpEqyMzSnXOpJW8Pi28g\nG9WOpWnduKDa+APUjIli0vBU8guLGPlGOofyCryOJCJhJCwKIJg1T6zB3wZ3Yc33B3n4/WWE0icy\nEQltKoAg0Ld1Eo9ceg7Tlu1i3MyNXscRkTARUAGY2RYzW25mS8ys1El4M+vrX77SzGYFOtbMHjIz\nZ2b1Tv1phL6RFzTnqs4NeeaLtfx71W6v44hIGCjPr4D6OeeySltgZrWAccClzrmtZpYUyFgzawxc\nDGwtR44qycz483Ud2ZiZw/3vLuEfo8+jZVK817FEpAqrqCmgIcCHzrmtAM65jADHPQf8P0AT30BM\nVAQTh6USE+XjzjfSOXAo3+tIIlKFBVoADvjCzNLNbGQpy1OA2mY207/O8LLGmtlAYIdzbunJHtjM\nRppZmpmlZWZmBhg3dDWsVZ3xN3dj+75DjHnnWwqL1I0icmYEWgC9nXNdgcuA0WZ2QYnlkUA34Apg\nAPCYmaWcaKyZxQL/C/y6rAd2zk10zqU651ITExMDjBvazm1Wh99f1Z7Z6zJ5avoar+OISBUVUAE4\n53b6LzOAj4DuJVbZDkx3zuX65/pnA51OMrYFcDaw1My2AI2AxWZW/3SfUFUxuHsThvdqykuzN/GP\nb3d4HUdEqqAyC8DM4sws/th14BJgRYnVPgb6mFmk/919D2D1icY655Y755Kcc82cc80oLpCuzrnv\nK+yZVQGPXdmWHmfX4ZEPlrFs+36v44hIFRPIJ4BkYK6ZLQUWAdOcc9PNbJSZjQJwzq0GpgPL/OtM\nds6tONHYM/FEqqKoCB/jhnalXo1q3PVmOhnZR7yOJCJVSFgcCyjUrdx5gEHj59O2YU3evrMH1SIj\nvI4kIiEkrI8FFOraNUzgmes7kf7dPn79j5U6XISIVAgVQIi4omMD7unXknfTtvHmgu+8jiMiVYAK\nIIT8z8Up9G+TxO8+XcX8jXu8jiMiIU4FEEJ8PuO5Gztzdr04fvFWOtv2HvI6koiEMBVAiIn3n0Og\nsMhx5xtpOoeAiJwyFUAIOrteHH8b0pV1u7N56L2l+lJYRE6JCiBE/SwlkV9e1obPln/Pi19t8DqO\niIQgFUAIu6PP2VzT5Sye+WIdX+ocAiJSTiqAEGZm/OnaDnRslMAD7y5h/e5sryOJSAhRAYS4mKgI\nXhrWjZioCO54I439h/K8jiQiIUIFUAU0SKjOS8O6snP/YcZM/ZaCwiKvI4lICFABVBHdmtbh8avb\nM2d9Fn/WOQREJADlOSewBLkbz23Cqp0HmTRnM20a1OTaro28jiQiQUyfAKqY/7uyLb2a1+XRD5ez\ndNt+r+OISBBTAVQxURE+XhzalaT4aox8M42MgzqHgIiUTgVQBdWJi2bS8FQOHi5g1JR0jhYUeh1J\nRIKQCqCKatOgJs/e0InFW/cHzTkEnHPs3H+YmWszmDh7I7/9ZCUrdhzwOpZI2NKXwFXYZR0acO+F\nLRn7nw20aRDPrb3PrpTHdc6RmX2UtbuzWbc7h/W7s1m7O5sNu3PIPvrjweuiIoy3Fn7HwwNac8f5\nzfH5rFLyiUgxFUAVd3//FFbtyuYP01aTkhzPeS3rVej978kp3tCv353Dut3Z/v9yOHA4/4d16sRF\n0yqpBtd0PYtWyfGkJNUgJTkegEc/XMYfP1vDnPVZ/OX6TiTVjKnQfCJyYjoncBjIPpLPtePmkZlz\nlE/vOZ/GdWLLfR/7D+Wx7riNfPFGf0/uj3se14yJpHX9+B838vXjSUmOp16Naie8X+ccUxdt4/f/\nXElsdCRPD+rIRW2ST+l5ikjpTnROYBVAmNiSlcvAF+bSsFZ1Prj7POKqlf7hL/tI/nHTNsfe2Wdk\nH/1hnRrVImmVXIOUpHhaJdegtX9DnxRfDbNTm8bZkJHNmKlLWL3rILf0asovL29DTFTEKd2XiBxP\nBSDMXpfJra8uYkC7+jx9fSc2Zhw/bbN+dzY7D/z4s9HqURG0Sq5Bq6R4WtevUfzOPjmehgkxp7yh\nP5mjBYU8NX0tL8/dTOvkeMYO7kLr+vEV/jgi4UYFIABMnrOJx6etPu626EgfLRNrkJLsn7ZJKt7Q\nN6pd3ZMvZmeuzeCh95Zy8EgB/3dFG4b1bHpGCkckXKgABPhxzn1PzlFaJcfTun48TerEEhFkv8DJ\nzD7Kw+8vZebaTC46J4mnBnWk7km+SxCRE1MBSMhxzvHavC386bM1JMRG8ewNnejTKtHrWCIh50QF\noB3BJGiZGbf1Ppt/jO5NQvUohr28iD9+tpq8Ah3uWqQiqAAk6LVtWJNP7zmfoT2aMHH2Jq4d/zUb\nM3O8jiUS8lQAEhKqR0fwxDUdeGlYN7bvO8yVY+fy7jdbg+IQFyKhSgUgIWVAu/pMv+8CujSpxSMf\nLGf024s5cCi/7IEi8l8CKgAz22Jmy81siZmV+i2smfX1L19pZrPKGmtmfzCzZf7bvzCzhqf/dCQc\n1E+IYcqIHjx62Tl8sXI3lz0/m4Wb9ngdSyTkBPQrIDPbAqQ657JOsLwWMA+41Dm31cySnHMZJxtr\nZjWdcwf91+8F2jrnRp0sh34FJCUt3baf+975lq17DzG6X0vuvagVURH6YCvyU2f6V0BDgA+dc1sB\njm38T+bYxt8vDtBkrpRbp8a1mHZvH67r2oi//WcDN7w0n617DnkdSyQkBFoADvjCzNLNbGQpy1OA\n2mY207/O8EDGmtkTZrYNGAr8urQHNrORZpZmZmmZmZkBxpVwElctkqev78TYwV3YkJHD5WPn8PGS\nHV7HEgl6gU4BNXTO7TSzJOBLYIxzbvZPlr8ApAIXAdWB+cAVzrl1ZY31j/8lEOOc+83JcmgKSMqy\nbe8hHnh3CWnf7ePaLmfxu6vaER8T5XUsEU+d1hSQc26n/zID+AjoXmKV7cB051yuf65/NtApwLEA\nbwPXBfZURE6scZ1Y3hnZk/v7t+IfS3Zwxdi5fLt1n9exRIJSmQVgZnFmFn/sOnAJsKLEah8Dfcws\n0sxigR7A6pONNbNWPxk/EFhzuk9GBCAywsf9/VP4+129KCxyDJownxe/2kBhkb5mEvmpQM4Ilgx8\n5D8aYyTwtnNuupmNAnDOTXDOrTaz6cAyoAiY7JxbYWbNSxvrv98nzay1f/3vgJP+AkikvFKb1eGz\n+/rwvx8t5+nP1zJ7XSbP3diZhrWqex1NJCjoYHBS5Tnn+GDxDn798QqiInz8+boOXNq+gdexRCqN\nDgYnYcvMGNStEdPu7UPTurGMmrKYX364jEN5BWUPFqnCVAASNs6uF8f7o87j7r4teOebbVz5t7ms\n2HHA61ginlEBSFiJjvTxyKXnMGVED3KPFnDNuK+ZPGeTDionYUkFIGGpd8t6/Ou+C+jbOonHp63m\nkQ+WUVCo8wxIeFEBSNiqExfNxGHduO+iVvw9bTsj30zncF6h17FEKo0KQMKamfHAxSk8fnV7Zq7N\nYMjkBezNzfM6lkilUAGIADf3bMq4od1YufMggybMY9teHVBOqj4VgIjfpe3r89YdPcjKPsp14+ex\naufBsgeJhDAVgMhPnNusDu/ffR4RPuPGl+Yzf6NONCNVlwpApISU5Hg+uPs86ifEcMsri5i2bJfX\nkUTOCBWASCka1qrOe6N60alxAvdMXcxrX2/2OpJIhVMBiJxArdho3hzRg/5tkvntp6t4avoa7TAm\nVYoKQOQkYqIiGD+0K4O7N2HczI089N4y8rXDmFQRgRwOWiSsRUb4+OM17alfM4bn/r2OPblHGTe0\nK7HR+vOR0KZPACIBMDPu69+KP13bgdnrMhk8aSF7co56HUvktKgARMphcPcmTLi5G2t2HWTQhPna\nYUxCmgpApJwuaVe8w9je3DyuHT+PlTt1SGkJTSoAkVOQ2qwO74/qRZTPuPGlBczbkOV1JJFyUwGI\nnKJWyfF88IvzaFgrhlteXcSnS3d6HUmkXFQAIqehQUJ13rvrPLo0rs2Yqd/yylztMCahQwUgcpoS\nYqN4Y0R3BrRL5vf/XMWT/9IOYxIaVAAiFSAmKoJxQ7sxtEcTJszayIPvLdUOYxL0tCeLSAWJ8BmP\nX128w9hfvlxHVk4e44d2Ja6a/swkOOkTgEgFMjPGXNSKJ6/twNz1mQyetIAs7TAmQUoFIHIG3NS9\nCROHpbJudzaDxs9j6x7tMCbBRwUgcob0b5vMW3f0ZP/hfK4dP48VO7TDmAQXFYDIGdStaW3eH9WL\napE+bnxpPnPXa4cxCR4qAJEzrGVS8RnGGteJ5bbXFvHxkh1eRxIBAiwAM9tiZsvNbImZpZ1gnb7+\n5SvNbFZZY83saTNbY2bLzOwjM6t12s9GJEjVT4jh3bt60aVJbe57ZwmT52zyOpJIuT4B9HPOdXbO\npZZc4N94jwMGOufaAdcHMPZLoL1zriOwDvhl+aKLhJaE6lG8cXt3Lmtfn8enreaPn62mqEg7jIl3\nKmoKaAjwoXNuK4BzLqOsAc65L5xzBf5/LgAaVVAWkaAVExXBC0O6MrxXUybO3sT//H0JeQXaYUy8\nEWgBOOALM0s3s5GlLE8BapvZTP86w8sxFuB24F+lLTCzkWaWZmZpmZmZAcYVCV4RPuN3A9vx8IDW\n/GPJTka8/g05RwvKHihSwQItgN7Oua7AZcBoM7ugxPJIoBtwBTAAeMzMUgIZa2b/CxQAb5X2wM65\nic65VOdcamJiYoBxRYKbmTG6X0ueuq4j8zbuYfDEBWRma4cxqVwBFYBzbqf/MgP4COheYpXtwHTn\nXK5zLguYDXQqa6yZ3QJcCQx1OnqWhKEbzm3MpOHdWJ+RzdUvfs3HS3boewGpNGUWgJnFmVn8sevA\nJcCKEqt9DPQxs0gziwV6AKtPNtbMLgUeofiLY+0mKWHrwnOSeWdkL+JjIrnvnSVcPnYO/161W0cU\nlTMukE8AycBcM1sKLAKmOeemm9koMxsF4JxbDUwHlvnXmeycW3Gisf77fQGIB770/0R0QoU+M5EQ\n0rlxLT67tw/P39SZI/mF3PFGGteNn8f8jXu8jiZVmIXSu4zU1FSXllbqbggiVUZ+YRHvpW1n7Iz1\nfH/wCH1a1ePhAa3p2KiW19EkRJlZeqk/4VcBiASnI/mFvDn/O8bN3MC+Q/lc2q4+Dw1IoWVSvNfR\nJMSoAERCVPaRfCbP2czkOZs4nF/INV0acX//VjSuE+t1NAkRKgCRELc3N4/xMzfw+vzvcM4xtEdT\nftGvBUnxMV5HkyCnAhCpInYdOMzYGRv4e9o2oiN83Na7GXdd0IKE2Civo0mQUgGIVDGbs3J57st1\nfLJ0JzVjIrnrZy24rXczYqN1Cko5ngpApIpatfMgf/liLTPWZFCvRjXGXNiSwd2bEB2po71LMRWA\nSBWXtmUvT32+lkWb99KodnXu75/CNV3OIsJnXkcTj52oAPQWQaSKSG1Wh3dH9uSN27tTOzaah95b\nyoC/zmb6il3aq1hKpQIQqULMjAtSEvnknt6MH9oV5xyjpizmqhe/Zs76TBWBHEcFIFIFmRmXdWjA\n5/dfwNODOrInJ49hLy9i8KQFpH+3z+t4EiT0HYBIGDhaUMjUhVt54asNZOXk0b9NEg8NaM059Wt6\nHU0qgb4EFhFyjxbw2rwtTJi1kZyjBQzs1JD/uTiFpnXjvI4mZ5AKQER+cOBQPhNmb+TVrzdTUOi4\n4dzG3HthK+onaK/iqkgFICL/JePgEV74agNTF23FZ8Yt5zXj7p+1oHZctNfRpALpZ6Ai8l+Sasbw\n+6va858H+3JFxwZMmrOJy8fO4bs9uV5Hk0qgAhARGteJ5dkbOvPJ6PM5kl/I4IkL2LZXJ+qr6lQA\nIvKDDo0SmHJHD3LzCrlp4gK271MJVGUqABE5TruGCbx1Rw+yj+QzeNICduw/7HUkOUNUACLyX9qf\nVfxJYP+hfIZMWsCuAyqBqkgFICKl6tioFm/c3p29OXkMmbSQ3QePeB1JKpgKQEROqEuT2rx2e3cy\nDh5h8MQFZKgEqhQVgIicVLemtXn99u58f/AIgyctIDP7qNeRpIKoAESkTKnN6vDqreeyc/8Rhkxa\nQFaOSqAqUAGISEB6NK/LK7eey7Z9hxg6aSF7c/O8jiSnSQUgIgHr1aIuL99yLlv25DJk0gL2qQRC\nmgpARMqld8t6TL4llU1ZuQydvJD9h1QCoUoFICLl1qdVIhOHdWNDRg7DXl7EgUP5XkeSU6ACEJFT\n0rd1Ei8N68aa7w8y/JWFHDyiEgg1ARWAmW0xs+VmtsTMSj0es5n19S9faWazyhprZtf71y0ys/86\nTKmIBL9+5yQxfmg3Vu06yC2vLCJbJRBSyvMJoJ9zrnOpJxUwqwWMAwY659oB1wcwdgVwLTC7nJlF\nJIj0b5vMC0O6snz7AW599RtyjhZ4HUkCVFFTQEOAD51zWwGccxllDXDOrXbOra2gxxcRDw1oV5+/\nDe7Ckm37ue3VReSqBEJCoAXggC/MLN3MRpayPAWobWYz/esML8fYkzKzkWaWZmZpmZmZ5R0uIpXk\nsg4NGHtTFxZv3c9tr33DoTyVQLALtAB6O+e6ApcBo83sghLLI4FuwBXAAOAxM0sJcOxJOecmOudS\nnXOpiYmJ5RkqIpXsio4NeO7GzqRt2cuI19I4nFfodSQ5iYAKwDm303+ZAXwEdC+xynZgunMu1zmX\nRfG8fqcAx4pIFTKwU0OevaEzCzfv4c430jiSrxIIVmUWgJnFmVn8sevAJRR/gftTHwN9zCzSzGKB\nHsDqAMeKSBVzdZezeHpQJ77emKUSCGKBfAJIBuaa2VJgETDNOTfdzEaZ2Sgo/kIXmA4s868z2Tm3\n4kRjAczsGjPbDvQCppnZ5xX95ETEO9d1a8Sfr+vI3A1ZjJqSztEClUCwMeec1xkClpqa6tLSSt0N\nQUSC1DuLtvLoh8u56Jwkxt3clWqREV5HCjtmll7aT/i1J7CInFE3dW/CE9e0Z8aaDO55+1vyCoq8\njiR+KgAROeOG9mjKH65qx5erdjNm6mLyC1UCwUAFICKVYlivZvz25235fOVu7nvnW5VAEIj0OoCI\nhI9be59NoYM//HMVPlvCX2/sTGSE3od6RQUgIpVqxPlnU1TkeOKz1UT4jGdv6EyEz7yOFZZUACJS\n6e68oDkFRY4/T19DhBlPX99JJeABFYCIeOLuvi0oco6nP1+Lz2c8dV1HfCqBSqUCEBHPjO7XkoJC\nx3P/XkeEGX+6toNKoBKpAETEU/f1b0Whc4ydsR6fz3ji6vYqgUqiAhARzz3QvxWFRUW8+NVGInzw\nh6vaY6YSONNUACLiOTPjoUtaU1gEE2ZtJMKM3w5spxI4w1QAIhIUzIxHLm1NYVERk+ZsJsLn47Er\n26gEziAVgIgEDTPjV5e3obAIXvl6M9GRPh65tLVK4AxRAYhIUDEzHruyDXmFhUyYtZFqkT4euDil\n7IFSbioAEQk6ZsbvB7Ynr6CI52esJzrSx+h+Lb2OVeWoAEQkKPl8xp+u7Uh+YfHOYtUifdzRp7nX\nsaoUFYCIBK0In/H0oI7kFRTx+LTVREf6GN6rmdexqgwVgIgEtcgIH3+9qTN5hUX8+uOVREf4uKl7\nE69jVQk6DquIBL2oCB8vDOlCv9aJ/PKj5XyQvt3rSFWCCkBEQkK1yAjG39yN3i3q8fD7S/l06U6v\nI4U8FYCIhIyYqAgmDU8ltVkd7n93CdNXfO91pJCmAhCRkFI9OoJXbj2XTo0SGDN1MTNW7/Y6UshS\nAYhIyKlRLZLXbu9OmwY1uXvKYmavy/Q6UkhSAYhISKoZE8Ubt3enRVIN7nwjjfkb93gdKeSoAEQk\nZNWKjWbKiO40rRvLiNe/IW3LXq8jhRQVgIiEtLo1qjHljh7UrxnDra9+w5Jt+72OFDJUACIS8pLi\nY3j7zp7UiYtm+MsLWbHjgNeRQoIKQESqhPoJMbx9Zw/iY6IY9vJC1nx/0OtIQS+gAjCzLWa23MyW\nmFnaCdbp61++0sxmlTXWzOqY2Zdmtt5/Wfv0n46IhLNGtWN5+84eVIuM4ObJC9mQkeN1pKBWnk8A\n/ZxznZ1zqSUXmFktYBww0DnXDrg+gLGPAjOcc62AGf5/i4iclqZ143jrzh6AMWTSArZk5XodKWhV\n1BTQEOBD59xWAOdcRgBjrgJe919/Hbi6grKISJhrkViDt+/sQUGRY8ikBWzbe8jrSEEp0AJwwBdm\nlm5mI0tZngLUNrOZ/nWGBzA22Tm3C8B/mVTaA5vZSDNLM7O0zEzt7CEigUlJjmfKiB7k5hUyZPIC\ndu4/7HWkoBNoAfR2znUFLgNGm9kFJZZHAt2AK4ABwGNmlhLg2JNyzk10zqU651ITExPLM1REwlzb\nhjV5c0R39ufmM3TyQjIOHvE6UlAJqACcczv9lxnAR0D3EqtsB6Y753Kdc1nAbKBTGWN3m1kDAP9l\nINNGIiLl0rFRLV67vTsZB48wZPJCsnKOeh0paJRZAGYWZ2bxx64DlwArSqz2MdDHzCLNLBboAawu\nY+wnwC3+67f470NEpMJ1a1qbV249lx37DnPz5IXsy83zOlJQCOQTQDIw18yWAouAac656WY2ysxG\nATjnVgPTgWX+dSY751acaKz/fp8ELjaz9cDF/n+LiJwRPZrXZfItqWzKymXYKws5cDjf60ieM+ec\n1xkClpqa6tLSSt0NQUQkIF+tzeCuN9J/+H4gPibK60hnnJmll/YTfu0JLCJhpV/rJF4Y0oUVOw5w\n+2vfkHu0wOtInlEBiEjYuaRdfZ6/qQvp3+3jjtfTOJJf6HUkT6gARCQsXdGxAc/e0JkFm/cw8s30\nsCwBFYCIhK2ru5zFn6/tyOx1mYx+azF5BUVeR6pUKgARCWs3nNuYx69uz4w1Gdw79VsKCsOnBFQA\nIhL2bu7ZlF9f2ZbpK7/ngb8vpbAodH4deToivQ4gIhIMbj//bPIKi3jyX2uIjvDx9KCO+Hzmdawz\nSgUgIuI36mctyCso4tkv1xEd6eOP17THrOqWgApAROQnxlzYkqMFhbz41UaqRfr4zc/bVtkSUAGI\niPyEmfHQJa3JKyhi0pzNbN93mN/8vC2N68R6Ha3C6UtgEZESzIxfXd6GX11+Dl9vyKL/s7N4/t/r\nq9y+AioAEZFSmBkjL2jBjAd/Rv+2yTz373Vc8txsZqze7XW0CqMCEBE5iYa1qvPikK68dUcPoiN9\njHg9jRGvfcPWPaF/mkkVgIhIAHq3rMdn9/bhV5efw4JNe+j/3Cye/XJdSE8LqQBERAIUHenzTwv1\n5dJ29Rk7Yz39n53FFyu/J5QOrX+MCkBEpJzqJ8QwdnAXpt7Zk9joCEa+mc5tr33D5qxcr6OViwpA\nROQU9WpRl2n39uGxK9uStmUfA56bzdOfr+FQXmicY0AFICJyGqIifIw4/2z+89DPuLJjA178aiP9\n/zKLfy3fFfTTQioAEZEKkBQfw7M3dua9Ub2oWT2Ku99azPBXFrExM8fraCekAhARqUDnNqvDP8ec\nz+8GtmPJtv1c+tfZPPmvNUF56kkVgIhIBYuM8HHLec346qG+XN35LCbM2shFf5nFp0t3BtW0kApA\nROQMqVejGk9f34kP7j6PujWiGTP1W4ZOXsj63dleRwNUACIiZ1y3prX55J7z+cPV7Vm58yCXPT+H\nJ6atIsfjaSEVgIhIJYjwGcN6NuWrh/oyqFsjJs/dzIXPzOTjJTs8mxZSAYiIVKI6cdE8eV1HPvpF\nb+onxHDfO0u4ceIC1nx/sNKzqABERDzQuXEtPvpFb/50bQfW787mirFz+d2nKzl4JL/SMqgAREQ8\nEuEzBndvwn8e7MtN5zbmtXlbuPCZWXyQvr1SpoVUACIiHqsdF80T13Tgk9Hn06h2dR58bynXT5jP\nqp1ndlpIBSAiEiQ6NErgw7vP46nrOrIpK5cr/zaH33y8ggOHz8y0UEAFYGZbzGy5mS0xs7QTrNPX\nv3ylmc0qsSzCzL41s3/+5LZOZjbff7+fmlnN03sqIiKhz+czbji3MV892JebezblzQXfceEzM5m3\nMaviH6sc6/ZzznV2zqWWXGBmtYBxwEDnXDvg+hKr3AesLnHbZOBR51wH4CPg4XJkERGp0hJio/j9\nVe35dMz5tG1Yk+b1alT4Y1TUFNAQ4EPn3FYA51zGsQVm1gi4guIN/k+1Bmb7r38JXFdBWUREqox2\nDRN4c0QP6ifEVPh9B1oADvjCzNLNbGQpy1OA2mY207/O8J8s+yvw/4CiEmNWAAP9168HGpf2wGY2\n0szSzCwtMzMzwLgiIlKWQAugt3OuK3AZMNrMLiixPBLoRvE7/QHAY2aWYmZXAhnOufRS7vN2/32l\nA/FAXmkP7Jyb6JxLdc6lJiYmBhhXRETKEhnISs65nf7LDDP7COjOj9M3ANuBLOdcLpBrZrOBTkBX\nYKCZXQ7EADXNbIpz7mbn3BrgEgAzS6G4PEREpJKU+QnAzOLMLP7YdYo32itKrPYx0MfMIs0sFugB\nrHbO/dI518g51wy4CfiPc+5m/30l+S99wP8BEyroOYmISAAC+QSQDHxkZsfWf9s5N93MRgE45yY4\n51ab2XRgGcVz/ZOdcyVLoqTBZjbaf/1D4NVTegYiInJKLJhOTlCW1NRUl5ZW6m4IIiJyAmaWXtpP\n+LUnsIhImFIBiIiEqZCaAjKzTOA7r3OcpnpAxe/THbr0evxIr8Xx9Hoc73Rej6bOuf/6HX1IFUBV\nYGZppc3FhSu9Hj/Sa3E8vR7HOxOvh6aARETClApARCRMqQAq30SvAwQZvR4/0mtxPL0ex6vw10Pf\nAYiIhCl9AhARCVMqABGRMKUCqCRm1tjMvjKz1f7TZt7ndSavlXaq0HBlZrXM7H0zW+P/f6SX15m8\nYmYP+P9GVpjZVDOr+DOhBDEze8XMMsxsxU9uq2NmX5rZev9l7Yp4LBVA5SkAHnTOtQF6UnwuhLYe\nZ/JaaacKDVfPA9Odc+dQfCj1sHxdzOws4F4g1TnXHoig+EjC4eQ14NIStz0KzHDOtQJm+P992lQA\nlcQ5t8s5t9h/PZviP/CzvE3lnZOcKjTsmFlN4ALgZQDnXJ5zbr+nobwVCVQ3s0ggFtjpcZ5K5Zyb\nDewtcfNVwOv+668DV1fEY6kAPGBmzYAuwEKPo3jpr5R+qtBw1BzIBF71T4lN9p97I+w453YAzwBb\ngV3AAefcF96mCgrJzrldUPxmEkiqiDtVAVQyM6sBfADc75w76HUeL5RxqtBwFEnx2fPGO+e6ALlU\n0Ef8UOOf274KOBtoCMSZ2c3epqq6VACVyMyiKN74v+Wc+9DrPB7qTfGpQrcA7wAXmtkUbyN5ajuw\n3Tl37BPh+xQXQjjqD2x2zmU65/IpPlnUeR5nCga7zawBgP8yoyLuVAVQSaz4lGovU3yqzGe9zuOl\nk50qNBw5574HtplZa/9NFwGrPIzkpa1ATzOL9f/NXESYfiFewifALf7rt1B8Gt7TFtBJ4aVC9AaG\nAcvNbIn/tl855z7zLpIEkTHAW2YWDWwCbvM4jyeccwvN7H1gMcW/nPuWMDskhJlNBfoC9cxsO/Ab\n4Eng72Y2guKSvL5CHkuHghARCU+aAhIRCVMqABGRMKUCEBEJUyoAEZEwpQIQEQlTKgARkTClAhAR\nCVP/HxJtZpaU1GP0AAAAAElFTkSuQmCC\n",
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYIAAAEHCAYAAACjh0HiAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90\nbGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsT\nAAALEwEAmpwYAAAuJklEQVR4nO3deXxU5dn/8c9FCAkJSYCwE5aALEZ2Aiii4gouxaWiqK21tbXa\nR+1qaxdb/XXz0fZprXUp7rZ9aq1baev21LWKKCCLgAIhoAQChABZCAlJuH5/nEkMIQlDksmEzPf9\nes0rc5Y5cyWQfOe+73PuY+6OiIjErk7RLkBERKJLQSAiEuMUBCIiMU5BICIS4xQEIiIxTkEgIhLj\nOkfy4GY2G7gLiAMedPfb622/CbiiTi3HAr3dfVdjx+zVq5cPHTo0MgWLiHRQS5cu3enuvRvaZpG6\njsDM4oB1wJlAHrAYuMzd1zSy/2eAb7r7aU0dNzs725csWdLa5YqIdGhmttTdsxvaFsmuoalAjrvn\nuvt+4Ang/Cb2vwz4SwTrERGRBkQyCAYCm+ss54XWHcLMkoDZwNONbL/GzJaY2ZKCgoJWL1REJJZF\nMgisgXWN9UN9Bni7sbEBd5/v7tnunt27d4NdXCIi0kyRHCzOAwbVWc4Atjay7zzULSTS4VRWVpKX\nl0d5eXm0S4kZiYmJZGRkEB8fH/ZrIhkEi4ERZpYJbCH4Y395/Z3MLA04BfhcBGsRkSjIy8sjJSWF\noUOHYtZQJ4G0JnensLCQvLw8MjMzw35dxLqG3L0KuB54CfgQeNLdV5vZtWZ2bZ1dLwRedve9kaoF\n4MVV2xj7k5fILSiN5NuISB3l5eWkp6crBNqImZGenn7ELbCIXkfg7s8Dz9dbd3+95UeBRyNZB0B8\nnFFSUUVJeVWk30pE6lAItK3m/Lxj5sribglB5pVWKAhEROqKmSBISQwGTtQiEIkdcXFxTJgwgTFj\nxjB37lzKysqafaznnnuONWsavB62SQsWLOD2229vcp+tW7dy8cUXN7e0FouhIAhaBCXllVGuRETa\nSteuXVm+fDmrVq2iS5cu3H//QT3TVFdXh32spoKgqqrxD5hz5szh5ptvbvLYAwYM4Kmnngq7ltYW\nM0GgriGR2HbSSSeRk5PD66+/zqmnnsrll1/O2LFjqa6u5qabbmLKlCmMGzeOP/zhD4e8duHChSxY\nsICbbrqJCRMmsGHDBmbOnMkPfvADTjnlFO666y7+8Y9/MG3aNCZOnMgZZ5zB9u3bAXj00Ue5/vrr\nAbjqqqu48cYbmT59OsOGDav9479p0ybGjBlTu/9FF13E7NmzGTFiBN/97ndr63jooYcYOXIkM2fO\n5Ctf+UrtcVsqooPF7Um3UIugVF1DIlFx2z9Ws2ZrcaseM2tAKj/5zHGH3a+qqooXXniB2bNnA/De\ne++xatUqMjMzmT9/PmlpaSxevJiKigpOPPFEzjrrrINOv5w+fTpz5szhvPPOO6gLZ8+ePbzxxhsA\n7N69m0WLFmFmPPjgg9xxxx38+te/PqSW/Px83nrrLT766CPmzJnTYJfQ8uXLWbZsGQkJCYwaNYob\nbriBuLg4fvrTn/L++++TkpLCaaedxvjx44/4Z9aQmAmC+LhOJMZ3okQtApGYsW/fPiZMmAAELYKr\nr76ahQsXMnXq1No/9C+//DIrV66s/XReVFTE+vXrwzoP/9JLL619npeXx6WXXkp+fj779+9v9PUX\nXHABnTp1Iisrq7bVUN/pp59OWloaAFlZWXz88cfs3LmTU045hZ49ewIwd+5c1q1bF94P4jBiJggA\nuiXEa7BYJErC+eTe2mrGCOpLTk6ufe7u3H333cyaNeugfX74wx/yr3/9C6DBY9Q/zg033MC3vvUt\n5syZw+uvv86tt97a4GsSEhIOeu/D7RMXF0dVVVWj+7aGmBkjAEhN7KwxAhE5yKxZs7jvvvuorAxO\nJFm3bh179+7l5z//OcuXL68NgZSUFEpKSho9TlFREQMHBvNqPvbYY61e59SpU3njjTfYvXs3VVVV\nPP10g3N0NktMBUG3xM6U6qwhEanjy1/+MllZWUyaNIkxY8bw1a9+tcGzgObNm8edd97JxIkT2bBh\nwyHbb731VubOnctJJ51Er169Wr3OgQMH8oMf/IBp06ZxxhlnkJWVVdt91FIRuzFNpLTkxjSXP7CI\n/VUHeOq66a1clYg05MMPP+TYY4+NdhkdRmlpKd26daOqqooLL7yQL33pS1x44YWH7NfQzz1aN6Zp\nd1LUNSQiR7Fbb7219gK5zMxMLrjgglY5rgaLRUSOEr/61a8ictyYaxHoymIRkYPFVBB0Swi6ho62\ncRERkUiKqSBISezMAYd9leHPLyIi0tHFVBB0q514TuMEIiI1YisIEhQEIrEmLy+P888/nxEjRjB8\n+HC+/vWvs3///mYf7/XXX+e8885r1mvDmZI6GmIqCFJD9yTQKaQiscHdueiii7jgggtYv34969at\no7S0lB/+8IdRqSecKamjIaaCQDOQisSWV199lcTERL74xS8Cwbw9v/nNb3j44YcpKytrcsrnul58\n8UVGjx7NjBkzeOaZZ2rX7927ly996UtMmTKFiRMn8ve//x2AadOmsXr16tr9Zs6cydKlSw+aknr7\n9u1ceOGFjB8/nvHjx7Nw4UIA/vSnPzF16lQmTJjAV7/61SO6Z0Jzxdh1BLo5jUjUvHAzbPugdY/Z\nbyyc3XhXy+rVq5k8efJB61JTUxk8eDA5OTlAw1M+Dxo0qHb/8vJyvvKVr/Dqq69yzDHHHDTj6M9/\n/nNOO+00Hn74Yfbs2cPUqVM544wzmDdvHk8++SS33XYb+fn5bN26lcmTJ/PBB59+/zfeeCOnnHIK\nzz77LNXV1ZSWlvLhhx/y17/+lbfffpv4+Hi+9rWv8ec//5krr7yytX5iDYqpFkHtXcrUNSQSE9y9\nwZu5111fM+VzYmJi7ZTPdX300UdkZmYyYsQIzIzPfe5ztdtefvllbr/9diZMmMDMmTMpLy/nk08+\n4ZJLLuFvf/sbAE8++SRz5849pIZXX32V6667DghaKmlpabzyyissXbqUKVOmMGHCBF555RVyc3Nb\n7efRmJhqEaQkhMYI1DUk0vaa+OQeKccdd9whs3QWFxezefNmhg8fztKlSxuc8rm+hsIEgkB5+umn\nGTVq1CHb0tPTWblyJX/9618bvOtZY8f7whe+wC9/+cuw9m8tEW0RmNlsM1trZjlm1uAIiZnNNLPl\nZrbazN6IZD3JCXGAzhoSiRWnn346ZWVlPP7440Bwj+Jvf/vbXHXVVSQlJYV1jNGjR7Nx48baGUf/\n8pe/1G6bNWsWd999d+1FqsuWLavdNm/ePO644w6KiooYO3Zsg7Xdd999tXUVFxdz+umn89RTT7Fj\nxw4Adu3adUgLJRIiFgRmFgfcA5wNZAGXmVlWvX26A/cCc9z9OODQ9lMr6hzXia7xcZRWaIxAJBaY\nGc8++yx/+9vfGDFiBCNHjiQxMZFf/OIXYR8jMTGR+fPnc+655zJjxgyGDBlSu+2WW26hsrKScePG\nMWbMGG655ZbabRdffDFPPPEEl1xySYPHveuuu3jttdcYO3YskydPZvXq1WRlZfGzn/2Ms846i3Hj\nxnHmmWeSn5/f/B9AmCI2DbWZnQDc6u6zQsvfB3D3X9bZ52vAAHf/UbjHbck01ABTf/5vTj+2D7+8\naFyzjyEi4dE01NHRnqahHghsrrOcF1pX10igh5m9bmZLzazBoXEzu8bMlpjZkoKCghYV1S2xM8Xq\nGhIRqRXJIGhodKV+86MzMBk4F5gF3GJmIw95kft8d8929+zevXu3qKiUhM4aLBYRqSOSZw3lAYPq\nLGcAWxvYZ6e77wX2mtmbwHhgXaSKSkmM15XFIm2osVM4JTKa090fyRbBYmCEmWWaWRdgHrCg3j5/\nB04ys85mlgRMAz6MYE3BVNRqEYi0icTERAoLCzX1extxdwoLC0lMTDyi10WsReDuVWZ2PfASEAc8\n7O6rzeza0Pb73f1DM3sRWAkcAB5091WRqgmCMQJdWSzSNjIyMsjLy6OlY3sSvsTERDIyMo7oNRG9\noMzdnweer7fu/nrLdwJ3RrKOulISO+vKYpE2Eh8fT2ZmZrTLkMOIqSkmIDRYrLuUiYjUirkg6JbY\nGXfYu193KRMRgRgMgpREzTckIlJXzAVBzVTUmmZCRCQQc0GQ1jVoEezaqyAQEYEYDILBPYMZBz8u\n3BvlSkRE2oeYC4KBPboS18n4ZFdZtEsREWkXYi4I4uM6MbB7VzYVKghERCAGgwBgSHqSuoZEREJi\nMgiGpiezaaeCQEQEYjQIhqQnUVxexZ6y/dEuRUQk6mI0CJIBNE4gIkKMBsHQdJ1CKiJSIyaDYFDP\nJMxg0061CEREYjIIEuPj6JeayMe71CIQEYnJIICaU0jVIhARidkgGJqerDECERFiOAiGpCezs3S/\nbmQvIjEvhoMgOHNIF5aJSKyL2SAY3S8FgJV5RVGuREQkumI2CDJ7JdMnJYFFuYXRLkVEJKo6N7bB\nzIoP81oD8t19ZOuW1DbMjBOGp7NwQyHujplFuyQRkahoqkWwwd1Tm3ikAE12sJvZbDNba2Y5ZnZz\nA9tnmlmRmS0PPX7c0m/oSBw/LJ2CkgpyNU4gIjGs0RYB8NkwXt/oPmYWB9wDnAnkAYvNbIG7r6m3\n63/c/bww3qvVHT8sHYBFuYUM790tGiWIiERdoy0Cd88FMLPrzaxHU/s0YiqQ4+657r4feAI4vyXF\ntrah6Un0S01kUe6uaJciIhI14QwW9yP4NP9kqKsn3M70gcDmOst5oXX1nWBmK8zsBTM7rqEDmdk1\nZrbEzJYUFBSE+faHZ2YcP6wn74TGCUREYtFhg8DdfwSMAB4CrgLWm9kvzGz4YV7aUGDU/2v7PjDE\n3ccDdwPPNVLDfHfPdvfs3r17H67kI3L8sHR2llawoaC0VY8rInK0COv0UQ8+Lm8LPaqAHsBTZnZH\nEy/LAwbVWc4AttY7brG7l4aePw/Em1mv8MtvuRkjgrd7cdW2tnxbEZF247BBYGY3mtlS4A7gbWCs\nu18HTKbpAeXFwAgzyzSzLsA8YEG9Y/er6Woys6mhetr0xP6MHkkcP6wnTy3NU/eQiMSkcFoEvYCL\n3H2Wu//N3SsB3P0A0OjZPu5eBVwPvAR8CDzp7qvN7Fozuza028XAKjNbAfwOmOdR+Gs8d/IgNhWW\nsXjT7rZ+axGRqLNw/u6a2SRgBkEf/9vu/n6kC2tMdna2L1mypFWPWba/iik/+zfnjO3PnXPHt+qx\nRUTaAzNb6u7ZDW0Lp2voFuAxIJ2gdfCImf2odUuMrqQunTl3XH/+9UE+ezUbqYjEmHC6hi4Hprj7\nT9z9J8DxwBWRLavtXZI9iLL91Tz9fl60SxERaVPhBMEmILHOcgKwISLVRNHkIT2YMrQHv381h337\nq6NdjohImwknCCqA1Wb2qJk9AqwCSs3sd2b2u8iW13bMjJtmjWZHSQV/XLQp2uWIiLSZpuYaqvFs\n6FHj9ciUEn1TM3ty8sje3Pv6Bi6bOpiUxPholyQiEnGHDQJ3fyx0HUDNdNNra04h7YhuOmsUn/n9\nW9z17/X86LysaJcjIhJx4Zw1NBNYTzCT6L3AOjM7ObJlRc/YjDQunzaYh9/eyIrNe6JdjohIxIUz\nRvBr4Cx3P8XdTwZmAb+JbFnRdfPZo+mdksD3nl7J/qoD0S5HRCSiwgmCeHdfW7Pg7uuADt15npoY\nz88uGMtH20r47b/XRbscEZGICicIlprZQ6G7ic00sweApZEuLNrOzOrLvCmDuPf1Dbz60fZolyMi\nEjHhBMG1wGrgRuDrwJrQug7v1jnHkdU/lW/+dQWbd5VFuxwRkYhoMgjMrBOw1N3/x90vcvcL3f03\n7l7RRvVFVWJ8HPd9bhLuzlWPvMfuvfujXZKISKtrMghCM4yuMLPBbVRPuzMkPZkHrsxm8+59XP3Y\nYl11LCIdTjhdQ/0Jrix+xcwW1DwiXVh7Mm1YOnddOoFlm/dwzR+XUF6pMBCRjiOcK4tvi3gVR4Gz\nx/bnvy8ax/eeWclXHl/CA1dmkxgfF+2yRERaLJwWwTnu/kbdB3BOpAtrjy6ZMoj/vmgcb+Xs5Jo/\nLqWiSi0DETn6hRMEZzaw7uzWLuRoccmUQdx+0VjeXFfADf+7jMpqXXAmIke3RoPAzK4zsw+AUWa2\nss5jI/BB25XY/lw6ZTC3fiaLl9ds58a/LFPLQESOak2NEfwv8ALwS+DmOutL3H1XRKs6Clx1YibV\nDj/95xqKHlnM/Cuz6ZYQzpCLiEj70miLwN2L3H2Tu18G5AGVBPcs7hbLp5PWdfWMTH49dzzvbtzF\nFQ++S9G+Djspq4h0YOHMPno9sB34P+Bfocc/I1zXUeOzkzO474pJrNlaxOcfepeiMoWBiBxdwhks\n/gYwyt2Pc/exoce4cA5uZrPNbK2Z5ZjZzU3sN8XMqs3s4jDrblfOOq4f939uMh/ll3Dp/HfYXlwe\n7ZJERMIWThBsBoqO9MBmFkdwD4OzgSzgMjM75E4vof3+G3jpSN+jPTn92L48dFU2m3eVcdG9C9lQ\nUBrtkkREwhJOEOQCr5vZ983sWzWPMF43Fchx91x33w88AZzfwH43AE8DO8Kuup06aURvnrjmBCqq\nqrngnrc1a6mIHBXCCYJPCMYHugApdR6HM5CgNVEjL7SulpkNBC4E7m/qQGZ2jZktMbMlBQUFYbx1\n9IzNSOPZr53I4J5JfOnRJfzqpbW6uY2ItGvh3LP4kCkmzCyc8yStocPVW/4t8D13rzZraPfaGuYD\n8wGys7PrH6PdGdQziaevm84tz63i96/l8O8Pt/OrueMZMzAt2qWJiByiqQvK3qrz/I/1Nr8XxrHz\ngEF1ljOArfX2yQaeMLNNwMXAvWZ2QRjHbvcS4+O4c+54Hrgym8K9+zn/nre548WPNGGdiLQ7TXUN\nJdd5PqbetsY/vn9qMTDCzDLNrAswDzho1lJ3z3T3oe4+FHgK+Jq7PxfGsY8aZ2b15d/fPIXPThrI\nva9v4Oy7/sNb63dGuywRkVpNBYE38ryh5UNf7F4FXE9wNtCHwJPuvtrMrjWzmLjDWY20pHjuuHg8\nf7p6Gu7O5x56l+v+tJRVW474ZCwRkVZn7g3/TTezXODbBGFxJ/Cdmk3AHe4+vE0qrCc7O9uXLFkS\njbduFeWV1fzhjVwe+E8upRVVnDKyN985axRjMzR+ICKRY2ZL3T27wW1NBMEjTR3U3b/YCrUdsaM9\nCGoU7avkT4s+5oH/5LKnrJKzx/TjyydlMmlwD5oaOBcRaY5mBUF71VGCoEZxeSUPvJnLows3UVJe\nxfhB3fnxeVlMHtIj2qWJSAeiIDgK7K2o4tllW/j9qzlsKy5n7uQMvjNrFH1TE6Ndmoh0AAqCo0hp\nRRW/e2U9j7y9kbhOxpdnDOPamcM1xbWItEhTQRDOlcXShroldOYH5xzLv791Cmdm9eP3r+Uw887X\neeK9T6g+cHSFtogcHcKZhnqumaWEnv/IzJ4xs0mRLy22DUlP5u7LJvLcf53I4J5dufmZDzj7rjd5\n5cPtHG2tOBFp38JpEdzi7iVmNgOYBTwG3BfZsqTGhEHdefq66dxz+SQqq52rH1vCFQ++y5qtxdEu\nTUQ6iHCCoGZOhHOB+9z97wQT0EkbMTPOHdefl795MrfNOY41+cWce/d/+OZfl/NJYVm0yxORo9xh\nB4vN7J/AFuAMYDKwD3jP3cdHvrxDdfTB4nAUlVVy7xs5PLZwE1XVztlj+3PV9CG6BkFEGtWis4bM\nLAmYDXzg7uvNrD8w1t1fbv1SD09B8KntxeXMfzOXJxdvpqSiiuMGpPKFE4YyZ8IAEuPjol2eiLQj\nLQ2C4UCeu1eY2UxgHPC4u+9p5TrDoiA41N6KKp5ZtoU/vrOJddtL6ZeayNdOHc4l2YMUCCICtDwI\nlhNMFz2UYAK5BQT3MD6ndcsMj4Kgce7O2zmF3PXKOhZv2k2vbl248oShXHnCELonaVhHJJa1NAje\nd/dJZvZdYJ+7321my9x9YiSKPRwFweG5O4tydzH/zQ28traApC5xXD51MF+akcmA7l2jXZ6IREFT\nQRDO5aqVZnYZcCXwmdC6+NYqTlqfmXHC8HROGJ7O2m0l3P/GBh5ZuIlHF27i3HH9uebkYRw3QLOd\nikggnBZBFnAt8I67/8XMMoFL3f32tiiwPrUImidvdxmPvr2JJxZvprSiijOO7cNXTxlO9hCdaSQS\nC1o811DoDmMjQ4tr3b2yFes7IgqClinaV8njCzfx0Nsb2VNWSVb/VC6bOoizx/anV7eEaJcnIhHS\n0jGCmQRXE28iuCnNIOAL7v5mq1YZJgVB6yjbX8Vzy7by+Dub+GhbCXGdjOnD0zlvXH9mH9eftCT1\n/ol0JC0NgqXA5e6+NrQ8EviLu09u9UrDoCBoXe7O2u0l/GPFVv65Mp+PC8voEteJU0f35vwJA5k5\nqjdJXTTzqcjRrqWDxfE1IQDg7uvMTB8XOwgzY3S/VEb3S+U7Z41iZV4Rzy3fwj9WbOWl1dtJjO/E\nqaP6cMHEIBQSOuu6BJGOJpwWwSPAAeCPoVVXAJ11q8qOrar6AO9t2sWLq7bxr5X5FO7dT0piZ844\nti+zjuvHjBG9dI8EkaNIS7uGEoD/AmYQjBG8Cdzr7hWtXWg4FARtr6r6AG/l7ORfK/N5ec12ivZV\nEh9nZA/pydTMnmQP7cGYAWn0SNZFayLtVbODwMw6ASvdfUykijtSCoLoqqw+wJJNu3lt7Q7eztnJ\nmvxiav4LDe6ZxOnH9uGMY/sycXB3jS2ItCPNHiNw9wNmtsLMBrv7J81449nAXUAc8GD9aw/M7Hzg\npwRdT1XAN9z9rSN9H2k78XGdai9WAygur2Tl5iJWby3i3Y27+PO7n/DI25vo3Mk4bmAaM45JZ8Yx\nvZk4uLvmPRJpp8LpGnoVmAK8B+ytWe/ucw7zujhgHXAmkAcsBi5z9zV19ukG7HV3N7NxwJPuPrqp\n46pF0L7traji3Y2FLNm0m3c37mL55j1UH3C6dO7EpMHdOX5YOtOH92Li4O7Ex+lOqSJtpaVnDd3W\nzPedCuS4e26oiCeA84HaIHD30jr7JwO6B+NRLjmhM6eN7stpo/sCUFJeybu5u3h3YyHv5BZy1yvr\n+e2/15OS0JkTj+nFicekc8LwXgzvnawrnEWipNEgMLNjgL7u/ka99ScT3KjmcAYCm+ss5wHTGnif\nC4FfAn0I7oLWUC3XANcADB48OIy3lvYiJTGeM7L6ckZWEAxFZZW8k7uTN9YV8MbaAl5cvQ2AYb2S\nOWdsf6YPT2f8oO4k64wkkTbTaNdQ6M5kP3D3lfXWZwM/cffPNPjCT/ebC8xy9y+Hlj8PTHX3GxrZ\n/2Tgx+5+RlPHVddQx+HufLKrjDfX7+SFD/JZlFvIAYdOBuMHdefkEb2ZPKQHYwam0VNnJIm0SHO7\nhobWDwEAd19iZkPDeN88gukoamQAWxvb2d3fNLPhZtbL3XeGcXw5ypkZQ9KT+Xx6Mp8/fghFZZW8\nv3k3Szft5j85O/ndq+trz0ga1LMrkwb3YFxGd8YOTGPswDS6dtHgs0hraCoIEpvYFs6k9ouBEaHZ\nSrcA84DL6+4Q6n7aEBosngR0AQrDOLZ0QGlJ8Zw6qg+njurDd2aNomhfJau3FPHBliKWfbKHRbmF\n/H158Fmicycja0AqU4b25Phh6Uwb1pPURF3wLtIcTQXBYjP7irs/UHelmV0NLD3cgd29ysyuJ7ir\nWRzwsLuvNrNrQ9vvBz4LXGlmlcA+gumtNWAsAKR1jWf6Mb2Yfkyv2nU7istZmVfEss27WbJpN39c\n9DEPvbWRuE7G5ME9OGlEL6Yfk864DJ2VJBKupsYI+gLPAvv59A9/NsGn9gvdfVubVFiPxgikrvLK\napZv3sN/1hfw+toCVm8tBiCpSxzZQ3ty/LCeTBrcg/EZ3dWVJDGtpVNMnArUXFm82t1fbeX6joiC\nQJqya+9+3s0NTlVduKGQnB3BGcpxnYyRfVOYNLg704alc/ywnvRJaar3U6RjafGNadoTBYEciV17\n97Psk90s37yH5Zv3sOyTPZRWVAEwul8KJx7Ti7ED05g0uAeD05OiXK1I5DTrrKGam9Yf5sCH3Uck\nmnomd+H0Y/ty+rHBdQxV1QdYk1/M2zmFvLmugD8t+piKqgNAcMrqeWP7M2lId7L666wkiR1NjRHs\nA9Y39Vogzd3b9AovtQikNVVWH2BDQSn/WbeTZ5dtYU1+MMYQ18k4tn8KEwf1YFxGGhMH99DVz3JU\na1bXkJkNCePY1e6e15LijpSCQCJpW1E5K/P2sCIv6EZasXkPe/dXAzCsdzLnjOnPlMyejM9Io3uS\nLnKTo4fGCESaqfqAk1tQyqLcQp7/YBvvbgyufgYYmp7EuIzujMtIY/yg7owZoO4kab8UBCKtpKS8\nkg+2FLF88x5Wbi5iRd4e8ovKgU/PTJpxTDqnH9uXCYM09ba0HwoCkQjaUVLOys1FrMzbw9JPdvPe\nxl1UVjtxnYzhvZMZ3S+VUf1SOKZPN4b3TiazVzfiOmmsQdpWi6ahNrNkYF/oJjUjgdHAC+5e2cp1\nihyV+qQkckZWYu0Mq6UVVSzM2cmqLUWs3lrM0o93s2DFp9NspSZ2ZvrwXkwa0p2RfVPIGpCqaxok\nqsK5oGwpcBLQA1gELAHK3P2KyJd3KLUI5GhUUl5JbsFe1u8o5b2NhbydU8iWPftqt/fqlsDYgamM\nzejOuIFpjMtIo0+qwkFaT0tvTGPuXhaaY+hud7/DzJa1bokiHVtKYjzjB3Vn/KDuXDw5A4Dde/ez\ndnsJa7YWs2prEau3FPPGuvW1g9G9unVhZN8UhvVOZkD3rmT0SGJQj64MTU+mh6blllYUVhCY2QnA\nFcDVR/A6EWlCj+QuHD8sneOHpdeuK9tfxZqtxazIK+Kj/GLW7Sjlnyvz2VN2cE9sr24JDOudTL/U\nRAZ078qx/VPI6p/K0F7JmmxPjlg4f9C/AXwfeDY0e+gw4LWIViUSo5K6dCZ7aE+yh/Y8aH1pRRVb\ndu9j864ycneWkrOjlE2FZazI28MLq/KprA6aEfFxxtD0ZDJ7JTMkPYm+qYn0SU1kcM8khqYn6doH\nadARnTVkZp2Abu5eHLmSmqYxApGD7a86QM6OUj7aVsz6HaGQ2LmXj3eVsT80fUaNvqkJjOqXyrBe\nQVjUPAZ076ozmTq4lp419L/AtUA1wXTUaWb2P+5+Z+uWKSLN0aVzJ7IGpJI1IPWg9e5O8b4q8ov3\nsXnXPjbuLOWjbSWs3VbC0k27aq+YBugS14mMnl0Z2L0rfVMTmTtpINMqFkJKf0jpC8l9IF6D1x1V\nOF1DWe5ebGZXAM8D3yMIBAWBSDtmZqQlxZOWFM/ofqlA39pt7k5BSQUbd+4ld+dePi4s4+PCveQX\nlbN++05OG9IZnv/8wQfs0i0IhrQM6JkJ6SOg+yDo1hd6ZEK33m37DUqrCScI4s0sHrgA+L27V5rZ\n0XUVmogcxMzoExo/mFZnsLpWdSUM+g+UbIOSrbB3Z/Ao2Qp7NsOqZ6B8z8GvSe4NvUYGoZDSDxLT\nILkXpA6A1IHBuoSUNvn+5MiEEwR/ADYBK4A3Q5PRRW2MQETaQFw89B8XPBriDmWFULwVSrfDzvWw\nYzUUboCcf8PeHeAHDn1d566Q1BOS0oOWRErfUEj0D4KiW9/ga3IfiNPJiW2lWVNMmFlnd6+KQD2H\npcFikaOAO+wvhdIdULwFivOhdFuwXLYLynYGz0u2BUFCA3+HuvYMWhQJqUHrInUAdB8cBEdanfBI\nSAVND35YLR0sTgN+ApwcWvUG8P+AolarUEQ6FrOgGyghBdKHN71vdWUQBqXboWT7p4FRuiMIjIoS\n2LcLtn0QtDTqi+sSdEslpQfB0a1f0NLo1i8IirSMIDy69YFOmgSwIeG0vR4GVgGXhJY/DzwCXBSp\nokQkhsTFB3+s0zIOv29leTBOUbQlNH6RD3sLgvGLstA4RsG6IEwO1Ou0sE5BYPTIhN4jg+6nLklB\nd1T3IUGLo1ufYFA8xloY4QTBcHf/bJ3l28xseTgHN7PZwF1AHPCgu99eb/sVBGchAZQC17n7inCO\nLSIxKD4Reg4LHk05cCBoRZTkB+MYRZs/DY7CXPjo+WCMo6EuKQw6J0DXHpA2KAiPzl2CgEjuHYRF\nSr+gayq596fdV0dxeIQTBPvMbIa7vwVgZicC+w7zGswsDrgHOBPIAxab2QJ3X1Nnt43AKe6+28zO\nBuYD0470mxAROUinTsEf6ORe0G9sw/u4Q1V5EA67Pw6+lu4IxjaqKoKxjD0fB4+qiqCLam8BePWh\nx+rUORjT6No9+JrSLwiMhJQgJLoPhh5Dg69J6e0uNMIJgmuBx0NjBQC7gS+E8bqpQI675wKY2RPA\n+UBtELj7wjr7LwLCaBuKiLQCM4jvGl4Lo0ZtS2NbaCyjIAiHfbuC4CjfE3zdvhpyX4OK0kODIy4B\nElODFkZCSjAQXnN9RuqAIES69gi2Jfdpk7GNwwZBqKtmvJmlhpaLzewbwMrDvHQgsLnOch5Nf9q/\nGnihoQ1mdg1wDcDgwYMPV7KISGTUbWkw5vD715w9tftj2L0p6KIq3hIExP7S4Gv5Hti8CFZvPXRc\nA8Digi6olL4w6UqY8uVW/qaOYBbRevMLfQv47WFe0lDbp8FzVc3sVIIgmNHIe88n6DYiOztbF7OJ\nyNGh5uypfmOCR1MOHAhaFyX5UFEM5UXBmVQ1p96WbINO8REps7lXbITTwZUHDKqznAFsrb+TmY0D\nHgTOdvfCZtYjInJ069Qp+NSf0vfw+7b2WzfzdeF8Kl8MjDCzTDPrAswDFtTdwcwGA88An3f3dc2s\nRUREWqDRFoGZldDouVV0PdyB3b3KzK4HXiI4ffTh0P0Mrg1tvx/4MZAO3GvBKHpVY1e+iYhIZDRr\niolo0hQTIiJHrqkpJnRPOxGRGKcgEBGJcQoCEZEYpyAQEYlxCgIRkRinIBARiXEKAhGRGKcgEBGJ\ncQoCEZEYpyAQEYlxCgIRkRinIBARiXEKAhGRGKcgEBGJcQoCEZEYpyAQEYlxCgIRkRinIBARiXEK\nAhGRGKcgEBGJcQoCEZEYpyAQEYlxEQ0CM5ttZmvNLMfMbm5g+2gze8fMKszsO5GsRUREGtY5Ugc2\nszjgHuBMIA9YbGYL3H1Nnd12ATcCF0SqDhERaVokWwRTgRx3z3X3/cATwPl1d3D3He6+GKiMYB0i\nItKESAbBQGBzneW80LojZmbXmNkSM1tSUFDQKsWJiEggkkFgDazz5hzI3ee7e7a7Z/fu3buFZYmI\nSF2RDII8YFCd5QxgawTfT0REmiGSQbAYGGFmmWbWBZgHLIjg+4mISDNE7Kwhd68ys+uBl4A44GF3\nX21m14a2329m/YAlQCpwwMy+AWS5e3Gk6hIRkYNFLAgA3P154Pl66+6v83wbQZeRiIhEia4sFhGJ\ncQoCEZEYpyAQEYlxCgIRkRinIBARiXEKAhGRGKcgEBGJcQoCEZEYpyAQEYlxCgIRkRinIBARiXEK\nAhGRGKcgEBGJcQoCEZEYpyAQEYlxCgIRkRinIBARiXEKAhGRGKcgEBGJcQoCEZEYpyAQEYlxCgIR\nkRgX0SAws9lmttbMcszs5ga2m5n9LrR9pZlNimQ9IiJyqIgFgZnFAfcAZwNZwGVmllVvt7OBEaHH\nNcB9kapHREQaFskWwVQgx91z3X0/8ARwfr19zgce98AioLuZ9Y9gTSIiUk/nCB57ILC5znIeMC2M\nfQYC+XV3MrNrCFoMAKVmtvYIa+kF7DzC17Q11dg6VGPrUI0t197qG9LYhkgGgTWwzpuxD+4+H5jf\n7ELMlrh7dnNf3xZUY+tQja1DNbZce6+vrkh2DeUBg+osZwBbm7GPiIhEUCSDYDEwwswyzawLMA9Y\nUG+fBcCVobOHjgeK3D2//oFERCRyItY15O5VZnY98BIQBzzs7qvN7NrQ9vuB54FzgBygDPhihMpp\ndrdSG1KNrUM1tg7V2HLtvb5a5n5Il7yIiMQQXVksIhLjFAQiIjGuwwfB4aa5iAYzG2Rmr5nZh2a2\n2sy+Hlrf08z+z8zWh772iHKdcWa2zMz+2U7r625mT5nZR6Gf5QntsMZvhv6NV5nZX8wsMdo1mtnD\nZrbDzFbVWddoTWb2/dDvz1ozmxXFGu8M/VuvNLNnzax7e6uxzrbvmJmbWa9o1hiuDh0EYU5zEQ1V\nwLfd/VjgeOC/QnXdDLzi7iOAV0LL0fR14MM6y+2tvruAF919NDCeoNZ2U6OZDQRuBLLdfQzBSRPz\n2kGNjwKz661rsKbQ/8t5wHGh19wb+r2KRo3/B4xx93HAOuD77bBGzGwQcCbwSZ110aoxLB06CAhv\nmos25+757v5+6HkJwR+wgQS1PRba7THggqgUCJhZBnAu8GCd1e2pvlTgZOAhAHff7+57aEc1hnQG\nuppZZyCJ4DqZqNbo7m8Cu+qtbqym84En3L3C3TcSnOE3NRo1uvvL7l4VWlxEcN1Ru6ox5DfAdzn4\n4tio1Biujh4EjU1h0W6Y2VBgIvAu0LfmOorQ1z5RLO23BP+ZD9RZ157qGwYUAI+Euq8eNLPk9lSj\nu28BfkXwyTCf4DqZl9tTjXU0VlN7/R36EvBC6Hm7qdHM5gBb3H1FvU3tpsaGdPQgCGsKi2gxs27A\n08A33L042vXUMLPzgB3uvjTatTShMzAJuM/dJwJ7iX5X1UFC/eznA5nAACDZzD4X3aqOWLv7HTKz\nHxJ0r/65ZlUDu7V5jWaWBPwQ+HFDmxtY127+FnX0IGi3U1iYWTxBCPzZ3Z8Jrd5eM/tq6OuOKJV3\nIjDHzDYRdKedZmZ/akf1QfBvm+fu74aWnyIIhvZU4xnARncvcPdK4BlgejursUZjNbWr3yEz+wJw\nHnCFf3oRVHupcThB6K8I/e5kAO+bWT/aT40N6uhBEM40F23OzIygb/tDd/+fOpsWAF8IPf8C8Pe2\nrg3A3b/v7hnuPpTgZ/aqu3+uvdQH4O7bgM1mNiq06nRgDe2oRoIuoePNLCn0b346wXhQe6qxRmM1\nLQDmmVmCmWUS3DvkvSjUh5nNBr4HzHH3sjqb2kWN7v6Bu/dx96Gh3508YFLo/2q7qLFR7t6hHwRT\nWKwDNgA/jHY9oZpmEDQLVwLLQ49zgHSCMzbWh772bAe1zgT+GXreruoDJgBLQj/H54Ae7bDG24CP\ngFXAH4GEaNcI/IVgzKKS4I/V1U3VRNDdsQFYC5wdxRpzCPrZa35n7m9vNdbbvgnoFc0aw31oigkR\nkRjX0buGRETkMBQEIiIxTkEgIhLjFAQiIjFOQSAiEuMUBCL1mFm1mS2v82i1K5bNbGhDs1WKRFPE\nblUpchTb5+4Tol2ESFtRi0AkTGa2ycz+28zeCz2OCa0fYmavhObJf8XMBofW9w3Nm78i9JgeOlSc\nmT0Quk/By2bWNWrflAgKApGGdK3XNXRpnW3F7j4V+D3BDK2Enj/uwTz5fwZ+F1r/O+ANdx9PMA/S\n6tD6EcA97n4csAf4bES/G5HD0JXFIvWYWam7d2tg/SbgNHfPDU0auM3d081sJ9Df3StD6/PdvZeZ\nFQAZ7l5R5xhDgf/z4AYwmNn3gHh3/1kbfGsiDVKLQOTIeCPPG9unIRV1nlejsTqJMgWByJG5tM7X\nd0LPFxLM0gpwBfBW6PkrwHVQe//n1LYqUuRI6JOIyKG6mtnyOssvunvNKaQJZvYuwYeoy0LrbgQe\nNrObCO6a9sXQ+q8D883saoJP/tcRzFYp0q5ojEAkTKExgmx33xntWkRak7qGRERinFoEIiIxTi0C\nEZEYpyAQEYlxCgIRkRinIBARiXEKAhGRGPf/AfiS0bC1YvhDAAAAAElFTkSuQmCC\n",
             "text/plain": [
               "\u003cFigure size 600x400 with 1 Axes\u003e"
             ]
@@ -678,25 +724,21 @@
         }
       ],
       "source": [
-        "NUM_EPOCHS = 10\n",
-        "BATCH_SIZE = 100\n",
-        "epochs = np.arange(1, NUM_EPOCHS + 1, 1)\n",
-        "losses = np.zeros([NUM_EPOCHS])\n",
-        "m = Model()\n",
-        "\n",
-        "for i in range(NUM_EPOCHS):\n",
-        "  for batch_idx in range(len(train_images) // BATCH_SIZE):\n",
-        "    batched_images = train_images[BATCH_SIZE*(batch_idx) : BATCH_SIZE * (batch_idx + 1)]\n",
-        "    batched_labels = train_labels[BATCH_SIZE*(batch_idx) : BATCH_SIZE * (batch_idx + 1)]\n",
-        "    result = train(\n",
-        "        x=tf.constant(batched_images, shape=(BATCH_SIZE, IMG_SIZE, IMG_SIZE),\n",
-        "                      dtype=tf.float32),\n",
-        "        y=tf.constant(batched_labels, shape=(BATCH_SIZE, 10), dtype=tf.float32))\n",
-        "  losses[i] = result['loss']\n",
-        "  print('Finished {0} epochs, current loss: {1}'.format(i + 1, losses[i]))\n",
-        "\n",
-        "plt.plot(epochs, losses)\n",
-        "plt.show()"
+        "plt.plot(epochs, losses, label='Pre-training')\n",
+        "plt.plot(more_epochs, more_losses, label='On device')\n",
+        "plt.ylim([0, max(plt.ylim())])\n",
+        "plt.xlabel('Epoch')\n",
+        "plt.ylabel('Loss [Cross Entropy]')\n",
+        "plt.legend();"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Jbe9_LjAbEMF"
+      },
+      "source": [
+        "Above you can see that the on-device training picks up exactly where the pretraining stopped."
       ]
     },
     {
@@ -714,8 +756,7 @@
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "7c3d3cc5f171",
-        "outputId": "1ffacb33-b209-4a8f-e914-dcc5bfb4b628"
+        "id": "7c3d3cc5f171"
       },
       "outputs": [
         {
@@ -724,12 +765,14 @@
               "{'checkpoint_path': array(b'/tmp/model.ckpt', dtype=object)}"
             ]
           },
-          "execution_count": 42,
+          "execution_count": 36,
           "metadata": {},
           "output_type": "execute_result"
         }
       ],
       "source": [
+        "save = interpreter.get_signature_runner(\"save\")\n",
+        "\n",
         "save(checkpoint_path=np.array(\"/tmp/model.ckpt\", dtype=np.string_))"
       ]
     },
@@ -763,99 +806,64 @@
       "source": [
         "## Restore the trained weights\n",
         "\n",
-        "After you save a checkpoint file, you can restore it using the `restore` signature method. Loading this additional weighting data into your model allows you to potentially improve performance for individual users or create personalized models based on individual usage."
+        "Any time you create an interpreter from a TFLite model, the interpreter will initially load the original model weights.\n",
+        "\n",
+        "So after you've done some training and saved a checkpoint file, you'll need to run the `restore` signature method to load the checkpoint.\n",
+        "\n",
+        "A good rule is \"Anytime you create an Interpreter for a model, if the checkpoint exists, load it\". If you need to reset the model to the baseline behavior, just delete the checkpoint and create a fresh interpreter.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "5yIZoLveRZgp"
+      },
+      "outputs": [],
+      "source": [
+        "another_interpreter = tf.lite.Interpreter(model_content=tflite_model)\n",
+        "another_interpreter.allocate_tensors()\n",
+        "\n",
+        "infer = another_interpreter.get_signature_runner(\"infer\")\n",
+        "restore = another_interpreter.get_signature_runner(\"restore\")"
       ]
     },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "5yIZoLveRZgp",
-        "outputId": "6f0283a2-ea28-41d1-bf91-3857d7d9097c"
+        "id": "fjiUbx7zIoLq"
       },
       "outputs": [
         {
           "data": {
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEWCAYAAABv+EDhAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90\nbGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsT\nAAALEwEAmpwYAAAbyElEQVR4nO3de3xV5Z3v8c/XoAUNooJSBDW0arWjGDBYLwwn9II6WlprOwdt\ni3ac0o6tt9NTRc/MGG079Rw71eNh1Np6waqk89JRexSVKkZGx7ZCRUHRwUuUoCMXLxCPqMDv/LEX\nzCYmZAf23ivh+b5fr7zIWvvZ6/k9SdjftZ619tqKCMzMLD075F2AmZnlwwFgZpYoB4CZWaIcAGZm\niXIAmJklygFgZpYoB4D1OZJC0v4ltm2SdEv2/b6S2iXVZMtDJc2VtEbSP6rgRklvSfpjJcdg1hs4\nAKxsshfXjV8bJL1XtPz1Lp7TKKmtGvVFxKsRURsR67NVU4GVwK4R8QNgHPAFYEREHFGNmipN0gRJ\nCyW9LWmVpDslDd9C+3pJ/yrpHUltkv6+6LGLOvyO38t+z0OqMxorNweAlU324lobEbXAq8AXi9bd\nmnd9ndgPeDb+892Q+wGtEfFuTzckqV9ZKyufZ4FjI2I3YG9gCXDNFtrfBswF9gD+C/A3kiYBRMQ/\ndPgd/0+gJSJWVnIAVjkOAKs4SR+TdKWk17KvK7N1uwD3AXsX7VXuLekISY9ne62vS5ouaacS+xop\n6ZFsWud3wJCix+qy6aN+km4CTgPOz/r9DvAr4Khs+ZLsOSdKWpDV8m+SRhVtr1XSBZKeBt7Ntntk\n1u5tSU9Jaixq3yLpR5Iey+qbXbz3LGlc0XOXSjq96Of3M0mvSnpD0rWSBpTy84iINyLitaJV64Et\nTZ/VAbdGxPqIeBF4FPizTn7OAr4JzCilDuulIsJf/ir7F9AKfD77/lLg98BewJ7AvwE/yh5rBNo6\nPPdw4EigH4UXpMXAuUWPB7B/F/0+Dvwc+BgwHlgD3JI9Vpc9t1+2fBPw46Lnng48WrQ8BlgOfAao\noRAYrcDHisa4ANgHGAAMB1YBf0Fh5+oL2fKeWfsW4EXgwKx9C3BZ9ti+Wa2nADsCg4H67LErgd9S\n2CsfCPxf4KdFdb4NjNvC72LfrM0G4EPg9C20/QfgsqyGTwFtwNhO2o0H2oHavP/W/LX1Xz4CsGr4\nOnBpRCyPiBXAJRT2HjsVEfMj4vcRsS4iWoFfUJiO2CJJ+wJjgb+LiPcjYi6FF8ut9W3gFxHxhyjs\nEc8A3qcQThtdFRFLI+I94BvArIiYFREbIuJ3wDwKgbDRjRHx71n7fwbqs/VfBx6MiJkR8WFErIqI\nBdme9reB8yLizYhYQ+FFevLGDUbEbhHxaFeDiMK5j90oHA39LfDcFsZ8D/BV4L2s3fUR8UQn7U4D\nbo+I9i1sy3q53jpvaduXvYFXipZfydZ1StKBFPbiG4CdKfydzi+xn7di8zn8VyjsoW+N/YDTJJ1V\ntG4nNq99aYf2X5P0xaJ1OwIPFy3/R9H3/w+ozb7fh8LRQUd7UvgZzC9kAQCicETSIxHxpqQZwFOS\nhkfEuuLHJe0B3A98n8K5gI8Dt0t6IyKuLmo3APga8KWe1mC9i48ArBpeo/DiuNG+2TooTMl0dA2F\nvc8DImJX4CIKL3rdeR3YPTu3UNzX1loK/CTbw974tXNEzCxqEx3a/7pD+10i4rIS+/pkJ+tXUtgb\n/7OibQ6KwknYrdGPwlTcrp089glgfUTcnB19tQHNbH4EA/AV4E0KU1jWhzkArBpmAn8rac/spOff\nA7dkj70BDJY0qKj9QGA10C7pIOBvSukkIl6hMOVyiaSdJI0DvtjN07bkl8B3JX0me4/ALpJOkDSw\ni/a3AF+UdKykGkn9s8tcR5TQ163A5yX9ZXYyebCk+ojYkNVxhaS9ACQNl3RsKQOQ9BVJn5K0g6Q9\nKRxZPRkRb3bS/N8LT9GpWfuPA/8VeKpDu9OAmyPC95Lv4xwAVg0/pvDC/DSwEPhTto6IeI5CQLyU\nXf2yN/DfgVMpnBT9JfCbHvR1KoWTtm8CFwM3b23RETGPwvz7dOAt4AUKJ4q7ar+UwrTIRcAKCnv1\nP6SE/2cR8SqFPe0fZLUvAA7LHr4g6/v3klYDD1I4QQtsev/Fn3ex6eEUpnXWUPjZbwBOKnrutZKu\nzWpYTWHv/rxsvAuARcBPitoPBz7LNvxcrfeQQ9zMLE0+AjAzS5QDwMwsUQ4AM7NEOQDMzBLVp94I\nNmTIkKirq6t4P++++y677LJL9w23Qx57emNPddyQztjnz5+/MiL27Li+TwVAXV0d8+bNq3g/LS0t\nNDY2Vryf3shjb8y7jKpLddyQztglvdLZek8BmZklygFgZpYoB4CZWaL61DmAznz44Ye0tbWxdu3a\nsm1z0KBBLF68uGzbq6b+/fszYsQIdtxxx7xLMbNers8HQFtbGwMHDqSuro6i2+VukzVr1jBwYFf3\n++q9IoJVq1bR1tbGyJEj8y7HzHq5Pj8FtHbtWgYPHly2F/++TBKDBw8u69GQmW2/+nwAAH7xL+Kf\nhZmVarsIADMz67k+fw6go7pp95Z1e62XndBtm5qaGg499FAigpqaGqZPn87RRx+9xedcddVVXHPN\nNYwZM4Zbb721XOWaWV/SNKj7NpvavlP27re7AMjDgAEDWLBgAQAPPPAAF154IY888sgWn3P11Vdz\n3333lXyydt26dfTr51+XmZWPp4DKbPXq1ey+++6bli+//HLGjh3LqFGjuPjiiwH47ne/y0svvcSk\nSZO44oorePPNN/nyl7/MqFGjOPLII3n66acBaGpqYurUqUycOJEpU6awYsUKTj75ZMaOHcvYsWN5\n7LHHchmjmW0fvEtZBu+99x719fWsXbuW119/nTlz5gAwe/ZslixZwh//+EcigkmTJjF37lyuvfZa\n7r//fh5++GGGDBnCWWedxejRo7nrrruYM2cOU6ZM2XREMX/+fB599FEGDBjAqaeeynnnnce4ceN4\n9dVXOfbYY/vs+xXMLH8OgDIongJ6/PHHmTJlCosWLWL27NnMnj2b0aNHA9De3s6SJUsYP378Zs9/\n9NFHueOOOwD47Gc/y6pVq3jnncJ836RJkxgwYAAADz74IM8+++ym561evbrPvmfBzPLnACizo446\nipUrV7JixQoiggsvvJDvfOc7W3xOZ5/LvPFyzuJb1W7YsIHHH398UyCYWe/TkwtRWvtXsJAS+BxA\nmT333HOsX7+ewYMHc+yxx3LDDTfQ3t4OwLJly1i+fPlHnjN+/PhNVwK1tLQwZMgQdt1114+0mzhx\nItOnT9+0vPGow8xsa1T8CEDSDcCJwPKIOCRb1wR8G1iRNbsoImaVo79SLtvsTk+nVTaeA4DC3vyM\nGTOoqalh4sSJLF68mKOOOgqA2tpabrnlFvbaa6/Nnt/U1MS3vvUtRo0axc4778yMGTM67eeqq67i\ne9/7HqNGjWLdunWMHz+ea6+9dusGaWbJq8YU0E3AdODmDuuviIifVaH/ilu/fn2Xj51zzjmcc845\nH1nf2tq66fs99tiDu++++yNtmpqaNlseMmQIv/nNb7a6TjOzYhWfAoqIucCble7HzMx6Rp2dgCx7\nJ1IdcE+HKaDTgdXAPOAHEfFWF8+dCkwFGDp06OHNzc2bPT5o0CD233//sta7fv16ampqyrrNanrh\nhRc2XUXUU+3t7dTW1pa5or4h1bGnOm6ozNgXLiv9/96hO7xc+oaH1fe8mMyECRPmR0RDx/V5BcBQ\nYCUQwI+AYRHxV91tp6GhITp+JvDixYs5+OCDy1pvX7+0clt+Jql8RmpnUh17quOGyoy9Z1cBnVr6\nhrfhVhCSOg2AXK4Ciog3ImJ9RGwAfgkckUcdZmYpyyUAJA0rWjwJWJRHHWZmKavGZaAzgUZgiKQ2\n4GKgUVI9hSmgVmDL75QyM7Oyq3gARMQpnay+vmId9uT2ql3YbPa/xHm3O++8k6985SssXryYgw46\niBUrVnDiiSfywQcfcNVVV7Fw4ULOPPPMba7NzKxc/E7gMpk5cybjxo1j41VKDz30EAcddBBPPvkk\n++yzD1dffXWPthcRbNiwoRKlmpkBDoCyaG9v57HHHuP666+nubmZBQsWcP755zNr1izq6+u54IIL\nePHFF6mvr+eHP/wh0PltoltbWzn44IM588wzGTNmDEuXLs1zWGa2nfPN4Mrgrrvu4rjjjuPAAw9k\njz32YMOGDVx66aXMmzeP6dOn09rayjPPPLPp3j1d3SZ633335fnnn+fGG2/s8RGDmVlP+QigDGbO\nnMnkyZMBmDx5MjNnztxi++LbRI8ZM4bnnnuOJUuWALDffvtx5JFHVrxmMzMfAWyjVatWMWfOHBYt\nWoQk1q9fjyQuueSSLp/T1W2iW1tbN7v9s5lZJfkIYBvdfvvtTJkyhVdeeYXW1laWLl3KyJEjaWtr\n29Rm4MCBrFmzZtNyqbeJNjOrpO3vCGAb3i69UU9uBTFz5kymTZu22bqTTz6ZCy64gDPOOAOAwYMH\nc8wxx3DIIYdw/PHHc/nll3d6m+i+fP8hM+t7tr8AqLKWlpaPrDv77LM5++yzN1t32223bbbc1W2i\nFy3ym6LNrDo8BWRmligHgJlZoraLAKjGLa37Cv8szKxUfT4A+vfvz6pVq/zCR+HFf9WqVfTv3z/v\nUsysD+jzJ4FHjBhBW1sbK1as6L5xidauXdtnX0T79+/PiBEj8i7DzPqAPh8AO+64IyNHjizrNlta\nWhg9enRZt2lm1tv0+SkgMzPbOg4AM7NEOQDMzBLlADAzS5QDwMwsUQ4AM7NEOQDMzBLlADAzS5QD\nwMwsUQ4AM7NEOQDMzBLlADAzS5QDwMwsUQ4AM7NEOQDMzBLlADAzS5QDwMwsURUPAEk3SFouaVHR\nuj0k/U7Skuzf3Stdh5mZba4aRwA3Acd1WDcNeCgiDgAeypbNzKyKKh4AETEXeLPD6i8BM7LvZwBf\nrnQdZma2OUVE5TuR6oB7IuKQbPntiNit6PG3IqLTaSBJU4GpAEOHDj28ubm54vW2t7dTW1tb8X56\nI489vbGnOm6ozNgXLnun5LaH7vBy6RseVt/zYjITJkyYHxENHdf32+otVklEXAdcB9DQ0BCNjY0V\n77OlpYVq9NMbeeyNeZdRdamOGyoz9tOn3Vty29b+F5e+4VNKD5ZS5XUV0BuShgFk/y7PqQ4zs2Tl\nFQC/BU7Lvj8NuDunOszMklWNy0BnAo8Dn5LUJukM4DLgC5KWAF/Ils3MrIoqfg4gIk7p4qHPVbpv\nMzPrmt8JbGaWKAeAmVmiHABmZolyAJiZJcoBYGaWKAeAmVmiev2tIMxsO9c0qAdty387hJQ5AMys\n7Op6dD+cChZiW+QpIDOzRPkIwKw38DSI5cBHAGZmiXIAmJklygFgZpYoB4CZWaIcAGZmifJVQGYV\n4mvhrbfzEYCZWaIcAGZmiXIAmJklygFgZpYoB4CZWaIcAGZmiXIAmJklygFgZpYoB4CZWaIcAGZm\niXIAmJklygFgZpYoB4CZWaIcAGZmiXIAmJklKtfPA5DUCqwB1gPrIqIhz3rMzFLSGz4QZkJErMy7\nCDOz1JQ0BSTpmFLWmZlZ36GI6L6R9KeIGNPduh53Lr0MvAUE8IuIuK6TNlOBqQBDhw49vLm5eVu6\nLEl7ezu1tbUV76c38tjLN/aFy94pue2hO7xc+oaH1fe8mC2oxO/cYy9NtcY+YcKE+Z1NsW8xACQd\nBRwNnAtcUfTQrsBJEXHYVldU2P7eEfGapL2A3wFnRcTcrto3NDTEvHnztqXLkrS0tNDY2Fjxfnoj\nj72xbNvr2WcCn1r6hptKf4EpRSV+5x57aao1dkmdBkB3U0A7AbUUzhUMLPpaDXx1q6vJRMRr2b/L\ngTuBI7Z1m2ZmVpotngSOiEeARyTdFBGvlLNjSbsAO0TEmuz7icCl5ezDzMy6tsUAkHRlRJwLTJf0\nkbmiiJi0DX0PBe6UtLGO2yLi/m3YnpmZ9UB3l4H+Ovv3Z+XuOCJeArbpHIKZmW297qaA5mf/PlKd\ncszMrFpKeiOYpIUULtUs9g4wD/hxRKwqd2FmZlZZpb4T+D4Kt2u4LVueDIhCCNwEfLHslZmZWUWV\nGgDHRETxO38XSnosIo6R9I1KFGZmZpVV6t1AayV9ZuOCpCMovD8AYF3ZqzIzs4or9Qjgr4EbJNVS\nmPpZDZyRXb//00oVZ2ZmlVNSAETEE8ChkgZRuH3E20UP/3MlCjMzs8oq9W6ggyT9HHgIeFDSP2Zh\nYGZmfVSp5wBuoPDBLX+Zfa0GbqxUUWZmVnmlngP4ZEScXLR8iaQFFajHzMyqpNQjgPckjdu4kH0Y\nzHuVKcnMzKqh1COA7wI3F837vwWcVpmSzMysGkq9Cugp4DBJu2bLqyWdCzxdwdrMzKyCSp0CAgov\n/BGxOlv8bxWox8zMqqRHAdCBylaFmZlV3bYEQPefJm9mZr1Wd58ItobOX+gFDKhIRWZmVhXdfSDM\nwGoVYmZm1bUtU0BmZtaHOQDMzBJV6hvBzMz6hLpp95bc9qbjdqlgJb2fjwDMzBLlADAzS5QDwMws\nUQ4AM7NEOQDMzBLlADAzS5QDwMwsUX4fgJml6/UF0PSl0to2vVPRUvLgIwAzs0Q5AMzMEpVrAEg6\nTtLzkl6QNC3PWszMUpNbAEiqAf4JOB74NHCKpE/nVY+ZWWryPAI4AnghIl6KiA+AZqDEszFmZrat\nFJHPJztK+ipwXET8dbb8TeAzEfH9Du2mAlMBhg4denhzc/NW9bdwWeln8EcOeJ/a918rrfGw+rL3\nf+gOL5fctpT+8xx7nuPuaf+V+L3nJdVx91R7ezu1tbV5l1FxEyZMmB8RDR3X53kZaGcfKv+RNIqI\n64DrABoaGqKxsXGrOju9J7eIPexlGp+/uLTGp5T2H60n/bf2L7HvEvvPc+x5jrun/Vfi956XVMfd\nUy0tLWzta8r2IM8poDZgn6LlEUCJuyFmZrat8gyAJ4ADJI2UtBMwGfhtjvWYmSUltymgiFgn6fvA\nA0ANcENEPJNXPWZmqcn1VhARMQuYlWcNZmap8juBzcwS5QAwM0uU7wbamWH1293lbmZmHTkAzLZD\nrZedUHLblpYW7/AkKpkA6PF/CDOz7ZzPAZiZJcoBYGaWKAeAmVmiHABmZolyAJiZJcoBYGaWKAeA\nmVmiHABmZolyAJiZJcoBYGaWKAeAmVmiHABmZolyAJiZJcoBYGaWKAeAmVmiHABmZolyAJiZJcoB\nYGaWKAeAmVmiHABmZolyAJiZJcoBYGaWKAeAmVmiHABmZolyAJiZJcoBYGaWqFwCQFKTpGWSFmRf\nf5FHHWZmKeuXY99XRMTPcuzfzCxpngIyM0uUIqL6nUpNwOnAamAe8IOIeKuLtlOBqQBDhw49vLm5\nueL1tbe3U1tbW9ZtLlz2TsltD93h5dI3PKy+rH2PHPA+te+/lkvf5R53T/sv99j7ikr8vfcVqYx9\nwoQJ8yOioeP6igWApAeBj3fy0P8Afg+sBAL4ETAsIv6qu202NDTEvHnzylpnZ1paWmhsbCzrNuum\n3Vty29b+p5a+4abuX+B60vdNhy2h8fmLc+m73OPuaf/lHntfUYm/974ilbFL6jQAKnYOICI+X0o7\nSb8E7qlUHWZm1rm8rgIaVrR4ErAojzrMzFKW11VA/0tSPYUpoFbgOznVYWaWrFwCICK+mUe/Zmb2\nn3wZqJlZohwAZmaJcgCYmSXKAWBmligHgJlZovK8GZxZxbVedkLJbVtmXlm5Qsx6IQdAAvwiaGad\ncQDY5obVwynbz31ueiTlsVuSfA7AzCxRDgAzs0Q5AMzMEuUAMDNLlAPAzCxRDgAzs0Q5AMzMEuUA\nMDNLlN8IZhXVk3ch01SxMsysEw6AKvELoZn1Np4CMjNLlAPAzCxRngKy3qPJN2IzqyYfAZiZJcpH\nAL2R94TNrAp8BGBmligHgJlZohwAZmaJcgCYmSXKAWBmligHgJlZohwAZmaJcgCYmSXKAWBmlihF\nRN41lEzSCuCVKnQ1BFhZhX56I489PamOG9IZ+34RsWfHlX0qAKpF0ryIaMi7jjx47OmNPdVxQ9pj\nB08BmZklywFgZpYoB0Dnrsu7gBx57OlJddyQ9th9DsDMLFU+AjAzS5QDwMwsUQ6ADiQdJ+l5SS9I\nmpZ3PdUgaR9JD0taLOkZSefkXVO1SaqR9KSke/KupZok7SbpdknPZb//o/KuqVoknZf9vS+SNFNS\n/7xrqjYHQBFJNcA/AccDnwZOkfTpfKuqinXADyLiYOBI4HuJjLvYOcDivIvIwf8G7o+Ig4DDSORn\nIGk4cDbQEBGHADXA5Hyrqj4HwOaOAF6IiJci4gOgGfhSzjVVXES8HhF/yr5fQ+FFYHi+VVWPpBHA\nCcCv8q6lmiTtCowHrgeIiA8i4u1ci6qufsAASf2AnYHXcq6n6hwAmxsOLC1abiOhF0IASXXAaOAP\nOZdSTVcC5wMbcq6j2j4BrABuzKa/fiVpl7yLqoaIWAb8DHgVeB14JyJm51tV9TkANqdO1iVznayk\nWuAO4NyIWJ13PdUg6URgeUTMz7uWHPQDxgDXRMRo4F0glfNeu1M4uh8J7A3sIukb+VZVfQ6AzbUB\n+xQtjyCRw0JJO1J48b81Iv4l73qq6BhgkqRWClN+n5V0S74lVU0b0BYRG4/2bqcQCCn4PPByRKyI\niA+BfwGOzrmmqnMAbO4J4ABJIyXtROGk0G9zrqniJInCPPDiiPh53vVUU0RcGBEjIqKOwu97TkQk\nsScYEf8BLJX0qWzV54Bncyypml4FjpS0c/b3/zkSOQFerF/eBfQmEbFO0veBByhcFXBDRDyTc1nV\ncAzwTWChpAXZuosiYlZ+JVmVnAXcmu3wvAR8K+d6qiIi/iDpduBPFK6Ce5IEbwvhW0GYmSXKU0Bm\nZolyAJiZJcoBYGaWKAeAmVmiHABmZolyAFjyJH1cUrOkFyU9K2mWpAMlLSpjHzdJ+mr2fUt2x9mn\ns7twTpe0W7n6MiuVA8CSlr0J6E6gJSI+GRGfBi4Chla4669HxChgFPA+cHeF+zP7CAeApW4C8GFE\nXLtxRUQsoOimgJLqJP2rpD9lX0dn64dJmitpQXZP+T/PPlfgpmx5oaTzttR5dtfZ84F9JR1WkRGa\ndcHvBLbUHQJ0dyO45cAXImKtpAOAmUADcCrwQET8JPssiZ2BemB4do95SpnaiYj1kp4CDgKe2tqB\nmPWUA8CsezsC0yXVA+uBA7P1TwA3ZDfSuysiFkh6CfiEpP8D3AuUeovhzu5Ea1ZRngKy1D0DHN5N\nm/OANyh8YlYDsBNARMyl8IEqy4BfS5oSEW9l7VqA71HCh8xkRw+HkuDNyCxfDgBL3RzgY5K+vXGF\npLHAfkVtBgGvR8QGCjfNq8na7UfhswR+SeFuqmMkDQF2iIg7gL+jm9srZ0cPPwWWRsTT5RuWWfc8\nBWRJi4iQdBJwpaRpwFqgFTi3qNnVwB2SvgY8TOGDUwAagR9K+hBoB6ZQ+AS5GyVt3Lm6sIuub5X0\nPvAx4EES+OhR6318N1Azs0R5CsjMLFEOADOzRDkAzMwS5QAwM0uUA8DMLFEOADOzRDkAzMwS9f8B\nX3BIcuP5vEYAAAAASUVORK5CYII=\n",
             "text/plain": [
-              "{'dense_1/bias:0': array([-8.61379430e-02, -1.36635631e-01,  4.30409200e-02, -2.94062406e-01,\n",
-              "        -4.39533353e-01, -1.61162630e-01,  2.06874669e-01,  4.19755787e-01,\n",
-              "         2.48541296e-01,  3.40482622e-01, -4.23337668e-02,  1.23906182e-02,\n",
-              "         2.73789577e-02, -1.56342611e-01, -3.35422635e-01, -5.26740849e-02,\n",
-              "        -1.74929649e-01,  2.87300557e-01,  3.72284725e-02, -4.43669409e-02,\n",
-              "        -2.76613057e-01, -4.61370379e-01,  2.70024896e-01,  9.93161872e-02,\n",
-              "         3.17382663e-02, -3.32814693e-01, -4.42167781e-02, -3.04149359e-01,\n",
-              "         6.87476844e-02,  2.91295443e-02, -1.16239423e-02,  3.40433791e-02,\n",
-              "         3.79291270e-03, -2.01377213e-01, -1.31028414e-01,  1.10448591e-01,\n",
-              "        -2.25179851e-01,  4.11831915e-01, -3.04769337e-01, -2.04285771e-01,\n",
-              "         1.66496366e-01,  3.11328828e-01,  3.05860877e-01, -9.03359428e-02,\n",
-              "         9.87995192e-02, -1.55597150e-01,  3.65341395e-01,  1.36061460e-01,\n",
-              "         3.50002199e-02, -3.86255354e-01,  1.56224608e-01,  2.81186610e-01,\n",
-              "         3.40695567e-02,  1.46951646e-01,  2.53533721e-01,  1.76614806e-01,\n",
-              "         1.12886965e-01, -1.37571767e-01, -1.80490255e-01,  5.99579848e-02,\n",
-              "         1.64817944e-01,  5.65163195e-01, -2.80971918e-02,  4.51668948e-01,\n",
-              "         8.79437178e-02,  3.39676052e-01,  3.48729901e-02,  3.93904805e-01,\n",
-              "         4.83267307e-02, -3.31148326e-01,  9.71089453e-02, -1.01981498e-02,\n",
-              "         2.34710976e-01, -4.14112695e-02,  3.60449223e-04, -1.38507308e-02,\n",
-              "        -1.39980972e-01,  5.05261915e-03, -1.95554510e-01, -1.26979128e-02,\n",
-              "         4.75965291e-02,  3.24146822e-02, -1.96471944e-01, -5.58585487e-02,\n",
-              "         3.30154486e-02,  1.97923154e-01, -4.89876628e-01, -2.66257346e-01,\n",
-              "         7.51771331e-02,  6.97124191e-03, -2.41389396e-04, -1.34414747e-01,\n",
-              "        -1.16782216e-02, -1.99998617e-02, -7.33832195e-02, -3.17755818e-01,\n",
-              "         4.43391174e-01,  4.61285084e-01, -1.14823796e-01, -8.33843499e-02,\n",
-              "        -1.03156138e-02,  9.37729031e-02,  1.71376958e-01,  1.09402396e-01,\n",
-              "        -9.80031118e-03,  2.52430409e-01,  1.39142230e-01,  3.12260568e-01,\n",
-              "         5.60285412e-02, -7.78009742e-02, -4.20839638e-01,  1.46650046e-01,\n",
-              "         5.02209105e-02, -1.65449344e-02, -2.31026947e-01,  4.76942025e-02,\n",
-              "        -9.91276652e-03,  2.14476556e-01, -1.17310226e-01,  3.25387083e-02,\n",
-              "         2.03948960e-01,  1.37504786e-01,  9.40428376e-02, -1.14569580e-02,\n",
-              "         5.81016913e-02, -3.54794553e-03, -1.62604805e-02,  8.76032636e-02],\n",
-              "       dtype=float32),\n",
-              " 'dense_1/kernel:0': array([[ 0.0209299 , -0.06344204,  0.00277872, ..., -0.00021458,\n",
-              "         -0.06049109,  0.03729003],\n",
-              "        [ 0.03705242,  0.04696848,  0.06514321, ...,  0.05242993,\n",
-              "          0.01620064,  0.02033134],\n",
-              "        [ 0.07786088, -0.03025506,  0.07236017, ..., -0.00665735,\n",
-              "          0.02599101, -0.04839775],\n",
-              "        ...,\n",
-              "        [-0.00464657,  0.0266066 ,  0.02190115, ..., -0.07283289,\n",
-              "         -0.06099217, -0.012019  ],\n",
-              "        [ 0.02377901,  0.01525712, -0.01274261, ...,  0.07480299,\n",
-              "          0.05886368, -0.03621945],\n",
-              "        [ 0.01591767, -0.03645648,  0.03281464, ..., -0.04087327,\n",
-              "         -0.06164488,  0.0298573 ]], dtype=float32),\n",
-              " 'dense_2/bias:0': array([ 0.17037725, -0.22025874, -0.05210603,  0.33427325, -0.3439995 ,\n",
-              "         0.5915224 , -0.01238881,  0.16692373, -0.05103152, -0.5832951 ],\n",
-              "       dtype=float32),\n",
-              " 'dense_2/kernel:0': array([[-0.58560854,  0.22957753, -0.2009533 , ..., -0.7760691 ,\n",
-              "          0.11828277, -0.29097608],\n",
-              "        [ 0.0285041 , -0.17422798, -0.03855472, ..., -0.91597855,\n",
-              "         -0.13760991,  0.4436695 ],\n",
-              "        [ 0.25559434,  0.03508702, -0.10079396, ..., -0.17506872,\n",
-              "         -0.035602  , -0.07269247],\n",
-              "        ...,\n",
-              "        [-0.13631049, -0.13071294, -0.12449102, ...,  0.15115076,\n",
-              "          0.17886223, -0.04123615],\n",
-              "        [-0.19577467,  0.14304623,  0.12057992, ..., -0.10881419,\n",
-              "          0.11445075, -0.19772694],\n",
-              "        [-0.8045735 , -0.00519833, -0.01640905, ..., -0.05033771,\n",
-              "         -0.14952046, -0.17301974]], dtype=float32)}"
+              "\u003cFigure size 600x400 with 1 Axes\u003e"
             ]
           },
-          "execution_count": 44,
           "metadata": {},
-          "output_type": "execute_result"
+          "output_type": "display_data"
         }
       ],
       "source": [
-        "another_interpreter = tf.lite.Interpreter(model_content=tflite_model)\n",
-        "\n",
-        "train = another_interpreter.get_signature_runner(\"train\")\n",
-        "infer = another_interpreter.get_signature_runner(\"infer\")\n",
-        "save = another_interpreter.get_signature_runner(\"save\")\n",
-        "restore = another_interpreter.get_signature_runner(\"restore\")\n",
+        "logits_before = infer(x=train_images[:1])['logits'][0]\n",
         "\n",
         "# Restore the trained weights from /tmp/model.ckpt\n",
-        "restore(checkpoint_path=np.array(\"/tmp/model.ckpt\", dtype=np.string_))"
+        "restore(checkpoint_path=np.array(\"/tmp/model.ckpt\", dtype=np.string_))\n",
+        "\n",
+        "logits_after = infer(x=train_images[:1])['logits'][0]\n",
+        "\n",
+        "compare_logits({'Before': logits_before, 'After': logits_after})"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "X7T6pja2bPqV"
+      },
+      "source": [
+        "The checkpoint was generated by training and saving with TFLite. Above you can see that applying the checkpoint updates the behavior of the model."
       ]
     },
     {
@@ -910,10 +918,33 @@
       },
       "outputs": [],
       "source": [
-        "infer = interpreter.get_signature_runner(\"infer\")\n",
-        "result = infer(\n",
-        "    x=tf.constant(test_images, shape=(len(test_images), IMG_SIZE, IMG_SIZE), dtype=tf.float32))\n",
-        "result_labels = np.argmax(result[\"output\"], axis=1)"
+        "infer = another_interpreter.get_signature_runner(\"infer\")\n",
+        "result = infer(x=test_images)\n",
+        "predictions = np.argmax(result[\"output\"], axis=1)\n",
+        "\n",
+        "true_labels = np.argmax(test_labels, axis=1)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "x6nHopKlAD6-"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "(10000, 10)"
+            ]
+          },
+          "execution_count": 40,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "result['output'].shape"
       ]
     },
     {
@@ -929,13 +960,12 @@
       "cell_type": "code",
       "execution_count": null,
       "metadata": {
-        "id": "GHbRasdfasd4",
-        "outputId": "0a44feff-ecfd-4e49-8dbf-6686954595be"
+        "id": "GHbRasdfasd4"
       },
       "outputs": [
         {
           "data": {
-            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAj0AAAI8CAYAAAAazRqkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90\nbGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsT\nAAALEwEAmpwYAACijklEQVR4nO2dedxd0/X/P0tMMSUigyQyCCLIRBIkRcxTDVVaFDVUS1XVUD9t\ntbR8S0v1q/QbOilVaoyxUmqKhEgkZEbIJIYkQhIzEfv3x73PzmevPOe4z5NnPp/365VX1r1n33PP\nPfvsc/azPmutbSEECCGEEEK0dNZq7AMQQgghhGgINOkRQgghRCHQpEcIIYQQhUCTHiGEEEIUAk16\nhBBCCFEINOkRQgghRCFYuyaN27dvH3r27FlPhyKqY968eViyZInV9X6bSl9+8cUX0f7www+jvfHG\nG9dqfx999FG011pr1Zx+/fXXr9X+6ppJkyYtCSF0qOv9NpX+fP/996O9aNGiaG+wwQZJuxUrVkR7\nvfXWizZfDwCwcuXKar/ns88+S15vtdVWNT/YNaSljc3PP/88ef32229Hu1WrVtHmceXhdnlwqZS1\n104fQzz2zer89GZSH2OzqYxLhu+zfrz511lwu3XWWSfaG2200RoeXd2Q15c1mvT07NkTEydOrJuj\nEhUxePDgetlvU+lLfkhOmDAh2vvss0+t9vf8889Hmwdg7969a7W/usbM5tfHfhuyP/mB5R9Kjz32\nWLSvueaaaA8cODBpt3DhwmhvvfXW0f7ggw+SdkuXLo02Pxznzp2btLvnnnsqOfQ6paWNTZ7kAMCf\n/vSnaLdt2zbarVu3ztxHmzZtou2vDZ7A8qS1Y8eOSbs999wz2uuuu27+Qdch9TE2K+1LnkT4SWVW\nLb3aTgjHjRsXbf4jEUj7JesPDgD49NNPo92hw6q5xR577FGrY6pr8vpS8pYQQgghCkGNPD1CVMIn\nn3ySvL766quj/a9//SvZxn/J81+a/q9JbpcHy1hsexc6/0Xy3e9+N9oHHnhgRd9TZPI8PRdffHG0\nn3766Wjff//9mfvbZJNNou3/8mTJha+Jjz/+OGn34IMPRvuQQw7J/C6RzZ133pm8/p//+Z9ob7rp\nptHu3Llz0o69bl27do22966++OKL0eaxue+++ybtWBY94YQTKjr25g6Po7xVEvK8O+w1f/zxx5Nt\n7AEfNWpUtLfddtvM/bPX9Z133knabbbZZtHm+/2vf/3rpN2hhx4a7cMOOyza3bt3z/gV9Y88PUII\nIYQoBJr0CCGEEKIQaNIjhBBCiEKgmB5RJ1xwwQXR/vOf/5xse++996LtU5c5ToPjBnzMxoYbbhht\nzirgdGe/P9bGOdsAAP79739Hm+NNhg4dmrR76qmnIFLyUpanTJkSbe5PzvAA0rRZ7s927dol7Tgd\nlvvz1VdfTdq99NJL0VZMT+3w2Vucap3X55tvvnm0uS99HMjy5cujzXFcb7zxRtKuT58+lR1wCyIv\npicrjsffZ19++eVo+9RzPqdHH310tCdPnpy04/spx9P52B8uK8D3Zn8NzZ+/KonqnHPOqfYzAPCb\n3/wm2l26dEF9Ik+PEEIIIQqBJj1CCCGEKASSt0StYffqFVdcEW12dwOpK9O7atmVy1V6fQVlfs37\n8G53X1U2a39cuJCryHKaNZCmXD7wwAPV7lusgtNc27dvH22WOIHU/Z5XkZn356VMZsGCBTU/WJHg\n5SiWJGfPnh1tL0FyqjTLHsuWLUva8Vjn68GP4X79+tXgqFsGeWUgmBEjRkT73XffTbZtueWW0WZZ\nGEhlRy4GOXz48KTdyJEjo833cV8kkvuM+4vT4QFgm222iTYXrmTZCwB+/vOfR/uGG25AfSJPjxBC\nCCEKgSY9QgghhCgEkrdErfnFL34Rbc7GyFtzh9dc8vD6Pl6O4orKLHn46s9cKZS/11dk5mwudi13\n6tQpacfZW0uWLEm2sXxTVLh6rofPeZ7LniVJ75Zn6ZH3wdcbACxevPjLD1bk0qNHj+Q1Z+JxP/hF\nRVm+ZhnEr93EcglXWM/LNCoKefIWS7ds9+rVK2nn161juI94zPqFevn1K6+8Em0vae6yyy7R5nuk\nz7zi+zNXW/cV9/m5cPPNNyfbuCp3pTJgHvL0CCGEEKIQaNIjhBBCiEKgSY8QQgghCoFiekSt4Qqr\nnE7sK4qyXvv9738/2XbaaadFe6eddoq2r9j5+uuvR5vTYn0cAuvVfEz8eSBdDZrbcfotkFaGnjNn\nTrJNMT3A9OnTM7dxfIevsM1xIRz74+M7+FrKSnMHVo+3EjXHx0hwKjKPRz++OZ09L1bHr7pehY8r\n8fF3RSCv4jVXH+dx48tzcBkOX4Ge46u4nS8rcNBBB0V77Nix0fYxOPzdbPtyJVx5ne+tn332WdKO\nx/MLL7yQbOOYntrG8TDy9AghhBCiEGjSI4QQQohCUDw/oqgz2IXKKebe/c1cfvnlyWuu0snucE5v\nBIA999wz2k888UTm/rfbbrto8yKUviLwH/7wh2hz6r1fGJPdwuzuBYCdd9458ziKAqc1A6mkxdeE\n709OZWWZlEsOANkLMXr3vZdDRc3xEku3bt2ivf3220fbSwx33nlntLlK8IwZM5J2e+yxR7QHDRoU\nbZaagVT68AsUFxE+jzym/Bjg8eHPG99bWSLz98XOnTtHe//996/2M/711ltvXe0xAGloA8tgvtQI\nM2HChMxtdYE8PUIIIYQoBJr0CCGEEKIQSN6qASx1sCs4L6LcuyA5Sp0rXvLCbE0VH3HP8Dnwv5n5\n9re/nby+7777qm3HWSBAKmlddNFF0faVeW+77bZos6vdL3B39NFHR5vlLV9FljNJJk+eXO2xFpnn\nnnsuec3jgiUtn5HDkhZn7flzvOmmm0abx46Xy1iKEbWDpWEAeOyxx6rd5jPndthhh2iz5Pu9730v\nade9e/dob7HFFtHmPgZWzxQqOpx5yve7vPusryzP4yWvAjpLaZy95+/HXHn5zTffjLbPBuNsWs7s\n8tInL5bqJW5+7viFT2uDPD1CCCGEKASa9AghhBCiEGjSI4QQQohC0KJjejh9jm2fmvnGG29Ee9y4\ncdHm6pRA7dJivf7NjBw5MtoXXHBBjffd0LB26+Fz6qvvMr4ychacBuvhCp1e/+eYnAEDBkT7rbfe\nStpxVdJK4RgsUeLFF19MXnOMAF8TfgVoTo199tlno+3j4zjVlm1fjdavAi1qjo+T4vsdpx77GByG\n+8XHnHD/ceq1j/fidOa8+2dLheNgPDyOfPxM//79o+1jdXysYhU+FZ3PN+/fx3Py83TFihXR9vdZ\n3h/vwx874yt5T506NdqDBw/O/FylyNMjhBBCiEKgSY8QQgghCkGLlreYvAXdxowZE+3x48dH28s5\nZ511Vo2/d/Hixcnrhx9+ONq8cGZz4O23366onZce2NXqz6l3ZVYxfPjwzP0fcMAB0Z47d26yjWWO\nUaNGRZsrOgOp9MVSlz8edv+yi1+U4NRzID1fefLW17/+9Yr2z9dSXnXevHIKojK8fM9yF/elH8Pc\nRwMHDoy2lypZ9ub+8tKLl2aKhl/YmO9PLP3xYp5Aer65XAeQSlB51ZCzKjf7vuTnGm/z++bv5evJ\nhxewFOrlTr7HS94SQgghhKgQTXqEEEIIUQg06RFCCCFEIWjRMT2sFbNO6Evnc9otl+/2KcpHHHFE\ntDlt0+uYPXr0iPY777yTbONVbf3qwk0dTu335K2szrEYPi6GYwV4Hy+//HLSjlP6vebNZK2y/tpr\nryXtRowYEW1OmfbpuJxymff7i4pPr620rMOxxx5b7fs+RZljE9q3b5+5P59uLWqOL//AYzOvxANv\n23HHHTPbcR/xd/k+L3pMz4IFC5LXnN6fFQMJpEvt9OzZM9nGyzfwc9HHX3KcKfeDX8aHj4OfrXys\n/rv43u/vE/xdvv/9s2BNkadHCCGEEIVAkx4hhBBCFIIWJW951x+73Ti976677krasXuVpar3338/\naZdV4dlLO7yCLK8mDKTySVaVzKZKXso6pzd6lym/9m7yn/3sZ9W2e+SRR5J2U6ZMiTafX5YLgVTS\nYkmMV1UHsldM99cQp2Ny+qUo4atvs3s87/rea6+9qn1/6NChyWuukO6vK8avzCxqjq/OyzIDjwOf\nvpwlfXm5jMcPyxv+e/3rouFLArDMmLfKOj+vfIkWHjt592r+HO/f3xd5nPMK7F7e4vsDH7t/lrRp\n0ybzu/jeXxfI0yOEEEKIQqBJjxBCCCEKQaPKWywL5S006Lfxa3ah57lFr7/++mhzhhaQuuQ4St1n\nZfHn2C3oj49dtz4zgSvYsvvQV9eszeKm9Y1fTI7JysIC0nPFbkwAuPzyy6vdn2/H537mzJmZx7H5\n5ptHe8mSJdH2btcsvCTjq4NmtS26S746WM7w5zFrIUmfdTJ27Nho52UI+utF1ByfHcf3Nb4f++rX\nWWPLV5zn/uPP+CzWvOr5RcBXL+fMKw6P8BlVhx9+eOY+uC9ZtvQSGb/OG79ZFZ79s5D7uU+fPtG+\n7777knbc5z57K28B69pQ7KtLCCGEEIVBkx4hhBBCFAJNeoQQQghRCOo9psfr8Hmpj0yerltpLMW/\n/vWvaHM1SF81lGNOli1bFm1esRtI02I5XsTrp3mptXw+uEKpr/7MqxU3FSpdZZ01aADYe++9o80r\n2gNpSj/3pdeauc/zqsPyuec4IL8/3kfbtm2j7VPZ/TXAzJs3L9pbbbVVZrsiwWOaYz8qPT++xAP3\ne979Qqw5nTt3Tl5z/2Xdt4DsUg7+Pshxipy+nFdluIj4WFJO/c+LMdx+++2j7e+zWfdM/5zl5x/H\nD/l2HHfDx5QXd9e7d+9o+zgd/lxeHGxdIE+PEEIIIQqBJj1CCCGEKAT1Lm/luaTZreldnCx1+H1k\nSVo33HBD8nrWrFnR7tatW7T9IqDsWmO3m0+l5IqXfEy8oCaQuifz5D3m4YcfTl43RXmLXZ8ePjf+\nvJ100knRHjVqVLLNn7sq/PVQqQuczy+71728xS7Zr3/969HOqtRcHSxxSt4qkVUFfYcddqjo8wcf\nfHDy+oorroi2ZJD6xY9Ffs3yiO8HXhSW8WU3eAyyhKFq2um9ysuFLPHy88Sndnfp0qXadh6WJ71c\nxmOW+yWvbIwPZ8g69q233rraY/Dt/O/n8BG288Ic8pCnRwghhBCFQJMeIYQQQhSCOpO3slzP3i3G\nbjeOCK9JFU5ekG3kyJHR9hHh22yzTbTZLealDpa7shbZA1Z3yVXhj51dt34bu3x5/08//XS1+25K\neFmQ4XPfsWPHZBtnAXj4fOdVua70+siq1u33x9fALrvskrk//l5feVZyy+pkueJ79epV0ecHDBiQ\nvOYMorysyKZYwby54cMG+Jzyte77oUOHDtXuj++/QHqPyKroW1RYKs8LieDx5eUt7hffRyxj8Zjy\nEhFLmtxH/tnK93i+bvyx8zaW3/Lu536hWv79nIXNcllNkKdHCCGEEIVAkx4hhBBCFAJNeoQQQghR\nCGoc01OlKXr9tzYxF4yv9svVbl9++eVkG6/2zelyXOUTSFOs33vvvWj7lDiO7+DfxccApDopV/H1\nKXtZcQ1AqlfmVRmePn06gLpfYXZN8CnrHOPCuryPr3jxxRcz98lac1ZlV6Dyarx8vvOqf/NvqbSs\ngu9L1uGLiq+gzCmvfE9gPT+PvIqziulpWPh8c1q674esmD2uEAwACxYsiDbfj30MRxHh+5E/v3yf\n5Xbdu3dP2vGq9jwOgbQ6Pe8/r1QM39P9c4jb5d3DuZQJx9Xy8fj98XMRSO+7ixcvjrZieoQQQggh\nctCkRwghhBCFoMbyVlY15EWLFkV7/vz50fZuNn7NLrO5c+cm7fKqRrIbj91zfmEy3j/vw6ees3uV\n0805tQ9IF+Rj96zfH7t7/WKk7CZmSYtT8bidd/U1JpWmaG+77bbJ69mzZ2e2ZWmJ959X6iCPrIrM\nfhE73p9PsWfy5K1KF2BtyfhzN2fOnGjz+efq6HnkVXfNk76yykmI2sMlKjj93FdVP+2006r9/E47\n7ZS8njBhQrS5artKP6T3LR/qwPcuDvXo06dP0o4/lzdW8qof83Hw93oJkqWvvNIg/LxjCbpfv35J\nO5bBvFzK+/TP09ogT48QQgghCoEmPUIIIYQoBLWuyPzoo48mr7lKMrvWvATArrCsCHAglbC8S4ul\nIJYcfKVldpOxC9Xvj4+JXXDezcgZW5VKG95VxxktLL95KS3PPdlYeFdo1jF6eWv06NGZ+8zKyPFS\nEvdfXqYgf47tLFkWSDOQfDZSXoZWXbhamzs777xz8poz9dg9XpOFXLPw45vx8qVYc3jcvvrqq9H2\n8tbNN99c7ef79u2bvGap449//GO0fRXuQYMG1fxgmzksJfr7Kj8nOITDnzd+JnH4BZBKRHwf92OK\nKzLzcfh7Lh8T35t9lWi+77722mvR9gs0P/PMM9XuG0hlPP+7aoM8PUIIIYQoBJr0CCGEEKIQaNIj\nhBBCiEJQo8CR9957D4888ggA4G9/+1uyjXU3Tu3m2Bwg1f84PTWvCqPfB8e/sNbIaW9+H6wT+rQ6\n/m6OF+I0fACYOXNmtceQl1bu44I4ZZ8rbfp2VanAXiNtTHzaYlacjNd/X3rppWj737OmKfn+81lV\nmPNipDheYfPNN0+28fXgj11p0sAee+yRvP773/8ebR7fL7zwQq32z9dSXkXmSivCi2x8HB2PLR4j\nfvV0vo8xfsxxPAqnr+f1a1F4/vnno+1jWvg1P5N8vOjEiROjzbE5QDo+2Pb3Tx6z3C++Hb/meDof\nW8fXwJQpU6LtV0/gZ4v//Xyf5d941FFHoTboTiGEEEKIQqBJjxBCCCEKQY3krQ033DCmqD777LPJ\ntmnTpkV77NixmftgiYBlq3bt2iXt+HWbNm2SbSwtsUuW0/6AtHolu8jy0vnYBde/f/+kXc+ePaP9\n3//+N9o+7S/P1c7uPl6E0bv7qqS6plSR2burs47Np7Zzqqp3u9amGmuli4+y/JbnQr/vvvuizX0M\npG5n369Lly6t6DhaMsOGDUtes9TB5z+v6nUePC7yqnKrqu+a48cV32dZcqi0PIAvw8H3BZa68hYa\nLgpcKoWrHQPAG2+8EW0O4fAp6/zs4vIqQLYU78cUP8uyyssAaTgGS2K+HV9TvID3YYcdlrT7zne+\nE+1vfvObyTZ+ZnDoTG2Rp0cIIYQQhUCTHiGEEEIUghrJW61atYpus4suuiizHVeqHT9+fLKNJSeu\nwsiuLwCYOnVqtP2ipeySY/eZlx9YIuMFzvbdd9+k3cEHHxztrEwED7vnuNIkAGy22WbR9rIVS3os\nF3mXce/evWt0PA2BP7/eDVsFZ2sBqcvU/052bbNrNE/KyKq6DGRLX3nyB197XtK86667MvcttzzQ\no0eP5DVf79zv/lrhhUl79eqVuX+Ww/POd1OSgVsKLFtwSABLMXn4bEe+33Ff+ozJInLyySdnbuPn\nKY8bX9V45MiR0faZXbwPvhd6GYwr0LM86ccvhwuw7Z8RLGtzSIxfpJarSftM5rp+BsrTI4QQQohC\noEmPEEIIIQqBJj1CCCGEKAT1spQ3a3L77LNPso1fn3HGGfXx9Q3C/fff3yDf05Qqzfp4nKw4GZ/K\nzXqw30elVZ35dVbVZf86L/aHyyCMGzcu2lWxVNXhv8tXDhVpHA9r/T59udKYHk5R5dgrH7OgmJ66\nh6vk8hiuNMaCY4KAdAxmVeYXq8PPU4459CsQcMkWXwImK4bKp7LzPri/fB/x/ZjHfF5f8ndNnjw5\n2cZxtfVN03miCiGEEELUI5r0CCGEEKIQ1Iu8JVomPgWVK2VySuS5556btHv00Uej7SWhSuW7LEmr\n0iq9/nu4Iuyee+4Z7UMOOSRp96tf/SraXorzlbiLQlbJCAA44ogjon3rrbdG20uhXLXdl5BgfAXv\n6o4BWF3uEmsOL7bL8mGl1a996nHWQpd+IeOi469tPt98D/IrH+QtqszjiPfHC8kC2VIzXwt+Hyx9\n+nIG3Lddu3aN9lNPPZW0Y3mr0jIktUWeHiGEEEIUAk16hBBCCFEINOkRQgghRCFQTI+oGL8cCOvL\necsFdOjQIdqvvPJKso015LpYKTsr3sTHI3FaPZdKb9++fea+fVzQ/Pnza32czZm8mJ7DDz882jfd\ndFO0fSrr3XffHe1f/vKXmd/FsR95pQoqXflbVE6nTp2ivXjx4mhnlZnw+DirrDRnHn9i9Ws763zz\nkk5AuqSEjzfkeB/+3JZbbpm045gcXt3d74/v1Ryn6eNxeNyz7WOEGP/78+43tUGeHiGEEEIUAk16\nhBBCCFEIJG+JivnKV76SvOZKxlyl1Vc1njVrVv0eWB3ClYIBYOONN462d/HuvPPODXJMTY28UgAH\nHXRQtFne8Oeu0lIFffv2jfa0adOi7asCv/XWWxXtT1QO9+XEiROjXam8xWMHSKugc5pzjx49anuI\nhYAlXj73Xl7nMebvwfy5Pn36RNtXbp45c2a0WUryIQssl3E/cx8DaSV2Pj5fCZq3eala8pYQQggh\nRC3QpEcIIYQQhUDylqgYL+dw1D5H5jelRVJrinfjstvVL5rpq48WhUrlDZYtnn322WQbu7efeeaZ\naA8bNixpx659lkR8XyxZsqSiYxKVwxIin/tK+9/D9wvu/y222KJW+ysKWZLOZZddlry+8soroz1q\n1Khk27Jly6LNGVu+ijP3EWfV+UWk33vvvWq3+awszijjzNgzzzwzaZeXfVnXz5Pm+3QSQgghhKgB\nmvQIIYQQohBo0iOEEEKIQqCYHlExvEouAOy4447RZv0/L9bl888/T15zfEDeiul1DX8XH8PWW2+d\ntPvqV78abdbFAWDo0KH1c3BNnErTRr/73e9Gm9NkAeCYY46Jto/jYU444YRoL1++PNp+Be/dd9+9\nomMSlfPtb3872ryiN6ey14TDDjus2vf79etXq/0VhayYFr86/UUXXZS5j9deey3anJa+aNGipB3H\n6uRVyOcK92x37949acdlTvyYbSzk6RFCCCFEIdCkRwghhBCFwGoiKZjZ2wCKucpi49EjhNDhy5vV\nDPVlo6H+bDmoL1sWdd6f6stGI7MvazTpEUIIIYRorkjeEkIIIUQh0KRHCCGEEIWgSUx6zOwIMwtm\n1ufLWwNmNs/M2lfz/gc1/N4atc/Zz0lm1qUu9lVkzGxzM7vNzGab2Uwze8jMen/5J5N9tDWzM+rr\nGEUJM9vMzCaX/y00szfo9bpfvgfRlFB/Ni9q219m1tPMpmdsu8TM9s3YttozzsyONbMLzWxPM8uu\nO9HEaCp1eo4FMBbAMQB+2biHUitOAjAdwJuNfBzNFisVf7kHwE0hhGPK7w0E0AnArBrsqi2AMwCM\nqONDFEQI4R0AAwHAzH4J4IMQwu+qtpvZ2iGEz6v/dN1jZq1CCCu/vKWoDvVn8+LL+quW+6y20I+Z\ntUL1z7gDAVwD4FAAHwB4ZrUPN0Ea3dNjZhsB+AqA76A06al6f08ze9LM7jKzl8zsFnNV0cystZn9\nx8y+63YLMzvfzJ4zs6lm9quc77/KzJ43s8fMrEP5vYFm9mz5s/eY2aZZ75vZUQAGA7ilPMtunfVd\nIpe9AKwIIVxf9UYIYTKAsWZ2pZlNN7NpZnY0ULpuyn32fPn9w8sf+w2Arcp9ceVq3yLqDTO70cx+\nb2ZPAPhtzjh60swGl+32ZjavbO9gZhPKfTfVzLYpv388vf+n8k0YZvZB+a/T8QCKWSmyHlF/Nm+y\nzj+AVmb2FzObYWaPVD2zyv19VNmeZ2YXmdlYlJwSyTOu/CweCOBdAKcDOKe8bXcz61G+N08t/9+d\n9n+9mY0xs1lmdkgDnxIATWDSA+BrAP4TQpgF4F0z24m27QjgbADbA+iF0uSoio0APADg1hDCX3iH\nZrY/gG0A7IxSxwwysz2q+e4NATwfQtgJwGgAF5ff/weAC0II/QFMy3s/hHAXgIkAjgshDAwhfAxR\nG/oCmFTN+19HqQ8HANgXwJVm1hnAJwCOKPfdXgCuKg/EnwCYXe6L8xvkyAXTG8C+IYTzkD2Osjgd\nwB9CCANRusm+bmbbATgawFfK768EcFy5/YYApocQdgkhjK1mf2LNUX82X1Y7/+X3twHwfyGEHQAs\nA3Bkxuc/CSHsFkL4J1Z/xu0IYEoIYS6A6wH8b3nbGAB/BPCP8nVyC0reoCp6AhgO4KsArjez9dHA\nNIVJz7EAbivbt5VfVzEhhPB6COELAJNROmFV3Afg7yGEf1Szz/3L/14A8DyAPih1tOcLALeX7X8C\n2M3M2gBoG0IYXX7/JgB7ZL1f6Y8UtWY3AP8KIawMISxCaXI6BIABuMzMpgJ4FEBXlKQw0bjcGUJY\nWcvxMg7Az8zsApTqbHwMYB8AgwA8Z2aTy697lduvBHB3Xf8AkaD+bL5Ud/4BYG7Ziw6U/tDsmfH5\n2zPeB0rS1qiMbUMB3Fq2b0bpHl7FHSGEL0IIrwCYg9KzuUFp1JgeM9sMwN4A+ppZANAKQDCz/1du\n8ik1X4n0eJ8GcJCZ3RpWLzZkAC4PIfyphoekokWNxwwAR1XzftZCT8cB6ABgUAhhRdml3uB/NYjV\n+LCCNp9j1R9csc9CCLeWpY2vAnjYzE5Fqf9vCiH8tJr9fKK4j3pH/dlMMLMjsMr7dmrG+Z+D1Z+r\nWSEZeX2/P7I9RJ6QYVf3ut5pbE/PUSi5wXqEEHqGELoBmIt0ZpjFRQDeQfUBqw8DOMVK8UIws65m\n1rGadmth1YP2WwDGhhCWA1hqZlUrGJ4AYHTW+2X7fQAbV3DMIpvHAaxnFJ9lZkMALAVwtJm1slLM\n1R4AJgBoA2BxecKzF4Ae5Y+pL5oAXzJe5qH01z5AE10z6wVgTgjhGgD3A+gP4DEAR1WNXzNrZ2ZV\nfS0aCPVn0yeEcE9ZYhoYQpiYcf5rS7yvlr1+a5eDqZNtZZ7Bqvjc41BKUqriG2a2lplthZKH7+U1\nOKZa0djZW8eiFHjK3I3SBCTPtVbF2QBuMLMrQghV3iGEEB4pa8fjSmEe+ADA8QAWu89/CGAHM5sE\nYDlKWjMAnIiS3rgBSjPjk7/k/RvL738MYKjiempOCCGU/1K52sx+glLMzjyU+ngjAFNQ+qvg/4UQ\nFprZLQAeMLOJKEmfL5X3846ZPW2ltMxRiutpVLLGy+8A3GFmJ6A02a3iaADHm9kKAAsBXBJCeNfM\nfg7gETNbC8AKAD+ASvs3BurP5sVq5x/AJrXc141Y9Yy7CqWQgioeAHCXlZJJfgjgLJSey+cDeBur\nrhOgNMkZjVIowukhhE9qeTy1RstQCCGEEKIizOyvAP4aQni2hp+7EcCD5eSfRqOxPT1CCCGEaCaE\nEE5t7GNYE+TpEUIIIUQhaOxAZiGEEEKIBkGTHiGEEEIUAk16hBBCCFEINOkRQgghRCGoUfZW+/bt\nQ8+ePevpUCrn5ZdX1TMyWoPU0vVIwUHa6667brXvA8CKFSuivdZa2fNA/tw221S3qkXdM2/ePCxZ\nsiSrKnGtaSp9yaxcuaoYa6tWrZJtn366qojo55+vWuzZ9zm/bt266a39OmnSpCUhhA51vd+m2J/M\nO++8k7z+8MNVxV55XPl+X3/9VUW227dvX09HVzuKNDaLQH2MTfVl45DXlzWa9PTs2RMTJ06sm6Na\nA/bcc89o801yvfXWS9p98smqukd84fH7ALBo0aJob7zxqsKS/BD2rx966KGaHXQtGTx4cL3styH7\nkh9qfpLCLF26NNqbbrppsm327NnRXrJkSbT9Q5KvgX79+tX8YOsZM6uXImwN2Z9ffPFFtP0fEL4/\nqvjHP9Il8saNGxdtnsT6fu/TZ9XSPKecckrmMVV6jWV9piafq6IljE2xivoYm+rLxiGvL5tFnZ73\n3nsveT1jxoxod+iQPTH/+ONVhZH5ocl/PQKpd2eDDTaI9meffZa0y/susQo/WeQHoX/Q8CSFPW7e\nS8N92bZt22o/AwDrrLNOtL/73biiBa644opKDl1UQJ43lJk6dWq0TzzxxGTb0KFDq90f9x8A/O//\n/m+1+/CTK56wVDoBqukkRwjR/FFMjxBCCCEKgSY9QgghhCgEmvQIIYQQohA0i5geH3jMWjzHj3CG\nln/NAZI+5oRjhji+wEfdN8VsoKZIVjArANx+++3J64suuijaHANy5513Ju3OP3/VYukvvPBCtB99\n9NGk3b777hvtM844I9ocLAsAa6+96tKvTRCsKPHSSy8lrzkpoGPHjtEeP3580u7iiy+O9vLly6Pt\nx9hf//rXaD/11FPRHjt2bNLuggsuiLa/DwghRBXy9AghhBCiEGjSI4QQQohC0Czkrbvvvjt5zYXO\ntthii2h72YrriXBqNL8PpOnQLIOw2x0A3nzzzWhPmjQp2oMGDcr/ASLCshIAdOnSJdo///nPo33w\nwQcn7f7zn/9Ee+7cuZn7HzFiRLQrLQomSSsfvtYB4N577402jwkA+MpXvhLtZcuWRbtdu3ZJu223\n3TbaixcvjraXtwYMGBBtLiGxySabJO24JMHw4cOjvd122yXtmlqBQyFEwyJPjxBCCCEKgSY9Qggh\nhCgEzULe4gwOAOjcuXO0OUOEM0eAVEpZsGBBtLnqMpBmG3G1Zi/FsBt+woQJ0S6KvJVXtp+lh+ef\nfz5pxzKHz8R79dVXoz19+vRo+2U+uAoz9/+sWbMyj5fXaOO1u4BUVuOqzp06dUraVVp9uKXB2VD7\n7LNPso0lIpapAKBv377RnjdvXrRvvvnmpB2Pmd69e0fb99P9998f7QMOOCDaXrZ69tlno80Zffw+\nAHzta1+LdkOtnyeEaDoU844uhBBCiMKhSY8QQgghCoEmPUIIIYQoBM0ipodjMwBg8ODB0eZ0c7/i\nNqemb7TRRtH2q6dzmnqbNm2qtYE0vsOn6haBvNTumTNnRvu5555LtnHcB8dvAMDAgQOj/cYbb0T7\ngw8+SNpxmvSOO+4Y7SVLliTt+HrYcMMNo81lDgDglVdeiTZX8PWrfBcpxXnatGnR5lia3/72t0k7\nLgXg49569epVbbulS5cm7U4++eRoz5kzJ9offfRR0m7y5MnR3mWXXTLbcYxW165dq/08APz+97+P\n9nXXXQchRLGQp0cIIYQQhUCTHiGEEEIUgiYrb7311lvR9pWWOU2d08h9ejHLFpyyzmnpQCp9sUTm\nF6nkz3GFZ5HKF1tvvXWyjaWqDh06JNt4sdfNNtss2l5WmjhxYrS5XACnSAPA22+/He33338/2rzg\nrP8uvm5YHisaXHmZK2DfcMMNSbv77rsv2nwegTSVnBcjfeCBB5J23O+c2u7LTrAMyeUEvOTNEhlX\nf95+++2Tdl/96lchhCgu8vQIIYQQohBo0iOEEEKIQtBk5a2FCxdG21dQZrhKsF+skDN7OOOLK/8C\nqfzCUpeX1bIqNxcVPm8sJXHFZCDNBOrXr1+yzVdoroL7AUgz7liC8tlW3GecbeavIX7NmUA+K6hI\nPP7449Hecssto80ZdkCa1ej7ieXG+fPnR9tfE3vvvXe0Z8+eHW2fgckZZSx5ehmMpS+/D+b111+P\nts/8K1KmnhBFRZ4eIYQQQhQCTXqEEEIIUQg06RFCCCFEIWiyMT28eraP2+BKu4yvGMxp7xw3wBV9\ngTT9tUePHtHmlHcgrT6rlPV09XReHXvzzTdP2nH8BaeUA2lf5sVMbbLJJtHm68Gv/M4p1Bx7wtW5\n/WuOF/LxIPy7Wnqfcxo5l3jgeDggjc/xMVlt27aNNpcx8GOYVzhfvnx5tH3sFY9Njhvj7wHS62X4\n8OHRvvvuu5N2nALvq3Qrpke0VPg+ybYv81IbnnrqqWjvsccea7y/Svnwww+T11nzAo88PUIIIYQo\nBJr0CCGEEKIQNFl5i6u5epc3u7VYEmG5BVi9+m8Vu+66a/KaFyVkiYylDb/NS19FhM83nw8vPXA1\nZH9OeVue25VlFC5N4GUwbsep7f4aYsmNK2/7Y2fJJ+t6ailkSVMPPfRQ0o7Pg69gzdImV1pm27/m\nsc7VlIG00vKpp54abb/gL4/h0aNHR/uZZ55J2vF14K9FIVoq/OzKWziaOeuss6L92muvJdt23333\naD/22GPR5lIXANCtW7eKvovvwX4RY+bKK6+M9p133pls45IbecjTI4QQQohCoEmPEEIIIQqBJj1C\nCCGEKARNNqbn1VdfjTanHgNpijGnwnqd/6STTqp236ecckry+vrrr4+2T21mOH6I7aLC8Rwc0+PP\nDbfzpf85PoTjePJ0Z9Z8/VIh3H+cYu51Yo4FytOQi7TcyKBBg6J94oknRtvHxXCczbvvvpts4zIR\nHBfES5YAaTwYp6L7/uS0cl5CglPPgXT5kKzlZ4A0bsnHDwnRnPDPqtrE7fBYBoAhQ4ZE+1vf+la0\nd9ppp6Qd3+O5TMgPf/jDpN29995b0XHk3YNvvvnmaN92223R9vcUjg3MQ54eIYQQQhQCTXqEEEII\nUQiarLzFqcJ+9XR23XEFXV9N9+yzz6523+zC8/tjl6GXaVgukbyVpofnVVPm9HCWPIBUbmApgyVM\nIHV/8nf50gEskXFfckVnIE1r5grd3i3sKz63JHgFcwD417/+Fe1jjz022t6NzumlXnrmVdd5m+/P\nvCrYTFaFbe8O537na+LAAw9M2i1cuDDaTzzxRLLthBNOyDyOloKvQs3yBkuEQJqm3Ldv32j/+c9/\nTtrxeevSpUu0/bXB5SkYf31VWiWYx2alck5TJe+3ZG3LO09+vPF1z/c7/4y84IILot2/f/9o+5IT\nXDZmu+22i/ajjz6atOM+/9nPfhbtr33ta0k7DlMZO3Zssm3EiBHVthswYEDSrmvXrqgEeXqEEEII\nUQg06RFCCCFEIWiy8ha7r9llDqQuPs4M8gtd9urVq6Lv4oUG2dXqszvYNewr0RYRdqFypVsvCbFU\n6fuIs3jyXLd8PXAfeVcwuz9ZhvHcdddd0e7du3e02T0PrL6gZkvCL9jHLvAbb7wx2r4i88UXXxxt\nPncA0KlTp2izbPXGG28k7YYOHRpt7uuOHTsm7XgM8iKlvh3LpkcccUS0X3zxxaTdlClTou0zUpqS\nvJUlq2bJOD7rjSVgrlR77bXXJu14IWZ/PbBMuNVWW0Wb5WogXeD1j3/8Y7S91HH//fdHm6viVyrT\neCm7uUtaTN5vydo2ZsyYzM/wGAVS6edvf/tbtL20yBmSEyZMyNw/P//4Wv3qV7+atGOJ87rrrov2\nDTfckLTbeOONo+0zfLt37x5tvm+MHz8+acfPmTzk6RFCCCFEIdCkRwghhBCFQJMeIYQQQhSCJhvT\nw6mqebEZXJXRp6dWCseZsBbOsT5AWn02r3JzUeBVqrNWSwfSSpk+nZ3LALBO7GMUGN7m4wG4X3ws\nGHPPPfdE+7zzzou2jxvwVT9bEttvv33y+vLLL4/2/vvvH22/uvzdd98dbZ+WvMUWW0Sb++bWW29N\n2nG8HceVcEVnII1b4GtswYIFSTuu6swcfPDByeu99tor2v73N0UqTef2JTSef/75aF999dXR3nbb\nbZN2Rx99dLS5IjeQlpPguK5x48Yl7f7yl79Em2MzOL4LSGOteDXun/zkJ0m7ww47LNp+PBYRXp2A\nYyC5xASQ3md/8YtfJNs4Xotj93wcFz9r+V6dV/me4x75mQAA3/jGN6LN/fryyy8n7fge4Fdm33ff\nfaPN95vbb789aVfptSJPjxBCCCEKgSY9QgghhCgETVbeYmnCV/FlFxy7/q666qrM/bE7zruI2dXK\nKXverc8uPm5XVDiVkiseexcnV/Nk97dvy25STj0H0j5jO68ydl4aPUuanE7NVUiBli1j+kU7Z82a\nFW0+r4sXL07a8fjzbm+WKHkfXo6aMWNGtNkt768d7jdOgedqwUAqPe+www7R9hIL/+apU6cm23zf\nNyZVY6u2ld9ZquJSG7VdZJUXoGXbM3fu3Gj/z//8T7Jt8uTJ0WbZmGVVv4/OnTtH2y9uy9dGnvyS\ndQ0BwN577736j6hDPv3003jN8WKZQFp2ge99XqrlY+b+Y6kWSFca8OnmfM/ke7W/vvg88vlmScwf\nI495P355Gz/Tvcy62267RdtX7ubj4AVMOQQGSO8pecjTI4QQQohCoEmPEEIIIQpBk5W3OKvHV8Vl\n1yi7LvOyMfIyftgdzq5VL8W8/fbb0c5aPK8l4/uBzyn3V15lTL+o4YYbbhhtrrrs5a2sRSl9Zh9L\nbhzN7ysCv/nmm9HOkyqLJG9xtgaf7zvuuCNp95vf/CbaPHaANOOHzx1LjQDwrW99K9ovvPBCtccA\npOPxoIMOijZXZgVSF/g555xT7b6B9Prz1xhnxvDvaGg+++yzeE2y5Aik55EXYvb3NF5IkiWHZ555\nJmnHv9mPb74GeKx76YSlD5ZO+vTpk7Tbb7/9os3VtTnjD0glDM7e8/dj7r+8RXF5G/9eYPXFp+ua\nxYsXx0rEXA0cSO+ZjF9MlzOW+Bm0fPnypB3LZXxfBdJxNH369Gj7ex+fH5am8u79jP9NfI0OHjw4\n2s8991zSjit55z2f86r2b7311tUek0eeHiGEEEIUAk16hBBCCFEINOkRQgghRCFosjE9/fr1i7Zf\nTZX1RdaG/QreTN5Kvly19Zprrom2jz9h7bq2qZ/Nmbwqyazl+zgEhjVeINWAef++EjKnVuaVH+Bt\nrOvzKsNAmsrsY1sY1pDzVrJujkyaNCl5zdc0pzn76qkcc8AreANpKir34ejRo5N2O+64Y7T5evEx\nF3wce+yxR7R9VWCO3+JVmX1MD18HfjVnjpdozJieVq1axfReH0vBqfocj+HHAd8/eVVtD8f7+Iq2\nHBfC8SLf/OY3k3Zc8oNTzGvLaaedFm2OD/SxLnmV+jnWM2vFeqD++3nTTTfFUUcdBWD1ZwaXceCy\nLD5lneMPOb6HS4H4bRzDA6SVl/k8+uuG98HxY3w9AWm6PI/RkSNHJu0eeeQRVAL/Zv/cZfiarG31\nfHl6hBBCCFEINOkRQgghRCFosvIWL4T397//PdnG7jl2f3pXOy+amOfi5NRKXuzMu/54H1kLHBYJ\nPj/sCuXFDj1e3sqq4OvTibOkJO/y5mPK63OuDurlGyZLLss7puaCT/veddddo81prVwtFUjLNUyb\nNi3Z9tlnn0U7qyoukEqFfO2wxOTb8fn3JQzY1c1968snsJvej2GWcBqTVq1aRdnFL5haNFg2b460\nbt06plz36NEj2ZYlBXoZnSXOOXPmRNtXSh81alS0TzrppGQbVxvnSsZ1vaDroYcemrz+z3/+E+0B\nAwZE2z9b+V7qF4rmewfLdH5x4jxZjJGnRwghhBCFQJMeIYQQQhSCJitv5UkdHKXN7W6++eakHctb\nXgZh2rdvH23O0Jo/f37m9/rKsUXASwosX3CWk18YkPFVVdldydKId/GyGzYva4OvB5bO/DXELt6s\nas9A5XJZc4QXgATSiqa8zWe+sVvZV7pmlz3LR36BUM484kwTXyGWXdaLFi3K3B/3Z+/evaPN1xSQ\nVv/145szaDiLRYja0qpVq3gtean1scceizbf0/y9ijPM+vbtG20v55x55pnR7tWrV7KNxwFnLeaF\nafD9k20gvRfys9XfKzg7l6trs9QFpPfgvEVQ+Tf7bC2/AGkW8vQIIYQQohBo0iOEEEKIQqBJjxBC\nCCEKQZON6WG8dsf6IsfW+NV/awOnB/qKtaw7eo2zCPj4CI7pYV0379z4WAmOoeL4Gd/nXDmW23Es\nEZD2ER+fT33l4/DxIQz/rpa24vqDDz6YvGad/g9/+EO0DzjggKTdoEGDou1TT3faaadoc8XZnXfe\nOWnHKyfzefXXB8c6cByAjxvjNHpOez/33HOTdlyewMcj/exnP4t2z549IURdwuVQqntdxauvvpq8\n5vspV4/31ct5LPoyHHz/5Huhjx/i+ymnjvtq0nyf5fsG388BoEOHDtV+l7+X8j44ts7DMaH+mLba\naqvMzzHy9AghhBCiEGjSI4QQQohC0Czkra985SvJ61tvvTXa7OKqi+qS7Nb2bjZ2EeYtvtlS8Snb\nXlqqwlfK5EVh/Wc4PZGlJF9igF/zuc9LI89bZHa77baL9ksvvZTZriXLW7/73e+S11yhmeVF7zZm\nt7pP92e5mVNt/WLAnNrK55UXVwTSNF++drw0wLI0y7Cnnnpq0o6rS/v+9JWnhWgMuHREHn4RUFEZ\n8vQIIYQQohBo0iOEEEKIQtAs5C2uNAkAd911V7RZwvDR7Lw4m69QmQVHh/tqlewO52yRolCpvOWz\nobgKrt9HlmToM8Cyqj97iYK35WWRcWYCf6+XLVl+y6sE3Rzh8QGk0hSfh2233TZpx5VkR44cmWzj\nxWZZqrrxxhuTdiwdc5bXiy++mLRj2Yr356tJv/POO9HmSux+AVOu6uyzEfn+wVknQoiWgzw9Qggh\nhCgEmvQIIYQQohBo0iOEEEKIQtAsYnr8yq2cCsuptV6j5wrNlcb05K3mzfEn/ruKSNbq5D6WhlMw\n/Qq6vAovx+P4drwt79z7z2Wx4YYbVnu8fuViTllvaX3OK9wDafwL24MHD07acdVlLkcApGnfU6ZM\niTbHCwHAMcccE+0ZM2ZUu28gjS361re+lXlMXKH5wAMPrHbfQHq/8L+/iFXWhSga8vQIIYQQohBo\n0iOEEEKIQtBk5S1Obfap0fvtt1+077777mj7isz33XdftNmdngenMvvqsHxMLa06byVw+jCQLSXN\nmzcveT1s2LBoz507N9nG1Ztbt24dbV8SgKU0ljy8BMntsuQ3/13Lly+vdt/A6pWhWxK+JAOnjvOi\nhxtssEHS7uGHH462P1/cH7z44Pbbb595HLx/X2WW0+pZ1u7YsWPSjlPR+Zri8Qyk5RT87/fXtxCi\n5SFPjxBCCCEKgSY9QgghhCgEmvQIIYQQohA02YAFjpnxsSMHH3xwtHlJCo7TAIDXX3+9xt/bpk2b\naPsUZY4z4RTZouDjZ7KWLfCxEZxe7Jeh4Dgs3oc/vxybwdeGTzvmOA1eosQfE6dG8wrgHNcCpEsw\n5MUINUd8/Myuu+4a7VmzZkV7nXXWSdrxyuc+jo7jo8aNGxft9u3bJ+0effTRaHMauS8tMX78+Ghz\nLJ/vJ44j6927d7SHDx+etJs5c2a0N9lkk2SbX01eCNHykKdHCCGEEIVAkx4hhBBCFIImK2+xNOHh\nqq9crdmvss4ps1wddsCAAZn7Zpe3r87Lbn5Ony0KvnQAv+b0fi8LHnXUUfV7YMRmm21WUTuW3Fhe\nefzxx5N2LAF5Ka2507179+Q1r57Oqd1+LE6dOjXaXbp0SbbxmGHJqV27dpnHwbKpr4rMr1m69GOT\n5S6WULniN5CmtvtK775MghCi5SFPjxBCCCEKgSY9QgghhCgETVbe8lJKFuyinzx5crKN5aj//ve/\n0c6Tt9iFnrcAIbvJi8L8+fOT15ypw9LiL37xi4Y6pDrhRz/6UbS33HLLZBtLpL4Kd3OXQ3z21rXX\nXhttXqzX8+1vfzvazz77bLKNMy1ZNvSy4+zZs6PN49TLVvyaZba8zMo+ffpEm6U4/7pnz57Jtkrv\nOUKI5os8PUIIIYQoBJr0CCGEEKIQaNIjhBBCiELQZGN6KuXCCy+MNlfWBdJYAV+ZNYujjz462p06\ndUq2cZr6PvvsU5PDbBH4Fau5QjGn+u+5554V75PTixsrpuLII4+Mtq8w7FcRb0n4FeS//vWvR9uP\nJaZv377V2p5TTjkl2oMGDUq28bXDae8+zqZz587R5pXafbtDDz202mPw38txQd26dUu2KaZHiJaP\nPD1CCCGEKASa9AghhBCiEJhfADK3sdnbAOZ/aUNRl/QIIXSo652qLxsN9WfLQX3Zsqjz/lRfNhqZ\nfVmjSY8QQgghRHNF8pYQQgghCoEmPUIIIYQoBE160mNmm5nZ5PK/hWb2Br1e98v3IJoTZra5md1m\nZrPNbKaZPWRmvWu4j7ZmdkZ9HaOoHPVn88PMLjSzGWY2tXyf3aUO9vmkmQ1e0zaiMuqjD2nfe5rZ\ng3W1v8agSdfpCSG8A2AgAJjZLwF8EEL4XdV2M1s7hPB5Qx2PmbUKIbTcwi2NiJWKpNwD4KYQwjHl\n9wYC6ARgVg121RbAGQBG1PEhihqg/mx+mNlQAIcA2CmE8KmZtQegPy6bEU25Dxv6eZ1Fk/b0VIeZ\n3WhmvzezJwD81swGmtmz5VntPWa2abld/MvBzNqb2byyvYOZTSjPgKea2Tbl94+n9/9kZq3K739g\nZpeY2XgAQxvlRxeDvQCsCCFcX/VGCGEygLFmdqWZTTezaWZ2NACY2UZm9piZPV9+//Dyx34DYKty\nP17Z4L9CVKH+bH50BrAkhPApAIQQloQQ3jSzi8zsuXKf/bk8oa26x/62fN+cZWa7l99vXfbwTTWz\n2wG0rvoCM7vOzCaWPRG/aowf2cLJ6sN5ZvYrGl99AMDMNjSzG8r9+0LVuDOznmY2ptz+eTMb5r/I\nzIaUP9PLzAaZ2Wgzm2RmD5tZ53KbJ83sMjMbDeBHfh+NQgihWfwD8EsAPwZwI4AHAbQqvz8VwPCy\nfQmAq8v2kwAGl+32AOaV7WsBHFe210VpQG4H4AEA65TfHwHg22U7APhmY//+lv4PwFkA/rea948E\n8F8ArVDyEryG0sBeG8Am1L+vAjAAPQFMb+zfU/R/6s/m9w/ARgAmo+SJG0H31XbU5mYAh5btJwFc\nVbYPBvBo2T4XwA1luz+Az+le3K78f6vy5/vTvgY39jlo7v9y+nAegB+W7TMA/LVsXwbg+LLdtvy5\nDQFsAGD98vvbAJhYtvdE6fk7DMAkAN0BrAPgGQAdym2Opv5/EsCIxj4v/K9Jy1s53BlCWGlmbQC0\nDSGMLr9/E4A7v+Sz4wBcaGZbABgZQnjFzPYBMAjAc+U/YloDWFxuvxLA3XX+C0Sl7AbgX6EkKy4q\n/8UwBMAoAJeZ2R4AvgDQFaWHqGjaqD+bKCGED8xsEIDdUfLU3W5mPwHwvpn9P5QehO0AzEDpj0QA\nGFn+fxJKE1QA2APANeV9TjWzqfQ13zSz76E0ye0MYHuU/nAVdUBOHwJpX1WtObM/gMPM7Mfl1+uj\nNJF5E8Afy5L0SgAci7cdgD8D2D+UvEh9AfQF8N/y87MVgLeo/e119wvXnOY66fmwgjafY5V8t37V\nmyGEW8tS1VcBPGxmp6L0F+VNIYSfVrOfT4LieBqCGQCOqub9rAWRjgPQAcCgEMKKsny5fkZb0fCo\nP5sh5XvdkwCeNLNpAE5DyVszOISwwEqxldwvn5b/X4n0ebJaATgz2xIlb/2QEMJSM7sR6uM6p5o+\nPLG8qbq+MgBHhhBe5n2U+3kRgAEoPUc/oc1vodRvO6I0OTIAM0IIWeEflTyvG4xmF9PDhBCWA1ha\npSUDOAFAlddnHkreG4BuvmbWC8CcEMI1AO5HaUA/BuAoM+tYbtPOzHrU/y8QxOMA1jOz71a9YWZD\nACwFcLSZtTKzDij9FTkBQBsAi8sPyL0AVPXX+wA2bthDF9Wg/mxmmNm2VTGOZQYCqHoYLjGzjVD9\nRNbzFEqTWJS9AP3L72+C0gNwuZl1AnBQXRy3WEVGH+ZVhH4YwA8pTmvH8vttALwVQvgCpedqK/rM\nMpScBpeZ2Z4oXSMdrBREDTNbx8x2WOMfU080V08PcyKA681sAwBzAJxcfv93AO4wsxNQugFXcTSA\n481sBYCFAC4JIbxrZj8H8IiZrQVgBYAfQOXDG4wQQjCzIwBcXXbHfoLSxPVslHTqKSj99fj/QggL\nzewWAA+Y2USUNOyXyvt5x8yeNrPpAEaFEM5v8B8j1J/Nk40AXGtmbVHylL8K4HsoPeSmodR/z1Ww\nn+sA/L0sa01GaVKLEMIUM3sBJS/gHABP1+nRCyC7Dw/JaH8pgKsBTC1PfOaV244AcLeZfQPAE3De\nmhDCIjM7FCVZ+hSUJsPXlENO1i7vc0Yd/q46Q8tQCCGEEKIQNGt5SwghhBCiUjTpEUIIIUQh0KRH\nCCGEEIVAkx4hhBBCFAJNeoQQQghRCDTpEUIIIUQhqFGdnvbt24eePXvW06HUjgULFkT7448/Tra1\na9cu2l988UW0y3WYIkuXLo12p06rKt+3adOmzo6ztsybNw9LlizJqmJba5piXxaBSZMmLQkhdKjr\n/ao/G54ijc1PP/002uutt94a74/v1a1bt85p2XDUx9hsin3JLFmyJHn9+efVL4K+1lqpf2TddVct\n3N62bds6P641Ja8vazTp6dmzJyZOnFg3R1VH/OhHqxZunTZtWrLthBNOiPYHH3wQ7bXXTn/2yJEj\no837O+SQrHpOKTyhAla/QNaEwYMH19m+mKbYl0XAzOql4KX6s+Fp6WNz5cpVq+/Mmzcv2ltttVWN\nPw8ArVqtKurL9+q+ffsm7fwfpQ1FfYzNptKXWfzlL39JXi9btizaPAHaaKONknZbbLFFtI844oj6\nObg1IK8vJW8JIYQQohA0i2UonnzyyeT1iBEjos2u1nfffTdpd9ZZZ0Wb/8rYYIMNkna77rprtO+4\n445o33///Um73/zmN9Fm6awuPTtCCNEUWLFiRbQ5jCDP08MV/vme63nzzTej3a9fv9oeYovEr5KQ\n5fny7dgzs8466yTb2OvGSoeXKrO+y7/P8uSBBx4Y7VGjRlX7eX98Xm1pSPS0FkIIIUQh0KRHCCGE\nEIVAkx4hhBBCFIImE9Pz8ssvJ69/+9vfRnvWrFnJtv79+0f7xRdfjLZPfWzfvn20OTXPZwtwynqe\n3nn22WdHe+utt4726aefnrTr2LEjhBCiObP++utH+69//Wu0fYrywIEDo52XeXXfffdF+w9/+EO0\nDzjggDU4ypZHXkwPZwr7WFIfx8OceeaZ0ebnWufOnZN2nIr+ySefRPuzzz5L2m288cbRnjx5cub3\nMvxszcvsq2/k6RFCCCFEIdCkRwghhBCFoN7lrTw31nXXXRftZ599Nmm34YYbRnvnnXdOtnGhJHbB\nvfTSS0k7lrtYcvLH9Nxzz0X7O9/5TrQ33XTTpN17770X7bfeeivap512WtLu+uuvjzZXeAby3ZNC\nCNFU4JT1MWPGRJvvl0AabnDyySdH+5JLLkna8b3ahxiIVfjnAvdDnoT10EMPRft3v/tdsm327NnR\n5nIrXo7s2rVrtLmsgH9m8udYjvNy2fnnnx9tDg9pSDnLo6euEEIIIQqBJj1CCCGEKAT1Lm/lubF4\n/ZXNN98883O+eiNnWx122GHRnjlzZtKOJairrroq2t7tuv/++1f7veyOBdJKzptsskm0/dpbt956\na7TPOeecZJskLSFEc4ClFL4/+0UpOazgBz/4QbQ5+wtIwwU6dKjzNXdbDP55kiVpHXvssclrXk3A\nr5XFzy6WpnhNSiB9ZjJ+MW8OHWHpixemBYALL7ww2ldeeWW0r7322qTdUUcdFW1/fdV19WY9gYUQ\nQghRCDTpEUIIIUQh0KRHCCGEEIWgwSsyczwOx8x4jZfbeY2Pq0G+/fbb0d5zzz2TdosWLYo2651b\nbrll0q5Pnz7R/vDDD6Ptq1By6iBrmj4e6fXXX492Y1aeFEKIuoDvd2+88Uayje/HXK3ZV7Tn+z2X\nJBGV88QTT0T73nvvTbb16NEj2vysAlZ/hlbhn3Hz5s2L9vbbbx9tH6uzbNmyaHPslo/j4n7mYzrl\nlFOSdlzVm1c7ANKU+LyK35UiT48QQgghCoEmPUIIIYQoBA0ub82dO7fa9316OLvTvCTE6XivvfZa\ntLliMpBWh2RJa+HChUk7dumxq9ZXU2bXGstW77//ftKOf8vy5cuTbVwNUwghmgMsdXB1XyA7pdq/\nz/dFrvzrqWs5o7mRV9bkT3/6U7T9c5ElLB9WweeUU+J9H/FrrsjspcqsPvKLpfIx8b79b+TSLg88\n8ECyra6vAXl6hBBCCFEINOkRQgghRCFocHmLI//Z3eklJ86I8rLViy++GG2OIvfVJDnjgNu98MIL\nSbv27dtHmzO5FixYkLRjlyFXsvTZW4xfBHXYsGGZbYUQorHw0gTLCpyF4yvkZkkdPjzgnXfeyfwu\nkQ2fq7Fjx0abqywDaXaUl4R4H9zOy1YsmbEMxlnNQPps5X3n9StLXbyiAQA89dRT0eaVGgCgX79+\nmfusDfL0CCGEEKIQaNIjhBBCiEKgSY8QQgghCkGjxvSwnug1Q9b/Nttss2Tb/Pnzo82Vm301SN5/\nx44do73ddtsl7TiVjvfh0+h79+4d7UcffTTafkVbjhGaMWNGsk0xPXWL15A5rqtLly7R9tfX73//\n+2ifeeaZ0faVYtddd93M7+YYL1XaFs2dvNTgV199Ndp5KdVcasSX8uD7ON/Da3IcReT222+P9rvv\nvhttHxfDMTj+HLZp0ybaH330UbR95Wa+p/G90K/8zv3Mz8y8WKK89/n1VVddlWy78cYbq91HbZGn\nRwghhBCFQJMeIYQQQhSCBpe3WDLitG92nwLAxx9/HO2ePXsm29hNytIUp0QCqfTFLj3vdu3Vq1e1\n+/OSBVdXHjduXLT79u2btNt///2j7X+XqIy89Nk5c+ZE++yzz07anX766dF+/vnno/2jH/0oaccu\n43//+9/RvvXWW5N2hxxySLR9SQROGf3e974XbS/HFjU9d8SIEdGePn165rY8il6dt6nAC1127949\n2cb3TC+DMNx/vpSHyOaZZ56JNj+TvDTFeFmen6f8OV+RmcNKePFYD49LlsS89Jk1fv0CqPy7xowZ\nk/m9dYE8PUIIIYQoBJr0CCGEEKIQaNIjhBBCiELQ4DE9vKQEpxFzfA+Qrorut2211VbR5rT0CRMm\nJO3efvvtaPMqwX5/rHGy9unLfPMx/e1vf4v2hRdemLTj+CGfKi0qIy9+g2Ow7r///sx2I0eOjPZ+\n++2XbONSApx+2a1bt6Td6NGjo+1LIjC+NH9jwtcwkJaMz2uXl57P5KXnP/jgg9HmVZq5ZAQAfPvb\n3472r3/962j78591HfhVpCs9PlE5r7zySrQ7dOgQbb90AcOp0b7v+LWPjxPZcGwix8z4WEEev/7c\ncywt38d8bE1WH/n9Zd0rPvvss8x2/F3+2Pma8s/dukaeHiGEEEIUAk16hBBCCFEIGtwnP2/evGiz\nS8u7pI877rho/+Y3v0m2cZodu/v8auycwr548eJoT5kyJWnXv3//aGe544A01Z3T6L07juWzoqYr\n1yePP/54tGfPnp1s43RaruTpq3Dfe++90ebKo96Nyy7e3XbbLdnG3/3AAw9E+/jjj0/a5Ukx9QFL\nR0BacXr48OHRzpK91gRORd95552j7d3hW2yxRbS5fICXwY444ohob7zxxtH29ws+x7Udc0qJT2FZ\nhWULf56yUqB9RXu+V7/++ut1dpwtHb7P8HXvr3MuF+BTx1l+z5OZuB3vw6fH+2dj1vdmtfP3RP5e\nH35S18jTI4QQQohCoEmPEEIIIQpBg8tbLBfwwpzLli1L2nFmyTbbbJNsY5cZV/b0keOcScCyGmeV\nAMBXvvKVaj/jF8Vj9zpXBfYVnjk63rvqOLOrvqPUv4wsGYDd19zGV1utNEuG+8u7SbP24eWQyy+/\nPNp87n1G1eabbx7tP/3pT9EeMmRI0o7P/d577x3tdu3aJe3Gjh0bbc4GBFLJ7O677462l7caIrPr\niy++iJmCXjrgDDe+/nwlcf7tfH58v7/22mvR/vvf/55s4/PP45vlPwA4/PDDo81j/6GHHkra8fjm\nrD2fjdejRw/UFO9iz5IHipoNNn78+Gjz+fDnje8RedlFfF/p3LlztH3V+q233rqWR9wyWbRoUbR5\nTFUqOQFpv3A/eKmS95G3gCnvj9v5Cs98jJXKx/ysBtKwFb/Iam2Qp0cIIYQQhUCTHiGEEEIUAk16\nhBBCCFEI6j3YwMfZ8GvWBX18C7/2cTEcA8Bavo8L4jR13seOO+6YtMta+d3HCbC2uNFGG0Xbx4Es\nWbIk2hzjAAALFy6MNscoNAaVaKx5bSqJCQLSmJZK41s43RxIdd5+/fpF21e85hXOOW7Ax7mcccYZ\n0WbNvE+fPkm7fffdN9peTz7llFOizdfaP//5z6Sdj/GpDz7++OPVVjKvgs/RLbfcEm0u1QCkcVRs\n+5iLadOmRduP79133z3anPJ8wAEHJO14fPN3HXjggUk7Pq+zZs2K9rhx45J2HF+1ww47RHvw4MFJ\nO64s7GN1ihq7kwVXLed7tY+343tm3kravI1jPbi0CKCYHg/fZ/ka9WOPz6+Pu8qL42E4PofjhzgW\n0L/mY/LxfwwfU147z8svvxxtH5tZG+TpEUIIIUQh0KRHCCGEEIWg3uUt7xrnKszs4ly+fHnSjqUJ\nL4mw25Sryvp9sNt0r732ija7yYFUjmK85MZuPD4GTmX3r73r1qe3NyY1rVxbW/c/n6vrr78+2fbC\nCy9Em92pJ510UtKO08pvvfXWaM+cOTNpx9fUsGHDMo/p//7v/6J9zjnnVHs8QCppcWkDIK3+zPbE\niRMzv7e+WLlyZZR333333WQbjx8eI/fcc0/SbtNNN402n0d/fQ8dOjTavXv3TraxDMIp8X6MsXuc\n5WF/7DwGs843kErPY8aMifZzzz2Xub+2bdsm21jO5srQXvLMW3CzJcGScl6len7N15qXX7I+wwub\nAsAuu+xS42NtSbzxxhuZ27LKidQVWSVKfJ/z/cFXYc6CP+Ml0rzfMnfu3GhL3hJCCCGEqBBNeoQQ\nQghRCOpd3vIZVVnyls8k4awnn3nDmVOcOeC/i12BvD/vTuXjYDebX8CUXXycBeLde1xNmo8VWF2C\na0xqusCizwhguYsX8PT9xZKRz8Q78cQToz169Oho+wVCuQozV0b2ixr6BSuz4N/OGUL+N7IM4zPK\n9t9//2hzP3vphSsY1xdrrbVWXDjVy7cnn3xytHmhXC8l8blk6cdXveZ2U6dOzTwmPideKmbpgzMa\nvdub5UX+jB+bLI36bMqsY+d+B9JK7Xy8//M//5O0q8rG8/eblgZft9tuu220fSVghseVD0vIqnjN\n2YAizVbKw2dAVSoz5cH3dA7N8KENfE/g48g7Jn5++nGel83FqzjUBfL0CCGEEKIQaNIjhBBCiEKg\nSY8QQgghCkG9x/R43Twr3dxXLmbd2MdtdOrUKdqffvpptL2GzO0ef/zxaPs0Z66MzGm7/nv5eFnj\n9PpkVuVRII33aSrkpQuy1pqXsj558uRo+/PGK++ef/75yTaujl0VkwIAL774YtKOY6g4RsgfO1dD\nPv300zOPl+E+mj9/frKNU7J93BmnfJ9wwgnRHjhwYNKuIWIWli1bFldT53IPQBqrwrEwviI4xyLx\nOOXPA+mY8zFQ/piq8LFsfE1wHFZeTA/jY4R4rPMx+bRpjkXwsUp8bvg3+9i33//+9wDSSt4tAd+X\nPI45NiMvFT1vZW6+bvhewjFdIo1fzMM/7/hemLcqel47huNv/bXB/Vzp92at9O735+EYzrpAnh4h\nhBBCFAJNeoQQQghRCOpd3vKVWLm6K8sKW265ZdKO5Q2f5sxuUpbFFixYkLRjKYnTWFmmAlJZhbf5\nKpRZlVi9hMXtvPzipZ/GpOrY8iQKdqH6NOHZs2dHm2UhXhAUSOXECy64INl2xx13VLv/bt26Je14\nIdEnnngi2r5CJ8upLGlyRWfPgAEDou0li6OPPjrahx56aLLtoIMOiva3vvWtaPuq2w3R559++mms\nfr7VVlsl27gyMi9K6ksLZKVs57me/TaWPtj27mweI+y+9mOMxyNLYh6+z/Ax+b5gyc1v4xR77jNf\n4qJqH3kyT3PES7sMXw9+kV/ulyzZw79mabEhSjo0J3xISBZ+TPF9vLbV87MqMvu+zFrA1D9LeDyz\nvOWfmXlj25fWWFPk6RFCCCFEIdCkRwghhBCFoN7lLZ+txG5N3sYVVYHUxdemTZtkG7uh2V3to9nZ\nDcsyWF4lWs4k8AsSZi0WWukCfEDTcolXuSX9MWbhXZz33XdftLmKqJdzOLOL5RUgrfrJMkdVJlIV\nZ599drSffPLJaP/qV79K2nH/XXrppdH28hZnE+VVcc7LLOFjYvyipT7rqz5Ya621ogTx7LPPJtv4\nmuPx569Frj7NWVN+bLLcnCeNsovdX2P8mt3ePnuLYRe4r3TO9wv+HV6S5WvTu9j5mHgM++u56pq7\n+OKLM4+1OfLSSy9lbsuTJrjPuJ2/Nvj+wec6b4HNIsJhAx4+v17C4udpnlyUB0taXbp0ibZfOJvH\nRF4Vbr7HcGa03x8fr9+HsreEEEIIIWqBJj1CCCGEKASa9AghhBCiENR7TI+H9WDWzX1czIwZM6Lt\nV4/l1xzT41P4WEPk7/V6J6fFcoqd10yzVob2lW0Zr09yvEFj8uGHH2LcuHEAgOuvvz7ZxjEbWbEX\nfhuXIvAplxwb5VfM5fiTUaNGRZvLEng4Pitv1XrWjXfZZZdkG+vf++23X7T5mgGA2267Ldo/+tGP\nkm3bbLNNtHfaaado+9TfP/zhD5nHWFd0794d1157bbQZLtfAqd0+podjM/g69TFw3Nd+3PI1wteH\nT23n88/j1sf08DFm7Ttvm19xPW/1eG7Lq4rz9cFcc8011b7fXKk0tsb3OZOX5sz3YO7zrFjJouLL\nvPD1zOfQjylul1dln7f5djyO+F6dV7Yib1zy/XmvvfaK9r///e+kHV9T/vns43/WFHl6hBBCCFEI\nNOkRQgghRCGod3nLS05Zaad+gclhw4ZFu0+fPsk2lpZYgvKpbexq4/RJn0rJLkN2x3nXH7ve2d3n\nUzj5mLwLPc813JC0bt06plKfeuqpyTY+j0uXLo22Lz/ArzmF0bfj8/Hzn/882cbnkdOkuQIzkKaB\ns0Rx3nnnJe14YdI8GezXv/51tLkysV+sk68Bv41lUa7q7a/XhujzVq1aRWnusssuq/fvEy0PLzNV\nKqvw9Z234CjDkkhTuSc2FXw/8DOTZecePXok7Vh2Hj9+fLKta9eu0ebQgbw+ytvGcJ/7EAi/mkIV\nPoyAJSwvkeWVxagN8vQIIYQQohBo0iOEEEKIQlDv8paXiNh1xe4zn2Xx/e9/P9pz5sxJtj3//PPR\nZjlj2rRpSTte6JL3791lLIOwK5EXYASAb3/729Heddddo+2lE38cjHf/NRZrrbVWlGR23333Rj6a\nhoczxYQQq2dvZS0k6u+fWTJIXtYt789nauZl2hYBL2+xjM6ZlAMHDkza8bPVV2Xn53CebMXt8mTH\nrAVH/b55fyxp9e7dO2n36KOPRttXgM/LHKsNTeMJLIQQQghRz2jSI4QQQohCoEmPEEIIIQpBg6es\nM6wN77bbbpntfMXjrArIw4cPz9xHnobMmmlt4LgiIP8317U+KYQQdYFfkZ6r0+dV+OWSFHmxIxzT\nkxfbyHErnTp1yjnilomPmcqKa+IKx0C6ioGn0lhS7jNOgfcrCdSmzACXIfFxOxzT448179qrDfL0\nCCGEEKIQaNIjhBBCiEJQ7/IWu0iBbOknLzXRu/sqdacy7DKrrZyV9V3sBvTH5+Usv8ijEEI0BXz1\n3ErTxbMWsPQLNmftz1e050Wkiyhv+WdmVkXiww8/PHk9efLkzH1mVdT2UlLWgrH+ucWf43Z5C0Xz\nigZ77LFHsu3yyy+Ptn+Oc6X+ukCeHiGEEEIUAk16hBBCCFEINOkRQgghRCGo95ieJUuWJK9ZW2TN\n16+sWims//nUtkpXia0U1jv52H1MD+uaflsRy6oLIZo+n3zySfK6apkaII0r8TEmfF/kVGYfm8L3\nTI4R2XLLLXOPo2hw7Itno402irZP+/7www+j7Z993EeVlk3h5TB87A9fA3nLUDAcm+OvDb5u/PFp\nlXUhhBBCiFqgSY8QQgghCkG9y1t51SXZpdW5c+c1/q5K5aw8GSwvBT5L3vIp8Czh+cqVXu4SQoim\nwNNPP528zrpX+fsdv2Zpxkv5WfdZL2e9/PLL0R4wYMCXHXaLg2VFIC0lkCf98fn28lFWNWxfVoD7\niJ93/lnIr3l/Pkxl/fXXjzZX/PbVvxl/7FzJuS6Qp0cIIYQQhUCTHiGEEEIUgnqXt3zUN0eEc+XN\nvAhtH83tXXI1JU8Gq03Gl3fp8W/xLl7vuhRCiKbA6aefnrzmKrmcbcX3cAB46623ot2uXbto+0rL\nLH3lLWa56aab1uSwWxwPPfRQ8pozoD/++OPMz7366qsV7T8vE49lR36u+eciP4M5hCNvcdCpU6dG\n+xe/+EXm99Y38vQIIYQQohBo0iOEEEKIQqBJjxBCCCEKQb3H9Jx88snJ60mTJkWbY3oGDRqUuY/a\nVmuua3x8UhU+3Z5f+2Nv27ZtnR+XEEKsKZdccknyul+/ftGeOXNmtH1cSe/evaM9cODAaPtYnQ02\n2CDanJZ+7LHH1u6AC4KvvJwFx0xxqjiQxpay7eOuOLaG95EX+8P4dhy71adPn8xjb0jk6RFCCCFE\nIdCkRwghhBCFwGqSKmZmbwOYX3+HI6qhRwihQ13vVH3ZaKg/Ww7qy5ZFnfen+rLRyOzLGk16hBBC\nCCGaK5K3hBBCCFEINOkRQgghRCFo9pMeM9vczG4zs9lmNtPMHjKz3l/+yWQfbc3sjPo6RrE6Znah\nmc0ws6lmNtnMdqmDfT5pZoPXtI34cuqj/2jfe5rZg3W1P1E7zGxluW9nmNkUMzvXzJr9M6PoFP2Z\n2TQK4NQSKy0Icg+Am0IIx5TfGwigE4BZNdhVWwBnABhRx4coqsHMhgI4BMBOIYRPzaw9gHW/5GOi\nidCU+8/M1g4hfP7lLUUFfBxCGAgAZtYRwK0A2gC4mBvpnDcf9Mxs/p6evQCsCCFcX/VGCGEygLFm\ndqWZTTezaWZ2NACY2UZm9piZPV9+//Dyx34DYKvyXzVXNvivKB6dASwJIXwKACGEJSGEN83sIjN7\nrtxvfy4P0CrvzG/NbIKZzTKz3cvvty7/xTLVzG4H0LrqC8zsOjObWP4r9VeN8SNbMFn9N8/MfkXj\nqw8AmNmGZnZDuW9fqBp3ZtbTzMaU2z9vZsP8F5nZkPJnepnZIDMbbWaTzOxhM+tcbvOkmV1mZqMB\n/KjhTkNxCCEsBvA9AGdaiZPM7E4zewDAIzl9vEN53E4uj9Ntym3/XfYeTa+6P4sGQc/MEEKz/Qfg\nLAD/W837RwL4L4BWKM1gX0PpRr02gE3KbdoDeBWAAegJYHpj/56i/AOwEYDJKP1lMQLA8PL77ajN\nzQAOLdtPAriqbB8M4NGyfS6AG8p2fwCfAxjM+ypfA08C6E/7GtzY56A5/8vpv3kAfli2zwDw17J9\nGYDjy3bb8uc2BLABgPXL728DYGLZ3hPAgwCGAZgEoDuAdQA8A6BDuc3R1PdPAhjR2Oelpf0D8EE1\n7y0t31NPAvA6jbOsPr4WwHHl99dF6Q+TIwH8hfbZprF/a1H+6ZkZmr2nJ4vdAPwrhLAyhLAIwGgA\nQ1DqrMvMbCqARwF0RamDRQMSQvgAwCCU/nJ8G8DtZnYSgL3MbLyZTQOwN4Ad6GMjy/9PQmnAAcAe\nAP5Z3udUAFOp/TfN7HkAL5T3s329/JgCktN/QPX9tD+An5jZZJQmKOtj1UTmL+X+vhNpH20H4M8o\nTXxfA7AtgL4A/lvez88BbEHtb6+r3ydyMbL/G0J4t2xn9fE4AD8zswtQqp3yMYBpAPYte293DyEs\nb7CjF1kU5pnZrGN6AMwAcFQ171s17wHAcQA6ABgUQlhhZvNQGpyigQkhrETp5vhk+aF3GkremsEh\nhAVm9kukffNp+f+VSK/b1QpNmdmWAH4MYEgIYamZ3Qj1c51STf+dWN5UXT8ZgCNDCC/zPsp9vAjA\nAJSk9k9o81so9dmOAN4s72NGCGFoxiF9uAY/R1SAmfVCqV8Xl9/ic15tHwN40czGA/gqgIfN7NQQ\nwuNmNgglr+3lZvZICOESiIag8M/M5u7peRzAemb23ao3zGwISi7Yo82slZl1QMkjMAGlILzF5c7b\nC0CP8sfeB7AxRINgZtua2Tb01kAAVTfLJWa2EaofmJ6nUBqUMLO+KE2aAGATlG7Iy82sE4CD6uK4\nRYmM/surOvswgB9SjNaO5ffbAHgrhPAFgBNQcq1XsQylB+VlZrYnStdHBysFUcPM1jEz9gSKeqR8\nH70ewB9DWetwVNvH5YnSnBDCNQDuB9DfzLoA+CiE8E8AvwOwU0P8BgFAz8zm7ekJIQQzOwLA1Wb2\nE5T+UpwH4GyU4g6moOQJ+H8hhIVmdguAB8xsIkoxCS+V9/OOmT1tZtMBjAohnN/gP6ZYbATgWjNr\ni1IczqsoSSXLUHJ9zwPwXAX7uQ7A38uu18koDVKEEKaY2Qso/VUzB8DTdXr0Iqv/DslofymAqwFM\nLT8U55XbjgBwt5l9A8ATcN6aEMIiMzsUwCgAp6A0Eb7GzNqgdO+6GqU+FvVD67JctQ5K/XwzgN9n\ntM3q46MBHG9mKwAsBHAJSrLJlWb2BYAVAL5ffz9BMHpmahkKIYQQQhSE5i5vCSGEEEJUhCY9Qggh\nhCgEmvQIIYQQohBo0iOEEEKIQqBJjxBCCCEKgSY9QgghhCgENarT0759+9CzZ896OpT65fPPVy0C\n3KpVq2RbuZ7Wavh0/qx29cm8efOwZMmSOv/ixurLlStXJq+XLl0a7U8//TTa666bLtrNfcE2fwYA\nPvlkVVHftdZaNaf3fd6p06pK6htssEFFx14XTJo0aUkIoUNd77epj81ly5YlrzfccMNof/jhqvI8\nfsx98cUX0eb+3HTTTev4CGtOSxubRac+xmZj9eVHH32UvF6wYEG027ZtG+127dol7dZZZ51q9/fZ\nZ58lrxctWhRtvgd37949aefv4w1FXl/WaNLTs2dPTJw4sW6OqoF59913o+0fcuuvv6qqNt90eaIE\nZF8Q9cngwYPrZb+N1ZfLl6fL7Nx5553RnjdvXrS7deuWtFuxYkW19qxZs5J2/HrjjVcVDPV9/uMf\n/zjaO+3UcAVhzSyvcnGtaepj8957701e77zzztGeMGFCtP3NlSex6623XrSPPrrxF+ZuaWOz6NTH\n2GysvvTfyfe7Qw89NNrHHHNM0q5Lly7V7u+1115LXl999dXRfvXVV6N97bXXJu0aa/Ke15fNoiLz\nFVdckbz+7W9/G+3OnTtHe/789HdutNFG0eabp/+rc5NNNqnW5ocrAOy5557Rvu222yo4cgEAU6ZM\nifb3vve9ZBv/BcLel+OOOy5pN3r06Gg/+OCD0T755JOTdvzQ5H7mv24A4Nxzz432qaeeGu1jjz02\naec9RC0ZnvCzh8V7ONnj4tltt92izX/1jR07Nmm39tqrbj38vfwXJAB8/PHH0T7ooFWridx6661J\nu0suWbV004ABA6LtxzD3Z97vEKIpwtfzH//4x2Tbf/7zn2j7ZyFPZq666qpo//rXv07asQeVx+hb\nb72VtNtuu+2izX9Q7rXXXkm7XXfdNdpf+9rXot2Yf7Ro1AshhBCiEGjSI4QQQohCoEmPEEIIIQpB\ns4jpYW0RAPbff/9oc0wIx/AAadAsx/S0b98+ade1a9do77LLLtGeO3du0q4hA16bG/fff3+0n3zy\nyWQbx+pwACsAbLbZZtHmjAOOAwLSWJG999472hysDKTXygcffBBtn6XAsWAPP/xwtJ966qmkXd++\nfaN91llnoSXDsTuVxjJx5hUAPPfcc9Fu3bp1tA8++OCk3dNPr1r4/vXXX482n28gDV7mmB4fI8RB\n8BzT0xjJB0LUJc8//3y0L7300mi/8847STu+9/ksKh7bO+ywQ7T5Hgmk8Tk8djbffPOkHSf/cMJP\nnz59knZLliyJ9ogRI6LtY4nGjBkT7TZt2qA+kadHCCGEEIVAkx4hhBBCFIJmIW9xATsgdX+xhOUL\nMnFtni233LLazwBpWizvo1+/fkm7hixi1xzgGjvsgmU5C0gL0bHkAQBDhgyJdocOq2pJsRwJpEUN\nOS198eLFSTuWRzgl2deZYNcwp1/6Yodz5syJ9n//+99k23777YfmRlaxPw+nqM6cOTPZxv35r3/9\nK9n2j3/8I9qcJst9C6T1O376059G20vZLF9yiu7jjz+etOP+5WPo379/0m7gwIEQojnxgx/8INos\nOflwDr53+RpzDEtdLB8D6fMvT+7m7+J2vs4WHy+XDfH7O+GEE6LNoRL1gTw9QgghhCgEmvQIIYQQ\nohA0C3nLV4RlaSKvcizLYJw15CURXrpg9uzZ1X4PAPTu3bsmh93i4Qycjh07Rtuvt8LypHd/ctv3\n3nsv2pzVBaSyGH8mz+3KEpnPZmDZhPfhZRh2ybYEeStP0jr77LOjzf00bNiwpB1XfvX9xNWVOeNj\nm222SdrxWGXJiSuiA8CoUaOq3d/777+ftGOJ+sQTT4y2z+7jirE33XRTss1X7RaiMZg8eXLymiUi\nln/zQgAqzb707Tgri/HPVv6uvHsKP5/5OeBDRTiLzMvp22+/feb+a4M8PUIIIYQoBJr0CCGEEKIQ\naNIjhBBCiELQLGJ6fBqxTzmvgleIBVLN86WXXoq21/l9HEEVHJ8ArB6PUjQ4bgJIY2tY4/UxU6wb\nc+wPkMaEcAyW15qz4i14JXUg1ZAZ38f8mo/X6+Q+Ponh67C+q4jWB/fdd1/ymtPzufr0+eefn7Tj\nkgRc3RVI47I4jsdfE8cdd1y0L7/88mj7Vda///3vR3vHHXeM9tZbb52041iEW265JdqjR49O2vEq\n1VzhGQAefPDBaPtYpZYCV8UFgDPOOKORjqR+4Xu3j0dp6vjK7zwuudSDvzfxPdM/u7KecTwe/Gs+\nbz5uh2OL+J7rn9X8zOTYJH5e+OO96667km0XXXRRtcdeW+TpEUIIIUQh0KRHCCGEEIWgWchbXAEW\nSNPb2G3npQ2WsVia8fJWVpqerwS96667VnjELZMXX3wxec2uS3at+kXseLFP72bNkrR8ZV52m3Jf\neneqd5tWd6z+exkvl7HM4SW3adOmRZsXRG0u+N/KFcx/+9vfRpsXZwXSqtd+UV4+J7ww7AMPPJC0\nO+ecc6LNqeheimAp7Zvf/Ga0vVzGi5ZyKYEXXnghaXfzzTdH25ekYLnvlFNOQXPCV+Dl8fPEE09E\nm/vYb/PXN8uTPIZ9Fd+mSHOTtBiuhg6ki/pyxeQ86T1v8e1KF+nme6tPMef7KS/g7avxe5mtCh8q\nwfv3C1ZL3hJCCCGEqAWa9AghhBCiEDQLeYtdq0DqGuvRo0e0vSuN5S524/nFJ7NkFS/TbLHFFjU5\n7BbH/Pnzk9cse3DFZC8bZC0652Fpyp973j/3l5c+s6qXLlmyJPN7+Zj8sfMx+cwzf4zNDS9bMdzX\nfpFOlpL8eeWq1UceeWS0H3vssaTdmWeeWe3+fX/+8pe/jDbfB7yMeccdd0R73LhxyGL48OHR3mqr\nrZJtr776aubnmiJ8f/NyMG/jBVi97DN9+vRo+3PPmXQsq3hJhOF7sL8fZ0lO/n3+LWxzZqDff9eu\nXZNtfIws/Q0aNChp953vfKfaY2poeAFdLztzSACfD3/e+HzMmDEj2caSJD8zu3XrlrTj7FWWkH04\nAN8fOFxk/PjxSTse23xv9mEJnB3mr68pU6ZEe8CAAVhT5OkRQgghRCHQpEcIIYQQhUCTHiGEEEIU\ngmYR08NaIJDqkLzyudeQJ06cWK3NKXZAWt2VtWtfhbLoqzD7tFiOq+jVq1e0n3vuuaTd22+/He0u\nXbok2zj1kWM2fIokfxevsO3TNlkP53Rz33ccj8P7ePnll5N2e+65Z7R9mmVenFBzYPbs2clr1tk5\nhsrHwHEswRtvvJFs43gBju/597//nbTjMc3xAv764PF96aWXRpuvKc/Xv/71aA8ZMiTZ9u1vfzva\nX/3qV5Nthx12WOY+myJ5q1vzSvMcp+bHAfelX7mexwW38/EYWWnJeVV3mbwVwfn+4EuS8H3cX4dc\nWZhLkrz55ptJu6YS08OlA3xVeL4v8rny/c9xPD52ic+Bf54yfN74WvGrHfD1wKVcfOzP2LFjoz10\n6NBo+xg07mcfK/noo49GWzE9QgghhBAVokmPEEIIIQpBs5C3vMubqz6y2867U9lFf8ghh0R7zJgx\nSTteNJHdh7zQm//eIuLd2nyuuI98yuGECROi7eUGTkNld3qey5vxlZb5cyxVelg+49RMXzHay2yM\nr27b3PDjgMfPwQcfHO277747acfjiqvFAmnfjxw5Mtq+3xnuQ64EDaT9wZLWXnvtlbTjhTOPOuqo\naF9xxRVJO5bIJk+enGx75JFHMo+xsciShIBUYpg3b16yjasps+TrZQWWbL18y2OJbe5/f4wsj/jv\nyvpMXvVkvib9PYHHt0+35+uSnxFeIvNjoLHg69I/Z6688spoP/vss9HmiudAKud7CYtlob/97W/R\n9uc+S2byx8T3ai59MXPmzKQdV1vnvvQLC/P+/Jg94IADUJfI0yOEEEKIQqBJjxBCCCEKgSY9Qggh\nhCgEzSKmx6fBsabM6Xxer2Ut+/jjj482l/wGUj2RtWG/GrtPdS8aPk2YNWSOi/HnjfvFr9DMabIc\nU+DjBji2hvVlX5qe23FMgY/v4aUUevbsGW2fLsk6tI/v8St9Nzf+9a9/Ja8PP/zwaH/3u9+NNsfm\nAMCsWbOiveWWWybbuA/5+rjwwguTdrwaO8cp+LgSXl7gG9/4RrTPPffcpB2nZd92223R9nE7nA7c\nv3//ZFuHDh3Q1PAxPZymzGPHx/TwGOH7oC/xwDEcPgWa43PyYuy4lEVeDA7Hj7Dtv5df8/3Cx5/w\n8fl4ToZ/oy+74dP0mwK8RIt/zfE4v/nNb5J2PBa5RAuQxi7xvdUv38Hb8u6fvBr7Qw89FG0f98mx\nZbx6/Kmnnpq0O++889BQyNMjhBBCiEKgSY8QQgghCkGzkLe8S5bdbuz+9PIWu4Y5rc67SXkf7DL1\neMmlaPgVyHmVapaL9thjj6Qdu5C93MCr8LKL2stgfO7Zher7PGuldt933I63+Xb8XV7eqjStvrmw\n8847R5srbPv0Uj4Pffr0SbY9/fTT0WZJmaUuIK3Uyv100EEHJe24iitffyyxAcCkSZOizXKZb8ep\n7t27d0+28ers/L2Nib++WfrhUhBcVR4ARo8eHW2+D3oZiK9vn2KetaJ33grZ/Bk/PrhdpdJU1ueB\ndKz6ezo/M7iy8NSpU5N2Xu5qLPJWp+ffzRWkvbzF91kvO/MYYxnXl/zg642lZn8/5mPMu4bmzp0b\n7Y4dO0Y7T87y10Nd32fl6RFCCCFEIdCkRwghhBCFoFnKW+xaYxenz6Zhl5x39zFZVXzr283W3PDn\ng6vv8rnffvvtk3YPP/xwtCvNlvAVRbkvOWOLs/eA1F2b5zb311QVXsLi4/VZaVn7aC7w4qBAurgq\nLxrr+4yzSV544YXMfbLkcuuttybtWD7irMjx48cn7bjqNWdx+n76+9//Hm12y/t2LIP47DXO7Goq\n8lZeRWbOhtlnn32SbVmVkb1cxtWKvdTD9zs+b17CYPklr9Jy3j6y2vHv8Meet/gmv+b95ck5jQkf\nY16fM3nSnL/38Tjg+6fvBz4O3r+vss8ZgXx/yFuYtFLq+zkrT48QQgghCoEmPUIIIYQoBJr0CCGE\nEKIQNIuYHl8JmbV3junx6eYcc8EVSn0sBqfcZa0sDNROn2xJeJ04K2bG679cLsDHh2TFxXitnWOt\neFveKuisSfu+zCo/4CuUvvTSS9H2x87xARx74uOMmhKcuupXaeaVmH/wgx9Em1NNgXTlZJ/KeuCB\nB0abz4lfjZ3LGtx///3R5hRXAHjllVeiffTRR0f71VdfTdotXbo02ieddFK0OX0dSGP7OC0fAP70\npz9F+/zzz482l2ZoaGp7z8mK9/CxL3lxIbzNf47h8ch2paus5+07KzbH7yNvW6UryTcVKu1zf+/M\n+xz3ZV78ELfj/vPjl++TedcQ3x/8cyELf0x1/dyVp0cIIYQQhUCTHiGEEEIUgmYhb/n0ZZY6WKbw\ni6JxJU5mk002SV6zLMZyiXfd82KkRSQvpZUlxzzJyfcRS2S8jzw5il3UXmLja4W3+VRVhtMvu3Xr\nlmzLS1nnY+J9NGV5i+Uif475+r7pppuifeyxxybtlixZEm1eBBRIpSCWoA444ICkHaefc58NGzYs\naTdq1Khqj9eXp+C+OOecc6LNFWwB4MUXX4z2WWedlWzj6s38vX4ByKYC39+8tM+SJMv3/l6aFSoA\nZFcJzpMbWBLx9wuWvupCsuBj8lJalszmF6/29/imQF3IO3mp+Lz/PGkqq+oykN4rtt1228zv5Xtw\npan49R1GIk+PEEIIIQqBJj1CCCGEKATNQt7yLsmsyHzvgvMyVhXexcvkLWBaRDjbx8PZA1zZ1Ufp\ns4uTs+g8WYvdeTijw2dUsQTFckje/vx1w/CxeymNr5VKF01sbFgWYqknj69//evJa5byuN+BNGOL\n3dR+LLI0w5khnEEGpNcSZwHygqVAWgmWpR5/7A888EC0vbu9f//+0ebf2BhUHVteZtPw4cOjfeed\ndybbeBxwP+TJD3kLieZJDtwuqzqzf81j00tTWVJa3kKc/p7O4z3re6v77uaEvzYqlY+YSp9xvv+z\nJFMf2sDVnyuVrZS9JYQQQghRB2jSI4QQQohCoEmPEEIIIQpBsxA0vQ7LaXasSfr0w0pXwc6K2+B0\nzqLyzjvvRDtP/86rbOrTi7M+xxp1Xt/lHQfHh1R6vHmrCeeltObFAjVVunfvHm2v50+aNCnaO+64\nY7QPPfTQpN29994b7UWLFiXbOO2bz6WPy2M4/dVXSeY4nt69e0d7woQJSTv+Lbfccku0zzjjjKQd\np9H7Mgac5u1jixqaqjgGH8/A9yeO2+nTp0/S7rjjjos2V6H2sWdZcY9AOgYrjelhfBo9n2/eX13E\nbPjYMh63nIrvx3NTrMhcKbU9dh4reXFA3C958UPcr83hfMrTI4QQQohCoEmPEEIIIQpBs5C3vGtt\ns802i7Z3azJZ8oZ363M7dsk2xWqdjYl3XfJ5zCsDsGDBgmj37Nkz2VbXqd61SUFlGTOrijeweno8\nyyG+0nRT5e233472uHHjkm3jx4+PNp8TPw6WL18e7ZEjRybbWPJjWYGrtgLAk08+GW0+51/72teS\ndpw6Pnv27GgPHDgwacdSGkuyvsL2G2+8EW1fjoGrRvvrtKGpkg+89JMlH+y0007J60MOOSTaLAX6\nCrx83fqK47ytUgmKj8+Pbb5f8D29tqVBWFbx44+3sUy3cOHCzONtblR6fmsLS1i+j7LKGeTJZU2l\nBIw8PUIIIYQoBJr0CCGEEKIQNAt5y0snLGGwvOWzabJcsl4CYdc4u+3yFqksIt4VzC7lPCmQq6P6\n7KhKqQt3bRbsJvZZY1tssUW0p02blrmP5nKt8LXvz+nf/va3aL/11luZ++DfytlQfv9cJZmlKSDN\n8mIJyy/q26FDh2izNOWzlVjuevDBBzOPneXrf//738m2P/zhD9Fu37595j4agjXNaOLK51wVl6s4\nA8DcuXOj/frrryfb+NznVTfPqkzu5YxKs4aYvCyvvAxPPt686s95FeebOv4c5slbWefeS2RrKvf5\nPuf9NZV7pDw9QgghhCgEmvQIIYQQohBo0iOEEEKIQtAsYno8rOVyCu78+fOTdl26dKn28163nDNn\nTrQ5zZa18KLC+r0/b/w6L6aH02Tz4oJYX65U//Xfm5UC7+MQWPPmOA8fy5KXis80l+rMPHa6du2a\nbHvhhRei7VPMmaVLl0bbn2+uiM2rlj/zzDNJOz7PQ4cOjbaPU+AxyPEXfmzzuOXx7OHv9VWiOWbr\nsMMOy9xHQ1JpzAWn6QNp7Apfw1x1GwCGDBkSbZ/eP3Xq1Ghzv/i4Kz5GbufjSviY8tKhs35jXryJ\nP09Z9yNfiqBHjx6Z+2zq5K1G7uOf+BzzNn9Os/oyb3955Qf4c02lrIc8PUIIIYQoBJr0CCGEEKIQ\nNEt5ixewvP/++6PNrnUgO+3TL4THblx2i3LKbVFhaSpvsb68ysosI/h2WSUC/CKlfBycSuvlJ+7b\nvGrdDF83XG0YqLzCc1NJx/wyuNJyu3btkm0sWwwbNixzH5xivmTJkmQbp/jz+feL93KqNF8ffmxy\navsrr7wS7W222SZpd/PNN0ebU9ZHjBiRtOMx7SUhluD23nvvaj/T0ORJOlxdmBdmBdIFRzl93ffX\nPffcE22/yOrWW28d7Xnz5kXbnzeu5MzjMU/e4m01kWmYPPklC39MlS5K3ZBUms7vqVT+qzQtvdJz\nXym+GnhjIU+PEEIIIQqBJj1CCCGEKATNUt7iTBzO6PDusyx3mpc92F3Lrj+f3VJEfNYTw9H4eS5T\nzpjwixoy7GrO2x/3X6USls8q4O/i/vcVo/nYp0yZkmxjl3FdL5xaX7B7nzOegFQ2PPXUUzP3MXny\n5GizBAIAvXv3jjaPH+8O50VGWep67bXXknYPP/xwtLlKspdYWObeYIMNMo99xx13jPa9996bbOOK\nz03FFe+zCf/zn/9EmyVVX/H6ggsuiPaiRYui7TOvWI7038XnmKVQf25YZsuTnLLkrdpWn86Tnvn+\nwZl+nKEINJ2MIqbS8+GlYKa2kmFWv+RVeM6Dj6OpSIny9AghhBCiEGjSI4QQQohCoEmPEEIIIQpB\ns4zpYbIqTQLZcRZe7+RquqwT52mmReH999+Pttdk+fzkVS7mGAsfU9C9e/dos77uU8BZU+Z9+LgU\nPsa8FcWz4j78b+TYk7xYkeYS08NxMRyLAQAnn3xytLlSr4fj6Hxqe79+/aLNfTtmzJik3a677hpt\njkd59NFHk3Ycj8KVm3lFeCAd03lxGrw6u48L4jIJM2fOjLZfmby++eijj2Lsyfjx45NtBx54YLR9\ndWHmxBNPjDavnu6rzHM6PsdWAcBLL70UbY73GjRoUNLuzDPPjDaPRx8PyPdn7q/apmjn7YNjevg+\n5a+NLbfcslbf3RTIi6vx2/LuhUyl97G8fWRRaXX7+kaeHiGEEEIUAk16hBBCCFEImqW8xTIDu6i9\nyy3Lze0lDHbpcdXhpuKOa0zY5e/PW1aVZA9LCgMHDky29e3bt6LjYBc9p9nWBTvttFO0vXuXf79f\nVJTbNpUU5y+Dqx9vvvnmyTbuw7ySAfw5Lw9wX/P14q+d7bffPtq8QOj06dOTdlwZ+c0334z2iy++\nmLTjvsgrs8BVh70kwsfBCxk3NBtssEFMrecU+5pwyCGH1OUh1QovPYs1h6vH+7GXNway5CgfElJp\nteYs8u6DtS1NUNfI0yOEEEKIQqBJjxBCCCEKgSY9QgghhCgEzTKmh9PvWIP0Gn1WmXKfNs1aKKf6\nVbrCdlHwujDHuHBqt4dTa999993MdrzNa8scw8ExXXnxWXnHx5/jlaY5vgdI44d8vM+a6t+NAaeH\nb7LJJsk2jo/L09+zSvwDaUzcE088Ee1XX301abfvvvtGm8dtr169knbcb3/84x+j7VOvOdYhL06M\n+9PfB/g3c2zRUUcdlbk/IRoLn5bOr/29ujbxqby/vLICWaUIPE1lyQ95eoQQQghRCDTpEUIIIUQh\naJb6TZa84eWorOqSPq2OXX9s56W+FgVO2fbng92feZU8d95552iffvrpyTaWM/JS0VnmYHnFr4pe\nKZdeemm0R44cGW2/CjOvPM3VgQGgc+fO0a501eHGZtasWdHeYYcdMtstWLAg2t26dUu28RgZNWpU\nso3PCa+YzpV/gTQ9nis8e3mR5TiuGLz++usn7XgbX7MeluN8SjX3L6fUC9FUyFv5nO/BfnwwLEH5\nZyHfW9nOu7/zcfhj4v1rlXUhhBBCiAZEkx4hhBBCFIJmKW/NmDEj2uwa92RJDkuWLEle+0yQKrhC\na1HhbBx/rtnl2bFjx8x9sFuTFycEgJ///OfRPvXUU6Ptpcp58+ZFe9myZdUeH5BW7Z0/f360vTTF\n+/eSFsMLn/oKqPzdvlpzU4Uz5HhBSSD9PaecckrmPm666aZoX3LJJcm2559/Ptp8vvzimKNHj672\nmLjyNpDKbCxx7rPPPkm7ww8/PNrf/e53M4+d8RktLGlx5WohGpK8DCje9sknnyTb+F7l98HXel7G\nc55UxbBslfe93I7v242JPD1CCCGEKASa9AghhBCiEGjSI4QQQohC0CxjekaMGBHthx9+ONo+vuPE\nE0+s9vNXXHFF8vr222+PNsepHHnkkWt0nC2Brl27RpvjK4A0PZntPHxK8o033hhtjufw6fEcM8Rp\nzL7KJ+vVPXr0iPaBBx6YtOPVtvPISpX3350X09SUOPfcc6u1awKnw1522WVrfEzXXnvtGu+jNviY\nJiGaAnnV0LmK+Oabb55sW7FiRbR9KQ8uvZFXiiUrnd3HLGZ9l0+V33jjjaPtq7c3FvL0CCGEEKIQ\naNIjhBBCiEJgeelxqzU2exvA/C9tKOqSHiGEDl/erGaoLxsN9WfLQX3Zsqjz/lRfNhqZfVmjSY8Q\nQgghRHNF8pYQQgghCoEmPUIIIYQoBE160mNmm5nZ5PK/hWb2Br1uGku2ioTa9pmZ9TSz6RnbLjGz\nfTO2nWRmXdx7x5rZhWa2p5kNW7NfJGqLmW1uZreZ2Wwzm2lmD5lZ7xruo62ZnVFfxyhSyuNmhplN\nLY/ZXepgn0+a2eA1bSNWofts7WnSdXpCCO8AGAgAZvZLAB+EEH5Xtd3M1g4hfF79p+seM2sVQlj5\n5S2Ly5f1WS33eVF175tZKwAnAZgO4E3adCCAawAcCuADAM+syfeLmmOlYiP3ALgphHBM+b2BADoB\nmFWDXbUFcAaAEV/STqwhZjYUwCEAdgohfGpm7QHoj8smiO6ztadJe3qqw8xuNLPfm9kTAH5rZgPN\n7NnyXyb3mNmm5XbxLwcza29m88r2DmY2oTwjnmpm25TfP57e/1O5o2FmH5RnwOMBDG2UH93CyOoD\nAK3M7C/lvzQfMbPW5fY3mtlRZXuemV1kZmMBHAtgMIBbyvtqXX7YDgTwLoDTAZxT3ra7mfUws8fK\n3/mYmXWn/V9vZmPMbJaZHdLAp6QlsheAFSGE66veCCFMBjDWzK40s+lmNs3MjgYAM9uo3CfPl9+v\nWkH0NwC2KvfhlQ3+K4pFZwBLQgifAkAIYUkI4c3yeHuu3Gd/Lo+xqnvsb8tjeZaZ7V5+v3XZwzfV\nzG4H0LrqC8zsOjObWB7jv2qMH1kUdJ+tnmY36SnTG8C+IYTzAPwDwAUhhP4ApgG4+Es+ezqAP4QQ\nBqLUka+b2XYAjgbwlfL7KwEcV26/IYDpIYRdQghj6/yXFJPV+qD8/jYA/i+EsAOAZQCySmJ/EkLY\nLYTwTwATARwXQhgYQvgYwI4ApoQQ5gK4HsD/lreNAfBHAP8oXyu3oPRXShU9AQwH8FUA15tZWlpU\n1JS+ACZV8/7XUbpZDgCwL4ArzawzgE8AHBFC2AmlCdNV5RvrTwDMLvfh+Q1y5MXlEQDdyg+kEWY2\nvPz+H0MIQ0IIfVGawPDDau0Qws4Azsaqe+/3AXxUHme/BjCI2l8YQhgMoD+A4WbWvx5/T9HRfbYa\nmuuk584QwkozawOgbQhhdPn9mwDs8SWfHQfgZ2Z2AUq5/B8D2AelgfmcmU0uv+5Vbr8SwN11/QMK\nTnV9AABzy94AoPTA7Jnx+dsz3gdKLtdRGduGAri1bN8MYDfadkcI4YsQwisA5gDok/sLRG3ZDcC/\nQggrQwiLAIwGMASAAbjMzKYCeBRAV5SkMNFAhBA+QOk++D0AbwO43cxOArCXmY03s2kA9gawA31s\nZPl/Hq97APhneZ9TAUyl9t80s+cBvFDez/b18mMEoPtstTTXSc+HFbT5HKt+X5xNhhBuBXAYgI8B\nPGxme6N0w72pPFMdGELYNoTwy/JHPlEcz5phZkfYqiC7wRl9AAC8wMtKZMec5fX//ij9xVoJIcOu\n7rWoGTOQ/oVfRdbCQscB6ABgUPkv00WgcSsahvJk9MkQwsUAzkSpX0YAOCqE0A/AX5D2S9WY9eN1\ntfFjZlsC+DGAfcpegH9DfVxn6D5bGc110gMACCEsB7C0SksGcAJKfzkCwDysuukeVfUZM+sFYE4I\n4RoA96PkZn0MwFFm1rHcpp2ZrVqtUqwRIYR7aEI5MaMPasv7ADYGgLLnb+1ykF+yrcwzAI4p28cB\nYLnyG2a2lplthZKX7+U1OCYBPA5gPTP7btUbZjYEwFIAR5tZKzPrgJJXYAKANgAWhxBWmNleAKrG\nn+9DUU+Y2bYU9wGUZMiqcbDEzDYC3UtzeArl8AAz64tV43sTlB6ky82sE4CD6uK4RQndZyujSWdv\nVciJKGmDG6DkLju5/P7vANxhZiegdAOu4mgAx5vZCgALAVwSQnjXzH4O4BEzWwvACgA/gMqH1xer\n9QFKN8TacCNK/f8xgKtQkkaqeADAXeWg2B8COAvADWZ2Pkru+5Op7csoTZg7ATg9hPBJLY9HAAgh\nBDM7AsDVZvYTlGJ25qEU+7ERgCko/ZX3/0IIC83sFgAPmNlEAJMBvFTezztm9rSV0mxHKa6nXtkI\nwLVm1hYlT/mrKEldy1CKl5wH4LkK9nMdgL+XpcrJKE1qEUKYYmYvoOQFnAPg6To9euHRfbYatAyF\naDGY2V8B/DWE8GwNP3cjgAdDCHfVy4EJIUQLobnfZ1uCp0cIAEAI4dTGPgYhhGjJNPf7rDw9Qggh\nhCgEzTqQWQghhBCiUjTpEUIIIUQh0KRHCCGEEIVAkx4hhBBCFIIaZW+1b98+9OzZs54OJWXFihXR\nfu2115JtH330UbWfadWqVfJ63XVXLRD84YeriktutNFGSbsvvvjiS/cNAJtuumm0u3fvntmuLpk3\nbx6WLFmSVcW21jRkX4pVTJo0aUkIoUNd71f92fBobLYs6mNsNpW+XLBgQbRXrlxZrQ0An366qljz\n2muvmh7ws9R/rrREXolevXqhKZDXlzWa9PTs2RMTJ06sm6P6EhYuXBjtM844I9n2wgsvRJuzz3hS\nAgBdu3aN9vjx46O92267Je14ojN58uTMYzryyFXrso0YMSKzXV0yePDgetlvQ/alWIWZ1UvBS/Vn\nw6Ox2bKoj7HZVPrynHPOifZ7770X7XfffTdpN2/evGjz87Rbt25Juw8++CDa7Gy444471vhY64K8\nvpS8JYQQQohCUC/FCdn7wq4vz+LFi6N95ZVXJtv+9Kc/RZs9NkAqd3XqtGoh5s8//zxpt++++0b7\nwQcfjLafed9++6rFZGfMmBHtJUuWJO34c/369Yv2kCFDknZXXXVVtL33SQghhKhPli9fnrx+4403\nor3xxquWyWrTpk3SjuWpt956K9rLli1L2n3yyarVI559dlVhZg4jAYANN9ywBkfdMMjTI4QQQohC\noEmPEEIIIQqBJj1CCCGEKAR1FtNTaRwPR5HfdtttmZ9hbXGDDTZItrFOyKl0Pqbnsssui/ZDDz0U\n7f/+979Juw4dVmW28b5Z+wRWT4mv4t57701ejxkzJtp33nlnsm3gwIHV7kMIIYSoC3yZl0WLFkWb\nn6frrbde0m6dddaJNj/vuIQMkD6vO3bsGO2XXnopaTdo0KCaHHaDIE+PEEIIIQqBJj1CCCGEKAR1\nJm9lSVo/+9nPktejRo2KNldG9lIS47d16dIl2ptsskm027Ztm7Tbaqutov3OO+9Ee/jw4Um7uXPn\nRptT84YOHZq0421cnMmnpbNEdtZZZyXbbrnllmhzwSeWB4F8iVAIIYTIgp93QCpptWvXLtp+BQKu\nyMzP3aVLlybt+LnLYSVPP/100k7ylhBCCCFEI6FJjxBCCCEKQa3lLb9QGUd6v/nmm9G+5557kna8\nUCdXZPbyDi8C6l1rHC3+/vvvR/vjjz9O2rF8xu44n4XFUtIee+xR7fEB6QJsm2++ebR9tUqW3/hc\nAMB3vvOdaD/yyCPVHoMQQghRW/zzlKUq3saVlQFgrbVW+UH4Gezh5z8/j/3+miLy9AghhBCiEGjS\nI4QQQohCoEmPEEIIIQpBrWN6fPVjjpP561//uuoL1k6/gmNXOD1utQOjz3F6uP8cp4d7HfP555+P\nNsfg+JVgOc6I09c5XghIfzNXqOR4ISDVO306O++Tf4evjCmEEELUBr/KOj8b+dnln8GtW7eO9rx5\n86K9/vrrJ+14dXaOA/LP1qaIPD1CCCGEKASa9AghhBCiENRa3sqTY8aNGxdtXsAMSGUwloVYpgKA\nt99+O9rscgNSmYnT2X3lZnbBsVy22WabJe04NY8/w5UrgdSNt2DBgmj7VHn+zf48caXM3//+99H+\n6U9/CiGEEGJN8anj/jlchZe3XnzxxWi/9dZb0d5nn32SdvxM9qEuTR15eoQQQghRCDTpEUIIIUQh\nqLMFRydPnhxtdq35RUBZZpozZ060+/Tpk7RjCcq7z1jG4m2+SjRXZPZVmLPgfXhpijPPtthii2j7\nDDV2Jfpo9vbt20d7zJgx0Za8JYRoTPKq7NeGb37zm8lrllK+8Y1vRHvgwIFJu549e0abZZS84+Fw\nAyB9Hs2aNSva5513Xu4xtxT8ueIQDu6Hl156KWk3ZMiQaA8YMCDaPpM5K7xlTa+ZhkCeHiGEEEIU\nAk16hBBCCFEINOkRQgghRCGos5iekSNHRpt12HXXXTdpx3pit27dor3BBhsk7T777LPMfbBuyDE9\nPi2Pq1L6/TNcoZIrV3700UdJO94/f6/XMXmbr8jMv4u/V9SORx99NHm92267RZuriPpq3bVZ1b6u\nYx6aK3wu/Xnlsg5ZnwHS88/xBlmfB4DHHnss2sOGDUu28T3Hf1fWcfhrgLflHUdLJe8387Wfd93z\nPd2X/Nhyyy2jfdVVV2Xug/uBx7AvDcLPBZ96zXGWHO8zePDgpN3w4cMzj6M54/vovffeizY/d7jq\nMpDGPPG4/POf/5y0y4q5zVuZvalQvJEthBBCiEKiSY8QQgghCkGdyVuvv/56tDll3S9UxpITuzvf\neOONpB27OP2Cnm+++Wa0OS3dt/Pu1axj4qrO7Mb1n2e5i92HXvbg174yJi+eumzZsmj7lEBfXbpo\n3HzzzclrTu/nat1PP/100u6mm26K9kEHHRTt2shZnjy3fp5s0tLg35cnEeXJTPw5llWeeOKJpN3l\nl18ebR5XXt669NJLo80u9kr7zB8HV0v/4Q9/mLTLqm7b3PF9WamkdeSRR0b7iCOOiLYPKXjooYei\nzfc7fz/mEABm8eLFyWs+Ji/NsdzJco5fiLOlkidVsizor2VeGYFT1nk8ANnPP/8MborI0yOEEEKI\nQqBJjxBCCCEKgSY9QgghhCgEdRbTw7BOyPEyQBonw7our24O5Kexsu7IqYq8gjmQXcLcp4pzOl+n\nTp2i/e677ybt+BhZX/a6aOfOnaPt0yx5+Qr+HS+//HLSzqdWFo0RI0YkrxcuXBjtnXfeOdrXXntt\n0o5jgaZMmRLtM844I2lXG+3Zl7rneBOO1frHP/5R4303V/LidvLiCsaPHx/tY445Jtq+xAP3U8eO\nHaPty+czefEnefFI99xzT7R/9atfVfu9AHD88cdn7r8lkXUeTznllOT1j3/842gPHTo0c39nnXVW\ntKdNmxbt6dOnJ+14G8c9brfddkk7jgXyMZD7779/tDlObOrUqUm7ww47LPN4mzO+zAvDzytflqVr\n167R5vPtU9H5mckp63mlYZoK8vQIIYQQohBo0iOEEEKIQlBn8tYrr7wS7Q4dOkTbSz+cbs4rjvuK\nmiwLeTc0uzVZIvLp4ZyeyMfBshcA9OjRI9qcLumPnVOls1JuAeCtt96K9rbbbpts49/M1TCXLFmC\nhqZKmvAp9wy7uL2Uwa9rW534ueeei/Z1110Xbb/yMq9Wv2jRomj764b3ceCBB0b74YcfTtqxbLL3\n3ntH27vJ77vvvmj7fma5k68Hn3a91157oaWSl57PKcpnn312Zru8Cu58H+BrbObMmUm7v//979E+\n+eSTo81ueCC93nw/scS8ww47RNvLlS1J3qq01MK5554b7VGjRiXbbrjhhoq+q0uXLtXaBxxwQEWf\nry18n73llluSbT//+c/r9bsbCy9H8bMxr4Jyv379qt2ff2by5/gakrwlhBBCCNFE0KRHCCGEEIWg\nzuQtlpI4e2m99dZL2nH2EmdH+cyrPDcZSxrskvWucZYqeH9eBstaZJTdgH4f/L1eEtl8882j7X8X\ny3Es0zVGpdCq38CL81XSvqY888wz0X7yySeTbRdeeGG0d9lll2gfe+yxSbsXX3wx2twPPnuL5ZBt\nttkm2iwrAunCiCxh+aqvvICpzzDkfmf57YEHHkjaNZS8VeVyzsuaqs1inDXp92uuuSbaLGn17ds3\nacfjgLM4/TjgbEqWn/jzAPCXv/wl2rw44uzZs5N2fO/wEiqPW76XcKYZsCqTsDEWDGYpmu286tJ5\nFaTz+vbiiy+ONmfLcRX8So+1umOswkvjlV5vvADtP//5z2TbbbfdFu22bdtG22cH+gzdlkJen7M0\n5eUtPleMv/dx3/K4aQ4L9Tb9IxRCCCGEqAM06RFCCCFEIdCkRwghhBCFoM5ieljnZe2WYx2AND18\nzz33jDZr8kAa+9OzZ89kG1e/5f37+CHWivkzfvV03sfWW28dbR/rwvt47bXXoj1kyJCkHafsjxw5\nMtm22WabRZtji+bOnYvGwlcp5bgH/m0+PoljHTjlnlPKgbS68oknnphs+/Wvf13RPlgr5jgPn0rJ\nq6xnVdD2r88888xoz5gxI2nHMSH+uuHjZZuvoYakatx5nT4rlsLr71l6/OTJk5PXvOIyjwMgXfX+\nK1/5SrT9fYDHJvfnnDlzknZvvPFGtDlOIe83cvyWrx7Mx+urxfM44Hg7H7tTdY342MCGgM9bbctE\nMHy+f/rTnybbuPQGV+rl6uhAOgZ5m48DqQ1cYoBXcwfS+/N+++2XbOM0eB7DPl4or7J3c8Zfs/7Z\nWIVf4T4LHy/r412ryIsZbCrI0yOEEEKIQqBJjxBCCCEKQa3lLe8+Y/cXu6F9Ciq7lLkSrk+xy1sg\nlBeh49RV765mNze7Qlmm8vA2/728D3bv+UXbWAbJSx3kY/cyTUNQ5Yr85S9/mbzPv5tTPH3VaD5m\nXjTSSx68+B+nJwNpijmnQXqZiWVBvja8m3WrrbaKNksjAwYMSNrxMZ533nnR9um4vXr1iraXyNg1\nzOn2F1xwARqDNZU7uPLw//3f/0Xby148/rx7fPfdd6/2ePyYY/nSy4ZZ8LXiF/LlMhks2YwbNy5p\nx9/lU9aHDRsWbV4E098Hqhae5ertDQXf43iM+GPhcTtmzJhoz5o1K2nHla1ZwgKAQYMGRXv+/PnR\n9mOJ98kpz6eddlrS7vrrr0cl8Lnne4e/1vj+eddddyXbuJwEV1/nqvrA6lJdS8FfDyzF8rjMW5iU\n8aEeWWUFalvWpCGRp0cIIYQQhUCTHiGEEEIUglrLW17q4Eh9jhT37rOOHTtGm12wXgZi2cJXzeTs\nIv5edmMCqUuP3XM+84P3wcfrM074d7Fs5/fHv4UzIIA0E41/f0NXBv3www/x7LPPAlg9C4VdwFmL\nsQKpxPDoo49W+3kA6NOnT7TvvffeZBtnZLBb22cBcBbZHXfcEe2DDjooaff6669Hu0qGAFa/Xnkh\nysMPPzzaO+20U9KOswp9NdcJEyZEmzPU+BiAhpcuWX4C0sVW+Xc//vjjSTuucMtjYrvttkvacTVr\nf15ZxuJrx8tbfM2xZOMrLbMMxlle/jdy9iOPYT5Wf0z+3sTngyUtL6FWSTh1kT31ZfhKw7/73e+i\nzXKcrwrPC0Dz8bNcCwA/+MEPos2LrALpdcP78+eNsxX5/n7PPfck7XhRWF7o08vBBx98cLRZLvHV\ngvn+wxmzQHqvZlnMP2f4HLYkvCTN547PW6UZdv76yqpG7p+FTRF5eoQQQghRCDTpEUIIIUQh0KRH\nCCGEEIWg1jE9Pt6FNT6282JVOAXYp33npXNz7A7rk/67WNfl/XH6M5Ad++NhnZT351cT5t/iYxRY\nX+Z4iLw0+vpgxYoVMd6od+/eyTbWb1mj5dXNgbT/OO6jX79+SbvRo0dHm1dLB9JzxXEaW265ZdKO\n42541XZ/3jje4NBDD402pzQDwLbbbhttjknwmj9fK/PmzUu27bvvvtHmlFmOJQKAr33ta2hIzj33\n3OQ1p3Dzaue+wjZXUOZ0bp8qzOnLXtvn11yewMdccKwOjx+OOfL74+vNlxbgccvf68cmx/74uCCO\n0eEx4O8JVXExWVVu65IHH3wwec33IP4tebFLHLfi4+043dyns3M1ZL5W/L2fzyPH4vkYOE6Jv//+\n+6N99913J+34vPp7NcPXpU+V5uuGYxH9sfuYy5aCj4nk88Pnw98Xs+CV1IH0HpD1PU0VeXqEEEII\nUQg06RFCCCFEIai1vOVT09jVyJKOd5+xm9unmDPsXvYSRl6VY4ZdwezS45R3ILv6s0/7433w8fnU\n1e7du0ebKwT7fXLFaO8Krm/atGkTU0OvuOKKZBunnbKk4yvnbr/99tFmaZEX+PPtfBorXyvshvdp\nwlXp9UAqn40aNSppx25dvtb8sXNFZk7H9bIJu7/5dwBpGii7e31F6kmTJqG++eCDD2LlXV+CIKtK\ntZc6ONWefw9fp0DaZ/665bGVJ//4FNisz/B4YdunTXO/s4zi3e18bnhRY78PbueryleN70qr2daU\nRYsW4eqrrwYAvPzyy8k2TjHmMffmm28m7fheNXbs2GjnLdTq77P8mu/vfh8cfsC2L3HB55H70kuk\nec8FhmVHX6GbZU0+Z/66a4xFYxsC/4xjuF+8bJWFLxfA90y+hrTgqBBCCCFEE0GTHiGEEEIUglrL\nW3lVZnmbz17ibewy9XIGZ2B41yW759h16eUodpuy29y7U9kVyu5w7/pklyG7lv3xsazi3YxZv79S\nya6uMLP4/SwdAcCNN94Yba40zC5NID2nLGl59/cDDzwQbZ9lwpIRy1ZcnRlIq8Nyv3j5gqXGLl26\nRNtnwTDf//73o+2lKK66PHz48GQbZ7OxDOGzfX7yk59kfndd8dFHH2Hy5MkAVj93fB74fHkpj19z\nlovPqGLZw2dAcTYe90VehVje5u8DPDb5uvJjk38Xt/PXB2/z35WVeeLl6yr5pb4qMm+yySYxM7Cq\nT6vgDCs+fn9+s+6tfmzyb/bXLd9b+f7kzxPf4/j4vLQ/dOjQaHN24Le+9a2k3fe+971o833FS598\nH/fHzsfLtpe3/CKrLQX/jOM+Y7vSxX7zVkzIen42VeTpEUIIIUQh0KRHCCGEEIVAkx4hhBBCFIJa\nx/R4jY/jG3gbx+YA2Rqyr7zJ23h1ZSC7cqqPn2GdOys2AEjjEPg4fPwJHzu387ERrKf7dFfWWrld\n1qq19UlVeqHXYU866aRqbQ9XhL3hhhui7VNV+RrwsRgcD8WVob3WPnjw4Ggfe+yx0fZVonn/HJfi\nqylzZWjuZx+Dtt9++0Xb/y5OZz/ttNOqtYHKVzJeEzp27Igf/vCHAFYfc1z9llPRfTwOX488dvx4\n4XHmY0l43PKY8zEXWanoHh7rla7g7KswZ+HvYXyMHLPg7z95x1sXtG7dOlbO5vg6IK2uzeUfOPYM\nAGbOnBltrvyeFzvpz0dWTJavns/j8emnn452XjXlPLha98KFC6Pt+9/HmjF8vP3794+2j3fje1hR\n4HOTF5vLVBqrk7eiQVNBnh4hhBBCFAJNeoQQQghRCGrti/KVFzmFm1Nk/WKF7FpjdypXAQbStO+8\ntFOWM/KkDsa7v7niLP8udrMCqSuQ3Xg+lZJdyF7eYqmD5ZHGkLfWNL2QFwW99NJL1/Rw6hVeaBMA\nDjnkkGrbHXPMMQ1xOPXKz372s8zXLCdOnTo1acfjhW1fkoGvG3/t8xjkMeLd6FkLevo0cL5f8Pf6\ndlmVYPMqEHtZmisBN9XU26rFTgHgvPPOq/HnWeoC0urh/jfzvYrPja/QXddcfPHF0eaFTnfdddek\nHV97fpFflta4nb+W/X2hpZB3/eaVfqiULAnZy6dNEXl6hBBCCFEINOkRQgghRCGotbyVt5BoXqVM\nlnS4KqvPvOLMG++qy3JXelc2Hwdn5fgMHXahsuSW55LnffsFNvlzLAH5Y+Tvqm2mgxA1ga/NXXbZ\npRGPRDQGXpqqb6mqNmRJz3n07Nmz7g+kGeOfhVmZzP5ZmEWlGYv1tQhvXSJPjxBCCCEKgSY9Qggh\nhCgEmvQIIYQQohDUWflEruzJFUx9XAynsOelEnI6qd/GuiHvw6et8ndz9VmvO3J8DqfC+sqjDMcc\ncRopkMYq+ZR1XqGa0/t8tV8hhBCiNvhUdI7j4edkpRWZfTmYLPwq9k0ReXqEEEIIUQg06RFCCCFE\nIagzeYtlK04/z1uAjNMMWS4C0mrI3bt3T7Zx2junffsqkbyNj8MvPsmuO19hluG0fF7Qb9ttt03a\nsfvQL+qYteBqVkVZIYQQoib4iuX8bORQCg7FyMM/F/nZyuEhlS4K3JjI0yOEEEKIQqBJjxBCCCEK\ngSY9QgghhCgEdRbT061bt2i/+eab0fZlzjm1nfHLWmRphkCaZsfapU+/43Rx3uZXgmW9k7/Lt+Nj\n55R6jvUBUp3Ua6ucSsj654477gghhBBiTfFLN/EzjpeU6NixY0X78/Gy/Ozi2FQtQyGEEEII0UTQ\npEcIIYQQhaDO5K2DDjoo2jfccENmu6zVxLt27Zq8ZpnJS2K8jdPv/Ert7Grjbb66JEtQnDruXXUs\nTb333nvR3mmnnZAFy35AmvrHbsbtt98+cx9CCCFEpfjV07PSyrt06VLR/nzoCO+Pn62bbLJJjY6z\nMZCnRwghhBCFQJMeIYQQQhSCOpO3+vXrF22Wi959992kHWc9MQMGDEheP/DAA9H2mV0Mu/H84p7s\nduPv9e3YVceZWL4KJWdz8T622GKLzOPjStX+mHh/lVbGFEIIIfLwlZF5xQPOtqp0gVD/LOTFuDnE\nxC8O3hSRp0cIIYQQhUCTHiGEEEIUAk16hBBCCFEI6iymZ9NNN412nz59ou1jerIqD++xxx7J62ef\nfTbarB8CQLt27aLNad+dOnVK2nHKOVeU9CnrnKbOKfW80rvfX9++fTO/lxk0aFDyevHixdV+lz8m\nIYQQojb4Uim9e/eO9tKlS6O91VZbVbS/YcOGJa+feeaZaC9cuDDaW2+9dY2OszGQp0cIIYQQhUCT\nHiGEEEIUAmNp50sbm70NYH79HY6ohh4hhA5f3qxmqC8bDfVny0F92bKo8/5UXzYamX1Zo0mPEEII\nIURzRfKWEEIIIQqBJj1CCCGEKASNPukxs83MbHL530Ize4Ner5vzuZ5mNj1j2yVmtm/GtpPMrIt7\n71gzu9DM9jSzYdV9TtQt5fM9w8ymlvt6lzrY55NmNnhN24gvpz76j/a9p5k9WFf7E5Wjcdn8qe0z\ntSjUWZ2e2hJCeAfAQAAws18C+CCE8Ls13OdF1b1vZq0AnARgOoA3adOBAK4BcCiADwA8s9qHRZ1h\nZkMBHAJgpxDCp2bWHkDhB2NzoSn3n5mtHUL4/MtbCk9T7ldROV/2TG3oMWJmrUIIK7+8ZcPQ6J6e\nSjCzHcxsQnmmOtXMtilvamVmfyn/ZfKImbUut7/RzI4q2/PM7CIzGwvgWACDAdxS3ldrMzOULpB3\nAZwO4Jzytt3NrIeZPVb+zsfMrDvt/3ozG2Nms8zskAY+Jc2dzgCWhBA+BYAQwpIQwpvlfnrOzKab\n2Z/LfVP1V+Bvy9fALDPbvfx+azO7rdw/twNoXfUFZnadmU0sXxu/aowf2YLJ6r95ZvYrM3vezKaZ\nWR8AMLMNzeyGct++YGaHl9/vWR5Dz5f/reZlNbMh5c/0MrNBZjbazCaZ2cNm1rnc5kkzu8zMRgP4\nUcOdhhaHxmULpfzM+r2ZPQHgt2Y20MyeLffRPWa2abld9LiZWXszm1e2q30Gm9nx9P6frORYgJl9\nYCXFZTyAoY3yozNoFpMelCYjfwghDERp0vJ6+f1tAPxfCGEHAMsAHJnx+U9CCLuFEP4JYCKA40II\nA0MIHwPYEcCUEMJcANcD+N/ytjEA/gjgHyGE/gBuQckbVEVPAMMBfBXA9WaWLqcu8ngEQLfyjXKE\nmQ0vv//HEMKQEEJflG6UPJlcO4SwM4CzAVxcfu/7AD4q98+vAXD56wtDCIMB9Acw3Mz61+PvKRpZ\n/QeUHpo7AbgOwI/L710I4PEQwhAAewG40sw2BLAYwH7l9kcjHV8oT4KuB3A4gAUArgVwVAhhEIAb\nUOrzKtqGEIaHEK6q6x9bIDQuWza9AewbQjgPwD8AXFDuo2lY1XdZrPYMNrPtUBq3Xym/vxLAceX2\nGwKYHkLYJYQwts5/yRrQ6PJWhYwDcKGZbQFgZAjhlfIfG3NDCJPLbSahNBGpjttz9n0ggFEZ24YC\n+HrZvhnAFbTtjhDCFwBeMbM5APoAmAzxpYQQPjCzQQB2R+kheLuZ/QTA+2b2/wBsAKAdgBkAHih/\nbGT5f+7nPVB+UIYQpprZVPqab5rZ91C6xjsD2B4Abxe1JKf/gLSfqsbO/gAOM7OqSdD6ALqjJDH/\n0cwGonTDXFUrH9gOwJ8B7F/2NvQF0BfAf8tjvxWAt6h93hgXFaBx2eK5M4Sw0szaoPRHwujy+zcB\nuPNLPlvdM3gflCa0z5XHZGuU/pABSuP57jr/BXVAk5z0mNkRWDXzPDWEcGvZTfZVAA+b2akA5gD4\nlD62EuRGdXyY83X7I9tD5AkZdnWvRQ5ljfdJAE+a2TQAp6H019/gEMICK2nR7D2r6uuVSK/b1c67\nmW2JkpdhSAhhqZnd6PYl1pBq+u/E8qbq+skAHBlCeJn3Ue7jRQAGoOR1/oQ2v4VSn+2I0uTIAMwI\nIWS5yvPGuKgQjcsWTSVj5HOsUoBi32Q8gw3ATSGEn1azn0+aUhwP0yTlrRDCPWWJaWAIYaKZ9QIw\nJ4RwDYD7URqEteV9ABsDQHnGu3Y58CvZVuYZAMeU7eMAsJvuG2a2lpltBaAXgOSGLrIxs21tVVwW\nUIqpqjp/S8xsIwBHVbCrp1B2p5Y9AVXXxSYoDfDlZtYJwEF1cdyiREb/5VWdfRjADykWpGrV4TYA\n3ip7TE9AyXtTxTKUbrCXmdmeKF0fHawUbAszW8fMdljjHyMiGpfFIISwHMDSqhgslMZelddnHlbJ\nkbGvM57BjwE4ysw6ltu0M7Me9f8L1owm6emphqMBHG9mKwAsBHAJSgOoNtyIUgzOxwCuAvAobXsA\nwF1WCrT8IYCzANxgZucDeBvAydT2ZZQulE4ATg8h8F+pIp+NAFxrZm1R+sviVQDfQ+lBNw2lgfdc\nBfu5DsDfy+7zyQAmAEAIYYqZvYCSG34OgKfr9OhFVv9lBfRfCuBqAFPLE5955bYjANxtZt8A8ATc\nX6IhhEVmdihK8vMpKN2Er6n6Y6W8zxl1+LuKjsZlcTgRpefgBij1RdWz7XcA7jCzEwA8Tu1XewaH\nEN41s58DeMTM1gKwAsAP0MSX3Sj0MhRm9lcAfw0hPFvDz90I4MEQwl31cmBCCCGEqHOai6enXggh\nnNrYxyCEEEKIhqHQnh4hhBBCFIcmGcgshBBCCFHXaNIjhBBCiEKgSY8QQgghCoEmPUIIIYQoBJr0\nCCGEEKIQaNIjhBBCiELw/wHZYWLe84IZ6QAAAABJRU5ErkJggg==\n",
+            "image/png": "iVBORw0KGgoAAAANSUhEUgAAAj0AAAI8CAYAAAAazRqkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90\nbGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsT\nAAALEwEAmpwYAACi6klEQVR4nO2deZwU1dX+nyNuIAoii4AsIiIiq+CCUXHf4hKjiRo1LtFojDEu\n8WcSEzX6RqPGvIn6omYxGqNxxTUSjRuiIsi+KiqLiAKOAu57/f7onstzD1NFz9Az0zP1fD8fPpzu\nul1dXbdu1Z3znHOuJUkCIYQQQojmzjqNfQBCCCGEEA2BJj1CCCGEyAWa9AghhBAiF2jSI4QQQohc\noEmPEEIIIXKBJj1CCCGEyAXr1qZx+/btk549e9bToYiaWLBgAaqqqqzc+62Uvvz666+D/dFHHwV7\n4403rtP+Pv7442Cvs86qOf2GG25Yp/2Vm0mTJlUlSdKh3PutlP784IMPgr106dJgt2rVKmr3xRdf\nBHuDDTYINl8PAPDVV1/V+D2ff/559Hqrrbaq/cGuJc1tbH755ZfR63feeSfYLVq0CDaPKw+3y4JL\npay7bvwY4rFvVvbTm0p9jM1KGZcM32f9ePOv0+B26623XrBbt269lkdXHrL6slaTnp49e2LixInl\nOSpREsOGDauX/VZKX/JDcsKECcHee++967S/yZMnB5sHYJ8+feq0v3JjZgvrY78N2Z/8wPIPpSef\nfDLY1157bbAHDx4ctVuyZEmwe/fuHewPP/wward8+fJg88Nx/vz5Ubv777+/lEMvK81tbPIkBwBu\nuummYLdt2zbYLVu2TN1HmzZtgu2vDZ7A8qS1Y8eOUbs99tgj2Ouvv372QZeR+hibpfYlTyL8pDKt\nll5dJ4Tjxo0LNv+RCMT9kvYHBwB89tlnwe7QYdXcYvfdd6/TMZWbrL6UvCWEEEKIXFArT48QpfDp\np59Gr//4xz8G+1//+le0jf+S5780/V+T3C4LlrHY9i50/ovk1FNPDfYBBxxQ0vfkmSxPz8UXXxzs\n559/PtgPPfRQ6v422WSTYPu/PFly4Wvik08+ido98sgjwT744INTv0ukc88990Sv/+d//ifYm266\nabA7d+4ctWOvW9euXYPtvatz5swJNo/NffbZJ2rHsujxxx9f0rE3dXgcZa2SkOXdYa/5U089FW1j\nD/jo0aODvc0226Tun72u7777btRus802Czbf73/7299G7Q455JBgH3roocHu3r17yq+of+TpEUII\nIUQu0KRHCCGEELlAkx4hhBBC5ALF9IiycMEFFwT7z3/+c7Tt/fffD7ZPXeY4DY4b8DEbG220UbA5\nq4DTnf3+WBvnbAMA+Pe//x1sjjcZPnx41O7ZZ5+FiMlKWZ42bVqwuT85wwOI02a5P9u1axe143RY\n7s/XXnstavfyyy8HWzE9dcNnb3GqdVafb7755sHmvvRxICtXrgw2x3EtXrw4ate3b9/SDrgZkRXT\nkxbH4++zr7zySrB96jmf06OOOirYU6dOjdrx/ZTj6XzsD5cV4Huzv4YWLlyVRHXOOefU+BkA+N3v\nfhfsLl26oD6Rp0cIIYQQuUCTHiGEEELkAslbos6we/Wqq64KNru7gdiV6V217MrlKr2+gjK/5n14\nt7uvKpu2Py5cyFVkOc0aiFMuH3744Rr3LVbBaa7t27cPNkucQOx+z6rIzPvzUiazaNGi2h+siPBy\nFEuSr7/+erC9BMmp0ix7rFixImrHY52vBz+GBwwYUIujbh5klYFgRo4cGez33nsv2rblllsGm2Vh\nIJYduRjkiBEjonajRo0KNt/HfZFI7jPuL06HB4Ctt9462Fy4kmUvAPjVr34V7Jtvvhn1iTw9Qggh\nhMgFmvQIIYQQIhdI3hJ15te//nWwORsja80dXnPJw+v7eDmKKyqz5OGrP3OlUP5eX5GZs7nYtdyp\nU6eoHWdvVVVVRdtYvskrXD3Xw+c8y2XPkqR3y7P0yPvg6w0Ali1btuaDFZn06NEjes2ZeNwPflFR\nlq9ZBvFrN7FcwhXWszKN8kKWvMXSLdu9evWK2vl16xjuIx6zfqFefv3qq68G20uaO+20U7D5Hukz\nr/j+zNXWfcV9fi7cdttt0Tauyl2qDJiFPD1CCCGEyAWa9AghhBAiF2jSI4QQQohcoJgeUWe4wiqn\nE/uKoqzX/uhHP4q2nXbaacHefvvtg+0rdr755pvB5rRYH4fAejUfE38eiFeD5nacfgvElaHnzZsX\nbVNMDzBz5szUbRzf4Stsc1wIx/74+A6+ltLS3IHV461E7fExEpyKzOPRj29OZ8+K1fGrrlfj40p8\n/F0eyKp4zdXHedz48hxchsNXoOf4Km7nywoceOCBwX7uueeC7WNw+LvZ9uVKuPI631s///zzqB2P\n5ylTpkTbOKanrnE8jDw9QgghhMgFmvQIIYQQIhfkz48oyga7UDnF3Lu/mSuuuCJ6zVU62R3O6Y0A\nsMceewT76aefTt3/tttuG2xehNJXBP7Tn/4UbE699wtjsluY3b0AsOOOO6YeR17gtGYglrT4mvD9\nyamsLJNyyQEgfSFG7773cqioPV5i6datW7D79esXbC8x3HPPPcHmKsGzZs2K2u2+++7BHjp0aLBZ\nagZi6cMvUJxH+DzymPJjgMeHP298b2WJzN8XO3fuHOz99tuvxs/41717967xGIA4tIFlMF9qhJkw\nYULqtnIgT48QQgghcoEmPUIIIYTIBZK3agFLHewKzooo9y5IjlLnipe8MFul4iPuGT4H/jcz3//+\n96PXDz74YI3tOAsEiCWtiy66KNi+Mu+dd94ZbHa1+wXujjrqqGCzvOWryHImydSpU2s81jzz0ksv\nRa95XLCk5TNyWNLirD1/jjfddNNg89jxchlLMaJusDQMAE8++WSN23zm3HbbbRdslnx/+MMfRu26\nd+8e7C222CLY3MfA6plCeYczT/l+l3Wf9ZXlebxkVUBnKY2z9/z9mCsvv/XWW8H22WCcTcuZXV76\n5MVSvcTNzx2/8GldkKdHCCGEELlAkx4hhBBC5AJNeoQQQgiRC5p1TA+nz7HtUzMXL14c7HHjxgWb\nq1MCdUuL9fo3M2rUqGBfcMEFtd53Q8ParYfPqa++y/jKyGlwGqyHK3R6/Z9jcgYNGhTst99+O2rH\nVUlLhWOwRIE5c+ZErzlGgK8JvwI0p8a++OKLwfbxcZxqy7avRutXgRa1x8dJ8f2OU499DA7D/eJj\nTrj/OPXax3txOnPW/bO5wnEwHh5HPn5m4MCBwfaxOj5WsRqfis7nm/fv4zn5efrFF18E299neX+8\nD3/sjK/kPX369GAPGzYs9XOlIk+PEEIIIXKBJj1CCCGEyAXNWt5ishZ0Gzt2bLDHjx8fbC/nnHXW\nWbX+3mXLlkWvH3vssWDzwplNgXfeeaekdl56YFerP6felVnNiBEjUve///77B3v+/PnRNpY5Ro8e\nHWyu6AzE0hdLXf542P3LLn5RgFPPgfh8Zclb3/72t0vaP19LWdV5s8opiNLw8j3LXdyXfgxzHw0e\nPDjYXqpk2Zv7y0svXprJG35hY74/sfTHi3kC8fnmch1ALEFlVUNOq9zs+5Kfa7zN75u/l68nH17A\nUqiXO/keL3lLCCGEEKJENOkRQgghRC7QpEcIIYQQuaBZx/SwVsw6oS+dz2m3XL7bpygffvjhwea0\nTa9j9ujRI9jvvvtutI1XtfWrC1c6nNrvyVpZnWMxfFwMxwrwPl555ZWoHaf0e82bSVtl/Y033oja\njRw5MticMu3TcTnlMuv35xWfXltqWYdjjjmmxvd9ijLHJrRv3z51fz7dWtQeX/6Bx2ZWiQfeNmTI\nkNR23Ef8Xb7P8x7Ts2jRoug1p/enxUAC8VI7PXv2jLbx8g38XPTxlxxnyv3gl/Hh4+BnKx+r/y6+\n9/v7BH+X73//LFhb5OkRQgghRC7QpEcIIYQQuaBZyVve9cduN07vu/fee6N27F5lqeqDDz6I2qVV\nePbSDq8gy6sJA7F8klYls1LJSlnn9EbvMuXX3k3+y1/+ssZ2jz/+eNRu2rRpwebzy3IhEEtaLInx\nqupA+orp/hridExOvxQFfPVtdo9nXd977rlnje8PHz48es0V0v11xfiVmUXt8dV5WWbgceDTl9Ok\nLy+X8fhhecN/r3+dN3xJAJYZs1ZZ5+eVL9HCYyfrXs2f4/37+yKPc16B3ctbfH/gY/fPkjZt2qR+\nF9/7y4E8PUIIIYTIBZr0CCGEECIXNKq8xbJQ1kKDfhu/Zhd6llv0xhtvDDZnaAGxS46j1H1WFn+O\n3YL++Nh16zMTuIItuw99dc26LG5a3/jF5Ji0LCwgPlfsxgSAK664osb9+XZ87mfPnp16HJtvvnmw\nq6qqgu3drml4ScZXB01rm3eXfE2wnOHPY9pCkj7r5Lnnngt2Voagv15E7fHZcXxf4/uxr36dNrZ8\nxXnuP/6Mz2LNqp6fB3z1cs684vAIn1F12GGHpe6D+5JlSy+R8eus8ZtW4dk/C7mf+/btG+wHH3ww\nasd97rO3shawrgv5vrqEEEIIkRs06RFCCCFELtCkRwghhBC5oN5jerwOn5X6yGTpuqXGUvzrX/8K\nNleD9FVDOeZkxYoVweYVu4E4LZbjRbx+mpVay+eDK5T66s+8WnGlUOoq66xBA8Bee+0VbF7RHohT\n+rkvvdbMfZ5VHZbPPccB+f3xPtq2bRtsn8rurwFmwYIFwd5qq61S2+UJHtMc+1Hq+fElHrjfs+4X\nYu3p3Llz9Jr7L+2+BaSXcvD3QY5T5PTlrCrDecTHknLqf1aMYb9+/YLt77Np90z/nOXnH8cP+XYc\nd8PHlBV316dPn2D7OB3+XFYcbDmQp0cIIYQQuUCTHiGEEELkgnqXt7Jc0uzW9C5Oljr8PtIkrZtv\nvjl6PXfu3GB369Yt2H4RUHatsdvNp1JyxUs+Jl5QE4jdk1nyHvPYY49FrytR3mLXp4fPjT9vJ554\nYrBHjx4dbfPnrhp/PZTqAufzy+51L2+xS/bb3/52sNMqNdcES5yStwqkVUHfbrvtSvr8QQcdFL2+\n6qqrgi0ZpH7xY5Ffszzi+4EXhWV82Q0egyxhqJp2fK/yciFLvPw88andXbp0qbGdh+VJL5fxmOV+\nySob48MZ0o69d+/eNR6Db+d/P4ePsJ0V5pCFPD1CCCGEyAWa9AghhBAiF5RN3kpzPXu3GLvdOCK8\nNlU4eUG2UaNGBdtHhG+99dbBZreYlzpY7kpbZA9Y3SVXjT92dt36bezy5f0///zzNe67kvCyIMPn\nvmPHjtE2zgLw8PnOqnJd6vWRVq3b74+vgZ122il1f/y9vvKs5JbVSXPF9+rVq6TPDxo0KHrNGURZ\nWZGVWMG8qeHDBvic8rXu+6FDhw417o/vv0B8j0ir6JtXWCrPCong8eXlLe4X30csY/GY8hIRS5rc\nR/7Zyvd4vm78sfM2lt+y7ud+oVr+/ZyFzXJZbZCnRwghhBC5QJMeIYQQQuQCTXqEEEIIkQtqHdNT\nrSl6/bcuMReMr/bL1W5feeWVaBuv9s3pclzlE4hTrN9///1g+5Q4ju/g38XHAMQ6KVfx9Sl7aXEN\nQKxXZlUZnjlzJoDyrzC7NviUdY5xYV3ex1fMmTMndZ+sNadVdgVKr8bL5zur+jf/llLLKvi+ZB0+\nr/gKypzyyvcE1vOzyKo4q5iehoXPN6el+35Ii9njCsEAsGjRomDz/djHcOQRvh/588v3WW7XvXv3\nqB2vas/jEIir0/P+s0rF8D3dP4e4XdY9nEuZcFwtH4/fHz8Xgfi+u2zZsmArpkcIIYQQIgNNeoQQ\nQgiRC2otb6VVQ166dGmwFy5cGGzvZuPX7DKbP39+1C6raiS78dg95xcm4/3zPnzqObtXOd2cU/uA\neEE+ds/6/bG71y9Gym5ilrQ4FY/beVdfY1JqivY222wTvX799ddT27K0xPvPKnWQRVpFZr+IHe/P\np9gzWfJWqQuwNmf8uZs3b16w+fxzdfQssqq7ZklfaeUkRN3hEhWcfu6rqp922mk1fn777bePXk+Y\nMCHYXLVdpR/i+5YPdeB7F4d69O3bN2rHn8saK1nVj/k4+Hu9BMnSV1ZpEH7esQQ9YMCAqB3LYF4u\n5X3652ldkKdHCCGEELlAkx4hhBBC5II6V2R+4oknotdcJZlda14CYFdYWgQ4EEtY3qXFUhBLDr7S\nMrvJ2IXq98fHxC4472bkjK1SpQ3vquOMFpbfvJSW5Z5sLLwrNO0Yvbw1ZsyY1H2mZeR4KYn7LytT\nkD/HdposC8QZSD4bKStDqxyu1qbOjjvuGL3mTD12j9dmIdc0/PhmvHwp1h4et6+99lqwvbx12223\n1fj5/v37R69Z6rj++uuD7atwDx06tPYH28RhKdHfV/k5wSEc/rzxM4nDL4BYIuL7uB9TXJGZj8Pf\nc/mY+N7sq0TzffeNN94Itl+g+YUXXqhx30As4/nfVRfk6RFCCCFELtCkRwghhBC5QJMeIYQQQuSC\nWgWOvP/++3j88ccBAH/729+ibay7cWo3x+YAsf7H6alZVRj9Pjj+hbVGTnvz+2Cd0KfV8XdzvBCn\n4QPA7NmzazyGrLRyHxfEKftcadO3q04F9hppY+LTFtPiZLz++/LLLwfb/561Tcn3n0+rwpwVI8Xx\nCptvvnm0ja8Hf+xKkwZ233336PXf//73YPP4njJlSp32z9dSVkXmUivCi3R8HB2PLR4jfvV0vo8x\nfsxxPAqnr2f1a16YPHlysH1MC7/mZ5KPF504cWKwOTYHiMcH2/7+yWOW+8W349ccT+dj6/gamDZt\nWrD96gn8bPG/n++z/BuPPPJI1AXdKYQQQgiRCzTpEUIIIUQuqJW8tdFGG4UU1RdffDHaNmPGjGA/\n99xzqftgiYBlq3bt2kXt+HWbNm2ibSwtsUuW0/6AuHolu8iy0vnYBTdw4MCoXc+ePYP93//+N9g+\n7S/L1c7uPl6E0bv7qqW6SqrI7N3VacfmU9s5VdW7XetSjbXUxUdZfstyoT/44IPB5j4GYrez79fl\ny5eXdBzNmV122SV6zVIHn/+sqtdZ8LjIqsqtqr5rjx9XfJ9lyaHU8gC+DAffF1jqylpoOC9wqRSu\ndgwAixcvDjaHcPiUdX52cXkVIF2K92OKn2Vp5WWAOByDJTHfjq8pXsD70EMPjdr94Ac/CPZ3v/vd\naBs/Mzh0pq7I0yOEEEKIXKBJjxBCCCFyQa3krRYtWgS32UUXXZTajivVjh8/PtrGkhNXYWTXFwBM\nnz492H7RUnbJsfvMyw8skfECZ/vss0/U7qCDDgp2WiaCh91zXGkSADbbbLNge9mKJT2Wi7zLuE+f\nPrU6nobAn1/vhq2Gs7WA2GXqfye7ttk1miVlpFVdBtKlryz5g689L2nee++9qfuWWx7o0aNH9Jqv\nd+53f63wwqS9evVK3T/L4Vnnu5Jk4OYCyxYcEsBSTBY+25Hvd9yXPmMyj5x00kmp2/h5yuPGVzUe\nNWpUsH1mF++D74VeBuMK9CxP+vHL4QJs+2cEy9ocEuMXqeVq0j6TudzPQHl6hBBCCJELNOkRQggh\nRC7QpEcIIYQQuaBelvJmTW7vvfeOtvHrM844oz6+vkF46KGHGuR7KqnSrI/HSYuT8ancrAf7fZRa\n1Zlfp1Vd9q+zYn+4DMK4ceOCXR1LVRP+u3zlUBHH8bDW79OXS43p4RRVjr3yMQuK6Sk/XCWXx3Cp\nMRYcEwTEYzCtMr9YHX6ecsyhX4GAS7b4EjBpMVQ+lZ33wf3l+4jvxzzms/qSv2vq1KnRNo6rrW8q\n54kqhBBCCFGPaNIjhBBCiFxQL/KWaJ74FFSulMkpkeeee27U7oknngi2l4RKle/SJK1Sq/T67+GK\nsHvssUewDz744Kjdb37zm2B7Kc5X4s4LaSUjAODwww8P9h133BFsL4Vy1XZfQoLxFbxrOgZgdblL\nrD282C7Lh6VWv/apx2kLXfqFjPOOv7b5fPM9yK98kLWoMo8j3h8vJAukS818Lfh9sPTpyxlw33bt\n2jXYzz77bNSO5a1Sy5DUFXl6hBBCCJELNOkRQgghRC7QpEcIIYQQuUAxPaJk/HIgrC9nLRfQoUOH\nYL/66qvRNtaQy7FSdlq8iY9H4rR6LpXevn371H37uKCFCxfW+TibMlkxPYcddliwb7311mD7VNb7\n7rsv2Jdccknqd3HsR1apglJX/hal06lTp2AvW7Ys2GllJjw+ziotzZnHn1j92k4737ykExAvKeHj\nDTnehz+35ZZbRu04JodXd/f743s1x2n6eBwe92z7GCHG//6s+01dkKdHCCGEELlAkx4hhBBC5ALJ\nW6JkvvGNb0SvuZIxV2n1VY3nzp1bvwdWRrhSMABsvPHGwfYu3h133LFBjqnSyCoFcOCBBwab5Q1/\n7kotVdC/f/9gz5gxI9i+KvDbb79d0v5E6XBfTpw4Mdilyls8doC4CjqnOffo0aOuh5gLWOLlc+/l\ndR5j/h7Mn+vbt2+wfeXm2bNnB5ulJB+ywHIZ9zP3MRBXYufj85WgeZuXqiVvCSGEEELUAU16hBBC\nCJELJG+JkvFyDkftc2R+JS2SWlu8G5fdrn7RTF99NC+UKm+wbPHiiy9G29i9/cILLwR7l112idqx\na58lEd8XVVVVJR2TKB2WEPncl9r/Hr5fcP9vscUWddpfXkiTdC6//PLo9dVXXx3s0aNHR9tWrFgR\nbM7Y8lWcuY84q84vIv3+++/XuM1nZXFGGWfGnnnmmVG7rOzLcj9Pmu7TSQghhBCiFmjSI4QQQohc\noEmPEEIIIXKBYnpEyfAquQAwZMiQYLP+nxXr8uWXX0avOT4ga8X0csPfxcfQu3fvqN03v/nNYLMu\nDgDDhw+vn4OrcEpNGz311FODzWmyAHD00UcH28fxMMcff3ywV65cGWy/gvduu+1W0jGJ0vn+978f\nbF7Rm1PZa8Ohhx5a4/sDBgyo0/7yQlpMi1+d/qKLLkrdxxtvvBFsTktfunRp1I5jdbIq5HOFe7a7\nd+8eteMyJ37MNhby9AghhBAiF2jSI4QQQohcYLWRFMzsHQD5XGWx8eiRJEmHNTerHerLRkP92XxQ\nXzYvyt6f6stGI7UvazXpEUIIIYRoqkjeEkIIIUQu0KRHCCGEELmgIiY9ZjjcDIkZ+q65NWCGBWZo\nX8P7H9bye2vVPmM/J5qhSzn21Vwxw1dmmGqGmWa4xwyt1tD+GTMMK9o19rdoPMywWbE/p5phiRkW\n0+v117wHUUmoP/ODGS40wywzTC/2704Zz9RDzfDzlP3sYYb0ehMVSqXU6TkGwHMAjgZwSeMeSp04\nEcBMAG818nFUMp8kCQYDgBluB3A6gD806hEVjsUAWJIgvSiFWI0kwbtA6M9LAHyYJPh99XYzrJsk\n+LLmT5cfM7RIEny15paiJtSf+cAMwwEcDGD7JMFnxYlO6qQ2SfAQgIdq2M+6APYA8CGAF/z2SqbR\nPT1maA3gGwB+gMKkp/r9PYp/7d9rhpfNcHvxAcWfbWmG/5jhVLdbmOF8M7xUnM3+JuP7rzHDZDM8\naYYOxfcGm+HF4mfvN8Omae+b4UgAwwDcXpw1t0z7LhEYC6B3sY8fqX7TDNeb4cSsD5rh3KK3aKYZ\nzi6+d6UZzqA2l5jhvKK92nVghp5mmGOGkQAmA+hW9l+YQ8xwixn+YIanAVyZMY7Yi9feDAuK9nZm\nmFAcR9PNsHXx/ePo/ZvM0KL4/odmuNQM4wHks1JkPaL+bJZ0BlCVJPgMAJIEVUkS/lj/SfFZOMOK\nqosVVIzrizZfD3eh8IfrOcV+bDLVQRt90gPgWwD+kySYC+A9M2xP24YAOBtAPwC9UJgcVdMawMMA\n7kgS/IV3aIb9AGwNYEcU/noZaobda/jujQBMThJsD2AMgIuL7/8DwAVJgoEAZmS9nyS4F8BEAMcm\nCQYnCT6BSKX4F8KBKJy/2n52KICTAOwEYGcAp5phCIA7ARxFTb8L4J41XAfbAPhHkmBIkiiltIz0\nAbBPkuA8pI+jNE4H8KeiR3AYgDfNsC0KffuN4vtfATi22H4jADOTBDslCZ6rYX9i7VF/Ni8eB9DN\nDHPNMNIMI2hbVfFZeAOAn6V8vvp6OALAjQD+t/jcG1u/h10+KmHScwwKDy0U/z+Gtk1IErxZlB6m\nAuhJ2x4E8PckwT9q2Od+xX9TUPhLvi8KDz/P1yjMWAHgnwB2NUMbAG2TBGOK798KYPe090v9kQIt\nzTAVhQniGwD+Vod97Arg/iTBR0mCDwGMArBbkmAKgI5m6GKGQQCWJwneQPZ1sDBJ8OJa/SJRE/ck\nCb6q43gZB+CXZrgAQI/iHxB7AxgK4KXi9bM3Cn8AAYUH5n3l/gEiQv3ZjCjeN4cC+CGAdwDcRd71\nUcX/JyF+1jL3NHXZsVFjesywGYC9APQ3QwKgBYDEDP+v2OQzav4V4uN9HsCBZrgjSeCLDRmAK5IE\nN9XykFS0qP4IMT3VmOFLxBPvDZFN1qJP9wI4EsDmWDWJrvE6MENPAB+t+ZBFHSjlvHK/hz5PEtxR\nlDa+CeAxM5yCQh/emiT4RQ37+bSp34CbAOrPZkbxHD8D4BkzzABwQnFT9fPWP2uZJn/fbGxPz5Eo\nSAw9kgQ9kwTdAMxH4S/6NXERgHcBjKxh22MATrZCvBDM0NUMHWtot07xGADgewCeSxKsBLCcNMrj\nAYxJe79ofwBg4xKOWcQsBNDPDBsU/5Lcew3tnwXwLTO0MsNGAA4Hglv1ThRiwo5EYQIElH4diDKz\nhvGyAIW/NoFV4w9m6AVgXpLgWhSCJwcCeBLAkdX9ZoZ2ZuhR/79AMOrP5oEZtqmOrSoyGHWvGN0k\nn3uNnb11DIDfuffuQ2ECctfqzVfjbAA3m+GqJAneISQJHi9qx+Os4Bv4EMBxAJa5z38EYDszTAKw\nEqviQk4AcKMV0qrnoRBHkvX+LcX3PwEwXHE9pZEkWGSGuwFMB/AqCjJUVvvJZrgFwITiW38tSltI\nEswyw8YAFicJ3i6+l3Yd6K/JhiFtvPwewN1mOB7AU9T+KADHmeELAEsAXJokeM8MvwLwuBnWAfAF\ngB9Dpf0bA/Vn06c1gOvM0BYFD91rKEhdB9dhXw8DuNcMhwH4SVOJ69EyFEIIIYTIBY0tbwkhhBBC\nNAia9AghhBAiF2jSI4QQQohcoEmPEEIIIXKBJj1CCCGEyAWa9AghhBAiF9SqTk/79u2Tnj171tOh\nlM4rr7wSbDOr0QYATsdff/31a3wfAL744otgr7NO+jyQP7f11jWtalF+FixYgKqqqqxKxHWiUvqS\n+eqrVeVzWrRoEW377LNVxbm//HLVYs++z/l1y5aVt/brpEmTqpIk6VDu/VZifzLvvvtu9Pqjj1YV\nduVx5ft9ww1XFelu3759PR1d3cjT2MwD9TE21ZeNQ1Zf1mrS07NnT0ycOLE8R7UW7LHHHsHmm+QG\nG2wQtfv000+DzRcevw8AS5cuDfbGG68qMMkPYf/60Ucfrd1B15Fhw4bVy34bsi/5oeYnKczy5cuD\nvemmm0bbXn/99WBXVVUF2z8k+RoYMGBA7Q+2njGzeinC1pD9+fXXXwfb/wHh+6Oaf/wjXiJv3Lhx\nweZJrO/3vn37Bvvkk09OPaZSr7G0z9Tmc9U0h7EpVlEfY1N92Thk9WVjV2Quiffffz96PWvWrGB3\n6JA+Mf/kk1WFkfmhyX89ArF3p1WrVsH+/PPPo3ZZ3yVW4SeL/CD0DxqepLDHzXtpuC/btm1b42cA\nYL311gv2qaeeGuyrrrqqlEMXJZDlDWWmT58e7BNOOCHaNnz48Br3x/0HAP/7v/9b4z785IonLKVO\ngGo7yRFCNH0U0yOEEEKIXKBJjxBCCCFygSY9QgghhMgFTSKmxwcesxbP8SOcoeVfc4CkjznhmCGO\nL/BR95WYDVSJpAWzAsBdd90Vvb7ooouCzTEg99xzT9Tu/PPPD/aUKasWY3/iiSeidvvss0+wzzjj\njGBzsCwArLvuqku/LkGwosDLL78cveakgI4dOwZ7/PjxUbuLL7442CtXrgy2H2N//etfg/3ss88G\n+7nnnovaXXDBBcH29wEhhKhGnh4hhBBC5AJNeoQQQgiRC5qEvHXfffdFr7nQ2RZbbBFsL1txPRFO\njeb3gTgdmmUQdrsDwFtvvRXsSZMmBXvo0KHZP0AEWFYCgC5dugT7V7/6VbAPOuigqN1//vOfYM+f\nPz91/yNHjgx2qUXBJGllw9c6ADzwwAPB5jEBAN/4xjeCvWLFimC3a9cuarfNNtsEe9myZcH28tag\nQYOCzSUkNtlkk6gdlyQYMWJEsLfddtuoXaUVOBRCNCzy9AghhBAiF2jSI4QQQohc0CTkLc7gAIDO\nnTsHmzNEOHMEiKWURYsWBZurLgNxthFXa/ZSDLvhJ0yYEOy8yFtZZftZepg8eXLUjmUOn4n32muv\nBXvmzJnB9st8cBVm7v+5c+emHi+v0cZrdwGxrMZVnTt16hS1K7X6cHODs6H23nvvaBtLRCxTAUD/\n/v2DvWDBgmDfdtttUTseM3369Am276eHHnoo2Pvvv3+wvWz14osvBpsz+vh9APjWt74V7IZaP08I\nUTnk844uhBBCiNyhSY8QQgghcoEmPUIIIYTIBU0ipodjMwBg2LBhweZ0c7/iNqemt27dOth+9XRO\nU2/Tpk2NNhDHd/hU3TyQldo9e/bsYL/00kvRNo774PgNABg8eHCwFy9eHOwPP/wwasdp0kOGDAl2\nVVVV1I6vh4022ijYXOYAAF599dVgcwVfv8p3nlKcZ8yYEWyOpbnyyiujdlwKwMe99erVq8Z2y5cv\nj9qddNJJwZ43b16wP/7446jd1KlTg73TTjultuMYra5du9b4eQD4wx/+EOwbbrgBQoh8IU+PEEII\nIXKBJj1CCCGEyAUVK2+9/fbbwfaVljlNndPIfXoxyxacss5p6UAsfbFE5hep5M9xhWcRyxe9e/eO\ntrFU1aFDh2gbL/a62WabBdvLShMnTgw2lwvgFGkAeOedd4L9wQcfBJsXnPXfxdcNy2N5gysvcwXs\nm2++OWr34IMPBpvPIxCnkvNipA8//HDUjvudU9t92QmWIbmcgJe8WSLj6s/9+vWL2n3zm9+EECK/\nyNMjhBBCiFygSY8QQgghckHFyltLliwJtq+gzHCVYL9YIWf2cMYXV/4FYvmFpS4vq6VVbs4rfN5Y\nSuKKyUCcCTRgwIBom6/QXA33AxBn3LEE5bOtuM8428xfQ/yaM4F8VlCeeOqpp4K95ZZbBpsz7IA4\nq9H3E8uNCxcuDLa/Jvbaa69gv/7668H2GZicUcaSp5fBWPry+2DefPPNYPvMvzxl6gmRV+TpEUII\nIUQu0KRHCCGEELlAkx4hhBBC5IKKjenh1bN93AZX2mV8xWBOe+e4Aa7oC8Tprz169Ag2p7wDcfVZ\npazHq6fz6tibb7551I7jLzilHIj7MitmapNNNgk2Xw9+5XdOoebYE67O7V9zvJCPB+Hf1dz7nNPI\nucQDx8MBcXyOj8lq27ZtsLmMgR/DvML5ypUrg+1jr3hsctwYfw8QXy8jRowI9n333Re14xR4X6Vb\nMT2iucL3SbZ9mZe68OyzzwZ79913X+v9lcpHH30UvU6bF3jk6RFCCCFELtCkRwghhBC5oGLlLa7m\n6l3e7NZiSYTlFmD16r/V7LzzztFrXpSQJTKWNvw2L33lET7ffD689MDVkP055W1ZbleWUbg0gZfB\nuB2ntvtriCU3rrztj50ln7TrqbmQJk09+uijUTs+D76CNUubXGmZbf+axzpXUwbiSsunnHJKsP2C\nvzyGx4wZE+wXXnghasfXgb8WhWiu8LMra+Fo5qyzzgr2G2+8EW3bbbfdgv3kk08Gm0tdAEC3bt1K\n+i6+B/tFjJmrr7462Pfcc0+0jUtuZCFPjxBCCCFygSY9QgghhMgFmvQIIYQQIhdUbEzPa6+9FmxO\nPQbiFGNOhfU6/4knnljjvk8++eTo9Y033hhsn9rMcPwQ23mF4zk4psefG27nS/9zfAjH8WTpzqz5\n+qVCuP84xdzrxBwLlKUh52m5kaFDhwb7hBNOCLaPi+E4m/feey/axmUiOC6IlywB4ngwTkX3/clp\n5byEBKeeA/HyIWnLzwBx3JKPHxKiKeGfVXWJ2+GxDAA77LBDsL/3ve8Fe/vtt4/a8T2ey4T85Cc/\nido98MADJR1H1j34tttuC/add94ZbH9P4djALOTpEUIIIUQu0KRHCCGEELmgYuUtThX2q6ez644r\n6PpqumeffXaN+2YXnt8fuwy9TMNyieStOD08q5oyp4ez5AHEcgNLGSxhArH7k7/Llw5giYz7kis6\nA3FaM1fo9m5hX/G5OcErmAPAv/71r2Afc8wxwfZudE4v9dIzr7rO23x/ZlXBZtIqbHt3OPc7XxMH\nHHBA1G7JkiXBfvrpp6Ntxx9/fOpxNBd8FWqWN1giBOI05f79+wf7z3/+c9SOz1uXLl2C7a8NLk/B\n+Our1CrBPDZLlXMqlazfkrYt6zz58cbXPd/v/DPyggsuCPbAgQOD7UtOcNmYbbfdNthPPPFE1I77\n/Je//GWwv/Wtb0XtOEzlueeei7aNHDmyxnaDBg2K2nXt2hWlIE+PEEIIIXKBJj1CCCGEyAUVK2+x\n+5pd5kDs4uPMIL/QZa9evUr6Ll5okF2tPruDXcO+Em0eYRcqV7r1khBLlb6POIsny3XL1wP3kXcF\ns/uTZRjPvffeG+w+ffoEm93zwOoLajYn/IJ97AK/5ZZbgu0rMl988cXB5nMHAJ06dQo2y1aLFy+O\n2g0fPjzY3NcdO3aM2vEY5EVKfTuWTQ8//PBgz5kzJ2o3bdq0YPuMlEqSt9Jk1TQZx2e9sQTMlWqv\nu+66qB0vxOyvB5YJt9pqq2CzXA3EC7xef/31wfZSx0MPPRRsropfqkzjpeymLmkxWb8lbdvYsWNT\nP8NjFIiln7/97W/B9tIiZ0hOmDAhdf/8/ONr9Zvf/GbUjiXOG264Idg333xz1G7jjTcOts/w7d69\ne7D5vjF+/PioHT9nspCnRwghhBC5QJMeIYQQQuQCTXqEEEIIkQsqNqaHU1WzYjO4KqNPTy0VjjNh\nLZxjfYC4+mxW5ea8wKtUp62WDsSVMn06O5cBYJ3YxygwvM3HA3C/+Fgw5v777w/2eeedF2wfN+Cr\nfjYn+vXrF72+4oorgr3ffvsF268uf9999wXbpyVvscUWwea+ueOOO6J2HG/HcSVc0RmI4xb4Glu0\naFHUjqs6MwcddFD0es899wy2//2VSKnp3L6ExuTJk4P9xz/+MdjbbLNN1O6oo44KNlfkBuJyEhzX\nNW7cuKjdX/7yl2BzbAbHdwFxrBWvxv3zn/88anfooYcG24/HPMKrE3AMJJeYAOL77K9//etoG8dr\nceyej+PiZy3fq7Mq33PcIz8TAOA73/lOsLlfX3nllagd3wP8yuz77LNPsPl+c9ddd0XtSr1W5OkR\nQgghRC7QpEcIIYQQuaBi5S2WJnwVX3bBsevvmmuuSd0fu+O8i5hdrZyy59367OLjdnmFUym54rF3\ncXI1T3Z/+7bsJuXUcyDuM7azKmNnpdGzpMnp1FyFFGjeMqZftHPu3LnB5vO6bNmyqB2PP+/2ZomS\n9+HlqFmzZgWb3fL+2uF+4xR4rhYMxNLzdtttF2wvsfBvnj59erTN931jUj226lr5naUqLrVR10VW\neQFatj3z588P9v/8z/9E26ZOnRpslo1ZVvX76Ny5c7D94rZ8bWTJL2nXEADstddeq/+IMvLZZ5+F\na44XywTisgt87/NSLR8z9x9LtUC80oBPN+d7Jt+r/fXF55HPN0ti/hh5zPvxy9v4me5l1l133TXY\nvnI3HwcvYMohMEB8T8lCnh4hhBBC5AJNeoQQQgiRCypW3uKsHl8Vl12j7LrMysbIyvhhdzi7Vr0U\n88477wQ7bfG85ozvBz6n3F9ZlTH9ooYbbbRRsLnqspe30hal9Jl9LLlxNL+vCPzWW28FO0uqzJO8\nxdkafL7vvvvuqN3vfve7YPPYAeKMHz53LDUCwPe+971gT5kypcZjAOLxeOCBBwabK7MCsQv8nHPO\nqXHfQHz9+WuMM2P4dzQ0n3/+ebgmWXIE4vPICzH7exovJMmSwwsvvBC149/sxzdfAzzWvXTC0gdL\nJ3379o3a7bvvvsHm6tqc8QfEEgZn7/n7Mfdf1qK4vI1/L7D64tPlZtmyZaESMVcDB+J7JuMX0+WM\nJX4GrVy5MmrHchnfV4F4HM2cOTPY/t7H54elqax7P+N/E1+jw4YNC/ZLL70UteNK3lnP56yq/b17\n967xmDzy9AghhBAiF2jSI4QQQohcoEmPEEIIIXJBxcb0DBgwINh+NVXWF1kb9it4M1kr+XLV1muv\nvTbYPv6Eteu6pn42ZbKqJLOW7+MQGNZ4gVgD5v37SsicWplVfoC3sa7PqwwDcSqzj21hWEPOWsm6\nKTJp0qToNV/TnObsq6dyzAGv4A3Eqajch2PGjInaDRkyJNh8vfiYCz6O3XffPdi+KjDHb/GqzD6m\nh68Dv5ozx0s0ZkxPixYtQnqvj6XgVH2Ox/DjgO+fvKq2h+N9fEVbjgvheJHvfve7UTsu+cEp5nXl\ntNNOCzbHB/pYl6xK/RzrmbZiPVD//bzpppviyCOPBLD6M4PLOHBZFp+yzvGHHN/DpUD8No7hAeLK\ny3we/XXD++D4Mb6egDhdnsfoqFGjonaPP/44SoF/s3/uMnxN1rV6vjw9QgghhMgFmvQIIYQQIhdU\nrLzFC+H9/e9/j7axe47dn97VzosmZrk4ObWSFzvzrj/eR9oCh3mCzw+7QnmxQ4+Xt9Iq+Pp04jQp\nybu8+Ziy+pyrg3r5hkmTy7KOqang07533nnnYHNaK1dLBeJyDTNmzIi2ff7558FOq4oLxFIhXzss\nMfl2fP59CQN2dXPf+vIJ7Kb3Y5glnMakRYsWQXbxC6bmDZbNmyItW7YMKdc9evSItqVJgV5GZ4lz\n3rx5wfaV0kePHh3sE088MdrG1ca5knG5F3Q95JBDotf/+c9/gj1o0KBg+2cr30v9QtF872CZzi9O\nnCWLMfL0CCGEECIXaNIjhBBCiFxQsfJWltTBUdrc7rbbbovasbzlZRCmffv2weYMrYULF6Z+r68c\nmwe8pMDyBWc5+YUBGV9Vld2VLI14Fy+7YbOyNvh6YOnMX0Ps4k2r9gyULpc1RXgBSCCuaMrbfOYb\nu5V9pWt22bN85BcI5cwjzjTxFWLZZb106dLU/XF/9unTJ9h8TQFx9V8/vjmDhrNYhKgrLVq0CNeS\nl1qffPLJYPM9zd+rOMOsf//+wfZyzplnnhnsXr16Rdt4HHDWYlaYBt8/2QbieyE/W/29grNzubo2\nS11AfA/OWgSVf7PP1vILkKYhT48QQgghcoEmPUIIIYTIBZr0CCGEECIXVGxMD+O1O9YXObbGr/5b\nFzg90FesZd3Ra5x5wMdHcEwP67pZ58bHSnAMFcfP+D7nyrHcjmOJgLiP+Ph86isfh48PYfh3NbcV\n1x955JHoNev0f/rTn4K9//77R+2GDh0abJ96uv322webK87uuOOOUTteOZnPq78+ONaB4wB83Bin\n0XPa+7nnnhu14/IEPh7pl7/8ZbB79uwJIcoJl0Op6XU1r732WvSa76dcPd5XL+ex6Mtw8P2T74U+\nfojvp5w67qtJ832W7xt8PweADh061Phd/l7K++DYOg/HhPpj2mqrrVI/x8jTI4QQQohcoEmPEEII\nIXJBk5C3vvGNb0Sv77jjjmCzi6sc1SXZre3dbOwizFp8s7niU7a9tFSNr5TJi8L6z3B6IktJvsQA\nv+Zzn5VGnrXI7Lbbbhvsl19+ObVdc5a3fv/730evuUIzy4vebcxudZ/uz3Izp9r6xYA5tZXPKy+u\nCMRpvnzteGmAZWmWYU855ZSoHVeX9v3pK08L0Rhw6Ygs/CKgojTk6RFCCCFELtCkRwghhBC5oEnI\nW1xpEgDuvffeYLOE4aPZeXE2X6EyDY4O99Uq2R3O2SJ5oVR5y2dDcRVcv480ydBngKVVf/YSBW/L\nyiLjzAT+Xi9bsvyWVQm6KcLjA4ilKT4P22yzTdSOK8mOGjUq2saLzbJUdcstt0TtWDrmLK85c+ZE\n7Vi24v35atLvvvtusLkSu1/AlKs6+2xEvn9w1okQovkgT48QQgghcoEmPUIIIYTIBZr0CCGEECIX\nNImYHr9yK6fCcmqt1+i5QnOpMT1Zq3lz/In/rjyStjq5j6XhFEy/gi6vwsvxOL4db8s69/5zaWy0\n0UY1Hq9fuZhT1ptbn/MK90Ac/8L2sGHDonZcdZnLEQBx2ve0adOCzfFCAHD00UcHe9asWTXuG4hj\ni773ve+lHhNXaD7ggANq3DcQ3y/8789jlXUh8oY8PUIIIYTIBZr0CCGEECIXVKy8xanNPjV63333\nDfZ9990XbF+R+cEHHww2u9Oz4FRmXx2Wj6m5VectBU4fBtKlpAULFkSvd9lll2DPnz8/2sbVm1u2\nbBlsXxKApTSWPLwEye3S5Df/XStXrqxx38DqlaGbE74kA6eO86KHrVq1ito99thjwfbni/uDFx/s\n169f6nHw/n2VWU6rZ1m7Y8eOUTtORedrisczEJdT8L/fX99CiOaHPD1CCCGEyAWa9AghhBAiF2jS\nI4QQQohcULEBCxwz42NHDjrooGDzkhQcpwEAb775Zq2/t02bNsH2KcocZ8IpsnnBx8+kLVvgYyM4\nvdgvQ8FxWLwPf345NoOvDZ92zHEavESJPyZOjeYVwDmuBYiXYMiKEWqK+PiZnXfeOdhz584N9nrr\nrRe145XPfRwdx0eNGzcu2O3bt4/aPfHEE8HmNHJfWmL8+PHB5lg+308cR9anT59gjxgxImo3e/bs\nYG+yySbRNr+avBCi+SFPjxBCCCFygSY9QgghhMgFFStvsTTh4aqvXK3Zr7LOKbNcHXbQoEGp+2aX\nt6/Oy25+Tp/NC750AL/m9H4vCx555JH1e2DEZpttVlI7ltxYXnnqqaeidiwBeSmtqdO9e/foNa+e\nzqndfixOnz492F26dIm28Zhhyaldu3apx8Gyqa+KzK9ZuvRjk+UullC54jcQp7b7Su++TIIQovkh\nT48QQgghcoEmPUIIIYTIBRUrb3kpJQ120U+dOjXaxnLUf//732BnyVvsQs9agJDd5Hlh4cKF0WvO\n1GFp8de//nVDHVJZ+OlPfxrsLbfcMtrGEqmvwt3U5RCfvXXdddcFmxfr9Xz/+98P9osvvhht40xL\nlg297Pj6668Hm8epl634NctsWZmVffv2DTZLcf51z549o22l3nOEEE0XeXqEEEIIkQs06RFCCCFE\nLtCkRwghhBC5oGJjekrlwgsvDDZX1gXiWAFfmTWNo446KtidOnWKtnGa+t57712bw2wW+BWruUIx\np/rvscceJe+T04sbK6biiCOOCLavMOxXEW9O+BXkv/3tbwfbjyWmf//+Ndqek08+OdhDhw6NtvG1\nw2nvPs6mc+fOweaV2n27Qw45pMZj8N/LcUHdunWLtimmR4jmjzw9QgghhMgFmvQIIYQQIheYXwAy\ns7HZOwAWrrGhKCc9kiTpUO6dqi8bDfVn80F92bwoe3+qLxuN1L6s1aRHCCGEEKKpInlLCCGEELlA\nkx4hhBBC5IKKnvSYYTMzTC3+W2KGxfR6/TXvQTQlzLC5Ge40w+tmmG2GR83Qp5b7aGuGM+rrGEVp\nmOFCM8wyw/TieN3JDAvM0L6Gtoea4ecp+9nDDLvU/xELoOZ+K8M+nzHDsLVtI0qjPvqQ9r2HGR4p\n1/4ag4qu05MkeBfAYAAwwyUAPkwS/L56uxnWTRJ82VDHY4YWSYLmW7ilETGDAbgfwK1JgqOL7w0G\n0AnA3Frsqi2AMwCMLPMhihIxw3AABwPYPknwWXGik/pHSpLgIQAP1bCfdQHsAeBDAC/Uz9GKamrb\nb6LyqOQ+bOjndRoV7empCTPcYoY/mOFpAFeaYbAZXizOau83w6bFduEvBzO0N8OCor2dGSYUZ8DT\nzbB18f3j6P2bzNCi+P6HZrjUDOMBDG+UH50P9gTwRZLgxuo3kgRTATxnhqvNMNMMM8xwFACYobUZ\nnjTD5OL7hxU/9jsAWxX78eoG/xUCADoDqEoSfAYASYKqJMFbxW0/oT7rCwBmONEM1xdtHt93ATgd\nwDnF/tytEX5Lnqix38xwkRleKo7BPxf/QKm+x15ZvG/Ore4fM7Qsemynm+EuAC2rv8AMN5hhYtET\n8ZvG+JHNnLQ+XGCG39Qw9jYyw83F/p1SfR81Q08zjC22n1yTt9UMOxQ/08sMQ80wxgyTzPCYGToX\n2zxjhsvNMAbAT/0+GoMmN+kp0gfAPkmC8wD8A8AFSYKBAGYAuHgNnz0dwJ+SBIMBDAPwphm2BXAU\ngG8U3/8KwLHF9hsBmJkk2ClJ8FzZf4mopj+ASTW8/20UvH2DAOwD4OrigPoUwOFJgu1RmDBdU7wZ\n/xzA60mCwUmC8xvkyIXncQDdig/CkWbgcuhVxT67AcDPUj5fPb6PAHAjgP8t9ufY+j3s3JPWb9cn\nCXZIEvRHYQJzMH1m3STBjgDOxqp7748AfFy8J/8WAJfFvjBJMAzAQAAjzDCwHn9PHqnt2LsQwFNJ\ngh1QuI9ebYaNACwDsG+x/VEAruUvKU6CbgRwGIBFAK4DcGSSYCiAm1Ho92raJglGJAmuKfePrQsV\nLW9lcE+S4CsztEHhhI4pvn8rgHvW8NlxAC40wxYARiUJXjXD3igMzJesUIm+JQqdDhQmQPeV+weI\nktkVwL+KsuLS4l8MOwAYDeByM+wO4GsAXVGQwkQjkyT40AxDAeyGwo30LlsVszOq+P8kFCa0NXGP\nZOSGJ6PfPjDD/wPQCkA7ALMAPFz8GPdnz6K9O4oPySTBdDNMp6/5rhl+iMKzpzOAfkC0XawFdRh7\n+wE41CxMgjYE0B3AWwCuNwtOAI6t3BbAnwHsV/Qi9Ufhj9b/Fp+fLQC8Te3vKtsPLANNddLzUQlt\nvsQqT9aG1W8mCe4oSlXfBPCYGU4BYCjEkvyihv18qhtwgzALwJE1vJ+2INKxADoAGJok+KIoX26Y\n0lY0MMUx8wyAZ8wwA8AJxU2fFf//Cun3n1LGt6gHaui301DwygxLEiyyQmwlj7O0/lytAJwZtkTB\nw7BDkmC5GW6BxmzZqeXYMwBHJAle4X0U+3kpCh72dVDwrFfzNgr9NgSFyZEBmJUkqeEfFTWem6q8\nBQBIEqwEsJy0/uOB4PVZgFVu1fAwNUMvAPOSBNeiEDw5EMCTAI40Q8dim3Zm6FH/v0AQTwHYwAyn\nVr9hhh0ALAdwlBlamKEDCn9FTgDQBsCy4oRnTyD01wcANm7YQxeMGbaxYqxckcGoe1Va9WcDkdJv\n1Q/DKjO0Rs1/mHieRTE8oOgFqJawNkHhAbjSDJ0AHFiO4xarqMPYewyFOLvqOK0hxffbAHg7SfA1\nCs/VFvSZFSg4DS43wx4oXCMdrBBEDTOsZ4bt1v7X1A9N1dPDnADgRjO0AjAPwEnF938P4G4zHI/C\nA7WaowAcZ4YvACwBcGmS4D0z/ArA42ZYB8AXAH4MlQ9vMJIEiRkOB/DHojv2UxQmrmcDaA1gGgp/\nPf6/JMESM9wO4GEzTAQwFcDLxf28a4bnzTATwGjF9TQKrQFcZ4a2KHhcXwPwQ8SxIKXyMIB7iwGW\nP1FcT72S1m8rUIiXXADgpRL2cwOAvxdlrako/JGCJME0M0xBwas7D8DzZT16AdR+7F0G4I8Aphcn\nPguKbUcCuM8M3wHwNJy3Jkmw1AyHoBBmcDIKk+FriyEn6xb3OauMv6tsaBkKIYQQQuSCJi1vCSGE\nEEKUiiY9QgghhMgFmvQIIYQQIhdo0iOEEEKIXKBJjxBCCCFygSY9QgghhMgFtarT0759+6Rnz571\ndCh1Y9GiRcH+5JNPom3t2rUL9tdffx1ss7jI7/Lly4PdqdOqlQzatGlTtuOsKwsWLEBVVVVaVeI6\nU4l9mQcmTZpUlSRJh3LvV/3Z8ORpbH722WfB3mCDDdZ6f3yvbtmyZUbLhqM+xmYl9iVTVVUVvf7y\ny5oXQV9nndg/sv76qxZub9u2bdmPa23J6staTXp69uyJiRMnlueoysRPf7pq4dYZM2ZE244//vhg\nf/jhh8Fed934Z48aNSrYvL+DDy6tlhpPqIDVL5C1YdiwYWXbF1OJfZkHzKxeCl6qPxue5j42v/pq\n1eo7CxYsCPZWW21V688DQIsWq4r68r26f//+UTv/R2lDUR9js1L6Mo2//OUv0esVK1YEmydArVu3\njtptscUWwT788MPr5+DWgqy+lLwlhBBCiFzQJJaheOaZZ6LXI0eODDa7Wt97772o3VlnnRVs/iuj\nVatWUbudd9452HfffXewH3rooajd7373u2CzdFZOz44QQlQCX3zxRbA5jCDL08MV/vme63nrrbeC\nPWDAgLoeYrPEr5KQ5vny7dgzs95660Xb2OvGSoeXKtO+y7/P8uQBBxwQ7NGjR9f4eX98Xm1pSPS0\nFkIIIUQu0KRHCCGEELlAkx4hhBBC5IKKiel55ZVXotdXXnllsOfOnRttGzhwYLDnzJkTbJ/62L59\n+2Bzap7PFuCU9Sy98+yzzw527969g3366adH7Tp27AghhGjKbLjhhsH+61//Gmyfojx48OBgZ2Ve\nPfjgg8H+05/+FOz9999/LY6y+ZEV08OZwj6W1MfxMGeeeWaw+bnWuXPnqB2non/66afB/vzzz6N2\nG2+8cbCnTp2a+r0MP1uzMvvqG3l6hBBCCJELNOkRQgghRC6od3kry411ww03BPvFF1+M2m200UbB\n3nHHHaNtXCiJXXAvv/xy1I7lLpac/DG99NJLwf7BD34Q7E033TRq9/777wf77bffDvZpp50Wtbvx\nxhuDzRWegWz3pBBCVAqcsj527Nhg8/0SiMMNTjrppGBfeumlUTu+V/sQA7EK/1zgfsiSsB599NFg\n//73v4+2vf7668HmcitejuzatWuwuayAf2by51iO83LZ+eefH2wOD2lIOcujp64QQgghcoEmPUII\nIYTIBfUub2W5sXj9lc033zz1c756I2dbHXroocGePXt21I4lqGuuuSbY3u2633771fi97I4F4krO\nm2yySbD92lt33HFHsM8555xomyQtIURTgKUUvj/7RSk5rODHP/5xsDn7C4jDBTp0KPuau80G/zxJ\nk7SOOeaY6DWvJuDXyuJnF0tTvCYlED8zGb+YN4eOsPTFC9MCwIUXXhjsq6++OtjXXXdd1O7II48M\ntr++yl29WU9gIYQQQuQCTXqEEEIIkQs06RFCCCFELmjwiswcj8MxM17j5XZe4+NqkO+8806w99hj\nj6jd0qVLg81655Zbbhm169u3b7A/+uijYPsqlJw6yJqmj0d68803g92YlSeFEKIc8P1u8eLF0Ta+\nH3O1Zl/Rnu/3XJJElM7TTz8d7AceeCDa1qNHj2DzswpY/RlajX/GLViwINj9+vULto/VWbFiRbA5\ndsvHcXE/8zGdfPLJUTuu6s2rHQBxSnxWxe9SkadHCCGEELlAkx4hhBBC5IIGl7fmz59f4/s+PZzd\naV4S4nS8N954I9hcMRmIq0OypLVkyZKoHbv02FXrqymza41lqw8++CBqx79l5cqV0TauhimEEE0B\nljq4ui+QnlLt3+f7Ilf+9ZRbzmhqZJU1uemmm4Ltn4ssYfmwCj6nnBLv+4hfc0VmL1Wm9ZFfLJWP\nifftfyOXdnn44YejbeW+BuTpEUIIIUQu0KRHCCGEELmgweUtjvxnd6eXnDgjystWc+bMCTZHkftq\nkpxxwO2mTJkStWvfvn2wOZNr0aJFUTt2GXIlS5+9xfhFUHfZZZfUtkII0Vh4aYJlBc7C8RVy06QO\nHx7w7rvvpn6XSIfP1XPPPRdsrrIMxNlRXhLifXA7L1uxZMYyGGc1A/Gzlfed1a8sdfGKBgDw7LPP\nBptXagCAAQMGpO6zLsjTI4QQQohcoEmPEEIIIXKBJj1CCCGEyAWNGtPDeqLXDFn/22yzzaJtCxcu\nDDZXbvbVIHn/HTt2DPa2224bteNUOt6HT6Pv06dPsJ944olg+xVtOUZo1qxZ0TbF9JQXryFzXFeX\nLl2C7a+vP/zhD8E+88wzg+0rxa6//vqp380xXqq0LZo6WanBr732WrCzUqq51Igv5cH3cb6H1+Y4\n8shdd90V7Pfeey/YPi6GY3D8OWzTpk2wP/7442D7ys18T+N7oV/5nfuZn5lZsURZ7/Pra665Jtp2\nyy231LiPuiJPjxBCCCFygSY9QgghhMgFDS5vsWTEad/sPgWATz75JNg9e/aMtrGblKUpTokEYumL\nXXre7dqrV68a9+clC66uPG7cuGD3798/arfffvsF2/8uURpZ6bPz5s0L9tlnnx21O/3004M9efLk\nYP/0pz+N2rHL+N///new77jjjqjdwQcfHGxfEoFTRn/4wx8G28uxeU3PHTlyZLBnzpyZui2LvFfn\nrRR4ocvu3btH2/ie6WUQhvvPl/IQ6bzwwgvB5meSl6YYL8vz85Q/5ysyc1gJLx7r4XHJkpiXPtPG\nr18AlX/X2LFjU7+3HMjTI4QQQohcoEmPEEIIIXKBJj1CCCGEyAUNHtPDS0pwGjHH9wDxquh+21Zb\nbRVsTkufMGFC1O6dd94JNq8S7PfHGidrn77MNx/T3/72t2BfeOGFUTuOH/Kp0qI0suI3OAbroYce\nSm03atSoYO+7777RNi4lwOmX3bp1i9qNGTMm2L4kAuNL8zcmfA0Dccn4rHZZ6flMVnr+I488Emxe\npZlLRgDA97///WD/9re/DbY//2nXgV9FutTjE6Xz6quvBrtDhw7B9ksXMJwa7fuOX/v4OJEOxyZy\nzIyPFeTx6889x9LyfczH1qT1kd9f2r3i888/T23H3+WPna8p/9wtN/L0CCGEECIXaNIjhBBCiFzQ\n4D75BQsWBJtdWt4lfeyxxwb7d7/7XbSN0+zY3edXY+cU9mXLlgV72rRpUbuBAwcGO80dB8Sp7pxG\n791xLJ/lNV25PnnqqaeC/frrr0fbOJ2WK3n6KtwPPPBAsLnyqHfjsot31113jbbxdz/88MPBPu64\n46J2WVJMfcDSERBXnB4xYkSw02SvtYFT0Xfcccdge3f4FltsEWwuH+BlsMMPPzzYG2+8cbD9/YLP\ncV3HnFLiY1hWYdnCn6e0FGhf0Z7v1W+++WbZjrO5w/cZvu79dc7lAnzqOMvvWTITt+N9+PR4/2xM\n+960dv6eyN/rw0/KjTw9QgghhMgFmvQIIYQQIhc0uLzFcgEvzLlixYqoHWeWbL311tE2dplxZU8f\nOc6ZBCyrcVYJAHzjG9+o8TN+UTx2r3NVYF/hmaPjvauOM7vqO0p9TaTJAOy+5ja+2mqpWTLcX95N\nmrYPL4dcccUVweZz7zOqNt9882DfdNNNwd5hhx2idnzu99prr2C3a9cuavfcc88Fm7MBgVgyu+++\n+4Lt5a2GyOz6+uuvQ6aglw44w42vP19JnH87nx/f72+88Uaw//73v0fb+Pzz+Gb5DwAOO+ywYPPY\nf/TRR6N2PL45a89n4/Xo0QO1xbvY0+SBvGaDjR8/Pth8Pvx543tEVnYR31c6d+4cbF+1vnfv3nU8\n4ubJ0qVLg81jqlTJCYj7hfvBS5W8j6wFTHl/3M5XeOZjLFU+5mc1EIet+EVW64I8PUIIIYTIBZr0\nCCGEECIXaNIjhBBCiFxQ78EGPs6GX7Mu6ONb+LWPi+EYANbyfVwQp6nzPoYMGRK1S1v53ccJsLbY\nunXrYPs4kKqqqmBzjAMALFmyJNgco9AYlKKxZrUpJSYIiGNaSo1v4XRzINZ5BwwYEGxf8ZpXOOe4\nAR/ncsYZZwSbNfO+fftG7fbZZ59gez355JNPDjZfa//85z+jdj7Gpz745JNPVlvJvBo+R7fffnuw\nuVQDEMdRse1jLmbMmBFsP7532223YHPK8/777x+14/HN33XAAQdE7fi8zp07N9jjxo2L2nF81Xbb\nbRfsYcOGRe24srCP1clr7E4aXLWc79U+3o7vmVkrafM2jvXg0iKAYno8fJ/la9SPPT6/Pu4qK46H\n4fgcjh/iWED/mo/Jx/8xfExZ7TyvvPJKsH1sZl2Qp0cIIYQQuUCTHiGEEELkgnqXt7xrnKsws4tz\n5cqVUTuWJrwkwm5Trirr98Fu0z333DPY7CYHYjmK8ZIbu/H4GDiV3b/2rluf3t6Y1LZybV3d/3yu\nbrzxxmjblClTgs3u1BNPPDFqx2nld9xxR7Bnz54dteNrapdddkk9pv/7v/8L9jnnnFPj8QCxpMWl\nDYC4+jPbEydOTP3e+uKrr74K8u57770XbePxw2Pk/vvvj9ptuummwebz6K/v4cOHB7tPnz7RNpZB\nOCXejzF2j7M87I+dx2Da+QZi6Xns2LHBfumll1L317Zt22gby9lcGdpLnlkLbjYnWFLOqlTPr/la\n8/JL2md4YVMA2GmnnWp9rM2JxYsXp25LKydSLtJKlPg+5/uDr8KcBn/GS6RZv2X+/PnBlrwlhBBC\nCFEimvQIIYQQIhfUu7zlM6rS5C2fScJZTz7zhjOnOHPAfxe7Anl/3p3Kx8FuNr+AKbv4OAvEu/e4\nmjQfK7C6BNeY1HaBRZ8RwHIXL+Dp+4slI5+Jd8IJJwR7zJgxwfYLhHIVZq6M7Bc19AtWpsG/nTOE\n/G9kGcZnlO23337B5n720gtXMK4v1llnnbBwqpdvTzrppGDzQrleSuJzydKPr3rN7aZPn556THxO\nvFTM0gdnNHq3N8uL/Bk/Nlka9dmUacfO/Q7Eldr5eP/nf/4naledjefvN80Nvm632WabYPtKwAyP\nKx+WkFbxmrMBRZytlIXPgCpVZsqC7+kcmuFDG/iewMeRdUz8/PTjPCubi1dxKAfy9AghhBAiF2jS\nI4QQQohcoEmPEEIIIXJBvcf0eN08Ld3cVy5m3djHbXTq1CnYn332WbC9hsztnnrqqWD7NGeujMxp\nu/57+XhZ4/T6ZFrlUSCO96kUstIFWWvNSlmfOnVqsP1545V3zz///GgbV8eujkkBgDlz5kTtOIaK\nY4T8sXM15NNPPz31eBnuo4ULF0bbOCXbx51xyvfxxx8f7MGDB0ftGiJmYcWKFWE1dS73AMSxKhwL\n4yuCcywSj1P+PBCPOR8D5Y+pGh/LxtcEx2FlxfQwPkaIxzofk0+b5lgEH6vE54Z/s499+8Mf/gAg\nruTdHPB9yeOYYzOyUtGzVubm64bvJRzTJeL4xSz8847vhVmrome1Yzj+1l8b3M+lfm/aSu9+fx6O\n4SwH8vQIIYQQIhdo0iOEEEKIXFDv8pavxMrVXVlW2HLLLaN2LG/4NGd2k7IstmjRoqgdS0mcxsoy\nFRDLKrzNV6FMq8TqJSxu5+UXL/00JtXHliVRsAvVpwm//vrrwWZZiBcEBWI58YILLoi23X333TXu\nv1u3blE7Xkj06aefDrav0MlyKkuaXNHZM2jQoGB7yeKoo44K9iGHHBJtO/DAA4P9ve99L9i+6nZD\n9Plnn30Wqp9vtdVW0TaujMyLkvrSAmkp21muZ7+NpQ+2vTubxwi7r/0Y4/HIkpiH7zN8TL4vWHLz\n2zjFnvvMl7io3keWzNMU8dIuw9eDX+SX+yVN9vCvWVpsiJIOTQkfEpKGH1N8H69r9fy0isy+L9MW\nMPXPEh7PLG/5Z2bW2PalNdYWeXqEEEIIkQs06RFCCCFELqh3ectnK7Fbk7dxRVUgdvG1adMm2sZu\naHZX+2h2dsOyDJZViZYzCfyChGmLhZa6AB9QWS7xarekP8Y0vIvzwQcfDDZXEfVyDmd2sbwCxFU/\nWeaozkSq5uyzzw72M888E+zf/OY3UTvuv8suuyzYXt7ibKKsKs5ZmSV8TIxftNRnfdUH66yzTpAg\nXnzxxWgbX3M8/vy1yNWnOWvKj02Wm7OkUXax+2uMX7Pb22dvMewC95XO+X7Bv8NLsnxtehc7HxOP\nYX89V19zF198ceqxNkVefvnl1G1Z0gT3Gbfz1wbfP/hcZy2wmUc4bMDD59dLWPw8zZKLsmBJq0uX\nLsH2C2fzmMiqws33GM6M9vvj4/X7UPaWEEIIIUQd0KRHCCGEELlAkx4hhBBC5IJ6j+nxsB7MurmP\ni5k1a1aw/eqx/JpjenwKH2uI/L1e7+S0WE6x85pp2srQvrIt4/VJjjdoTD766COMGzcOAHDjjTdG\n2zhmIy32wm/jUgQ+5ZJjo/yKuRx/Mnr06GBzWQIPx2dlrVrPuvFOO+0UbWP9e9999w02XzMAcOed\ndwb7pz/9abRt6623Dvb2228fbJ/6+6c//Sn1GMtF9+7dcd111wWb4XINnNrtY3o4NoOvUx8Dx33t\nxy1fI3x9+NR2Pv88bn1MDx9j2r6ztvkV17NWj+e2vKo4Xx/MtddeW+P7TZVSY2t8nzNZac58D+Y+\nT4uVzCu+zAtfz3wO/ZjidllV9nmbb8fjiO/VWWUrssYl35/33HPPYP/73/+O2vE15Z/PPv5nbZGn\nRwghhBC5QJMeIYQQQuSCepe3vOSUlnbqF5jcZZddgt23b99oG0tLLEH51DZ2tXH6pE+lZJchu+O8\n649d7+zu8ymcfEzehZ7lGm5IWrZsGVKpTznllGgbn8fly5cH25cf4Necwujb8fn41a9+FW3j88hp\n0lyBGYjTwFmiOO+886J2vDBplgz229/+Nthcmdgv1snXgN/GsihX9fbXa0P0eYsWLYI0d/nll9f7\n94nmh5eZSpVV+PrOWnCUYUmkUu6JlYLvB35msuzco0ePqB3LzuPHj4+2de3aNdgcOpDVR1nbGO5z\nHwLhV1OoxocRsITlJbKsshh1QZ4eIYQQQuQCTXqEEEIIkQvqXd7yEhG7rth95rMsfvSjHwV73rx5\n0bbJkycHm+WMGTNmRO14oUvev3eXsQzCrkRegBEAvv/97wd75513DraXTvxxMN7911iss846QZLZ\nbbfdGvloGh7OFBNCrJ69lbaQqL9/pskgWVm3vD+fqZmVaZsHvLzFMjpnUg4ePDhqx89WX5Wdn8NZ\nshW3y5Id0xYc9fvm/bGk1adPn6jdE088EWxfAT4rc6wuVMYTWAghhBCintGkRwghhBC5QJMeIYQQ\nQuSCBk9ZZ1gb3nXXXVPb+YrHaRWQR4wYkbqPLA2ZNdO6wHFFQPZvLrc+KYQQ5cCvSM/V6bMq/HJJ\niqzYEY7pyYpt5LiVTp06ZRxx88THTKXFNXGFYyBexcBTaiwp9xmnwPuVBOpSZoDLkPi4HY7p8cea\nde3VBXl6hBBCCJELNOkRQgghRC6od3mLXaRAuvSTlZro3X2lulMZdpnVVc5K+y52A/rj83KWX+RR\nCCEqAV89t9R08bQFLP2CzWn78xXteRHpPMpb/pmZVpH4sMMOi15PnTo1dZ9pFbW9lJS2YKx/bvHn\nuF3WQtG8osHuu+8ebbviiiuC7Z/jXKm/HMjTI4QQQohcoEmPEEIIIXKBJj1CCCGEyAX1HtNTVVUV\nvWZtkTVfv7JqqbD+51PbSl0ltlRY7+Rj9zE9rGv6bXksqy6EqHw+/fTT6HX1MjVAHFfiY0z4vsip\nzD42he+ZHCOy5ZZbZh5H3uDYF0/r1q2D7dO+P/roo2D7Zx/3UallU3g5DB/7w9dA1jIUDMfm+GuD\nrxt/fFplXQghhBCiDmjSI4QQQohcUO/yVlZ1SXZpde7cea2/q1Q5K0sGy0qBT5O3fAo8S3i+cqWX\nu4QQohJ4/vnno9dp9yp/v+PXLM14KT/tPuvlrFdeeSXYgwYNWtNhNztYVgTiUgJZ0h+fby8fpVXD\n9mUFuI/4eeefhfya9+fDVDbccMNgc8VvX/2b8cfOlZzLgTw9QgghhMgFmvQIIYQQIhfUu7zlo745\nIpwrb2ZFaPtobu+Sqy1ZMlhdMr68S49/i3fxetelEEJUAqeffnr0mqvkcrYV38MB4O233w52u3bt\ngu0rLbP0lbWY5aabblqbw252PProo9FrzoD+5JNPUj/32muvlbT/rEw8lh35ueafi/wM5hCOrMVB\np0+fHuxf//rXqd9b38jTI4QQQohcoEmPEEIIIXKBJj1CCCGEyAX1HtNz0kknRa8nTZoUbI7pGTp0\naOo+6lqtudz4+KRqfLo9v/bH3rZt27IflxBCrC2XXnpp9HrAgAHBnj17drB9XEmfPn2CPXjw4GD7\nWJ1WrVoFm9PSjznmmLodcE7wlZfT4JgpThUH4thStn3cFcfW8D6yYn8Y345jt/r27Zt67A2JPD1C\nCCGEyAWa9AghhBAiF1htUsXM7B0AC+vvcEQN9EiSpEO5d6q+bDTUn80H9WXzouz9qb5sNFL7slaT\nHiGEEEKIporkLSGEEELkAk16hBBCCJELmvykxwybm+FOM7xuhtlmeNQMfdb8yWgfbc1wRn0do1gd\nM1xohllmmG6GqWbYqQz7fMYMw9a2jaglZhfCbBbMpsNsKszWui9h9gzMsvuplDaizpjhq+LYnGWG\naWY416zpPzPyTt6fmZVRAKeOmMEA3A/g1iTB0cX3BgPoBGBuLXbVFsAZAEaW+RBFDZhhOICDAWyf\nJPjMDO0BrL+Gj4lKxCz0JZLkM5ipL5sPnyQJBgOAGToCuANAGwAXcyMzrJsk+HL1j4tKQ8/Mpu/p\n2RPAF0mCG6vfSBJMBfCcGa42w0wzzDDDUQBghtZmeNIMk4vvH1b82O8AbFX8q+bqBv8V+aMzgKok\nwWcAkCSoShK8ZYaLzPBSsd/+XByg1d6ZK80wwQxzzbBb8f2Wxb9YppvhLgAtq7/ADDeYYWLxr9Tf\nNMaPzAmdAVQhST4DACRJFZLkLZhdBLOXYDYTZn8OKxYWvDNXwmwCzObCbLfi+y1hdmfRWxT1Jcxu\ngNnEojdJfdkIJAmWAfghgDPNYGY40Qz3mOFhAI+bYSMz3Fwcv1Oq761m2K44bqcWx+nWxbb/LnqP\nZlbfn0WDoGdmkiRN9h+QnAUk/1vD+0cAyX+BpAWQdAKSN4CkM5CsCySbFNu0B5LXgMSApCeQzGzs\n35OXf0DSGkimAslcIBkJJCOK77ejNrcBySFF+xkguaZoHwQkTxTtc4Hk5qI9EEi+BJJhvK/iNfAM\nkAykfQ1r7HPQbP4BrRNgagLMTYCRCTCi+H47anNbAhxStJ9JgGuK9kEJ8ETRPjcBbi7aAxPgywQY\nFu0LaFH8/EDal/qynv4ByYc1vLe8eE89EUjepHF2OZAcV7TbFsf2RkByHZAcW3x/fSBpWbw//4X2\n2aaxf2te/umZmTR5T08auwL4V5LgqyTBUgBjAOwAwABcbobpAJ4A0BUFt55oQJIEHwIYisJfju8A\nuMsMJwLY0wzjzTADwF4AtqOPjSr+PwlAz6K9O4B/Fvc5HcB0av9dM0wGMKW4n3718mPyTpKs1pcw\nOxHAnjAbD7Na9yWSZLW+hJn6sjIwsv+bJHivaO8H4OdmmArgGQAbAugOYByAX5rhAgA9kgSfAJgB\nYJ+i93a3JMHKBjt6kUZunplNOqYHwCwAR9bwvtXwHgAcC6ADgKFJgi/MsACFwSkamCTBVyjcHJ8p\nTnJOAzAQwLAkwSIzXIK4bz4r/v8V4ut2tUJTZtgSwM8A7JAkWG6GW6B+rj+SJPRlcZIT+hJJsghm\nl6COfQmz0JdIkuUwuwXqy0bBDL1Q6LNlxbc+4s0AjkgSvOI+NscM4wF8E8BjZjglSfCUGYYCOAjA\nFWZ4PElwKURDkPtnZlP39DwFYAMznFr9hhl2ALAcwFFmaGGGDij8FTkBhSC8ZcXO2xNAj+LHPgCw\nMUSDYIZtzLA1vTUYCDfLKjO0Rs0D0/MsCoMSZuiPwoMWADZB4Ya80gydABxYjuMWNWC2DcxS+xJm\nte5LmNXYlzBTXzYSxfvojQCuT5IaJqfAYwB+QnF4Q4r/9wIwL0lwLYCHAAw0QxcAHycJ/gng9wC2\nb4jfIADomdm0PT1JgsQMhwP4oxl+DuBTAAsAnA2gNYBpKPz1+P+SBEvMcDuAh80wEcBUAC8X9/Ou\nGZ43w0wAo5ME5zf4j8kXrQFcZ4a2AL4E8BoK8sgKFFzfCwC8VMJ+bgDw96LrdSoKgxRJgmlmmILC\nXzXzADxf1qMXTGsA18GsLcrQlzCL+hJJMg1m6svGoWVRrloPhb69DcAfUtpeBuCPAKYXJz4LUMjq\nOwrAcWb4AsASAJeiIJtcbYavAXwB4Ef19xMEo2emlqEQQgghRE5o6vKWEEIIIURJaNIjhBBCiFyg\nSY8QQgghcoEmPUIIIYTIBZr0CCGEECIXaNIjhBBCiFxQqzo97du3T3r27FlPh1K/fPnlqkWAW7Ro\nEW2rXgvR49P509rVJwsWLEBVVVXZv7ix+vKrr76KXi9fvjzYn332WbDXXz9eqJv7gm3+DAB8+umn\nwV5nnVVzet/nnTqtqqTeqlWrko69HEyaNKkqSZIO5d5vpY/NFStWRK832mijYH/00arCvn7Mff31\n18Hm/tx0003LfIS1p7mNzbxTH2Ozsfry448/jl4vWrQo2G3btg12u3btonbrrbdejfv7/PPPo9dL\nly4NNt+Du3fvHrXz9/GGIqsvazXp6dmzJyZOnFieo2pg3nvvvWD7h9yGG66qqs03XZ4oAekXRH0y\nbNiwetlvY/XlypXxMjv33HNPsBcsWBDsbt26Re2++OKLGu25c+dG7fj1xhuvKhjq+/xnP/tZsLff\nvuEKwprZwvrYb6WPzQceeCB6veOOOwZ7woQJwfY3V57EbrDBBsE+6qjGX5i7uY3NvFMfY7Ox+tJ/\nJ9/vDjnkkGAfffTRUbsuXbrUuL833ngjev3HP/4x2K+99lqwr7vuuqhdY03es/qySVRkvuqqq6LX\nV155ZbA7d+4c7IUL49/ZunXrYPPN0//Vuckmm9Ro88MVAPbYY49g33nnnSUcuQCAadOmBfuHP/xh\ntI3/AmHvy7HHHhu1GzNmTLAfeeSRYJ900klRO35ocj/zXzcAcO655wb7lFNOCfYxxxwTtfMeouYM\nT/jZw+I9nOxx8ey6667B5r/6nnvuuajduuuuuvXw9/JfkADwySefBPvAA1etQHHHHXdE7S69dNXS\nTYMGDQq2H8Pcn1m/Q4hKhK/n66+/Ptr2n//8J9j+WciTmWuuuSbYv/3tb6N27EHlMfr2229H7bbd\ndttg8x+Ue+65Z9Ru5513Dva3vvWtYDfmHy0a9UIIIYTIBZr0CCGEECIXaNIjhBBCiFzQJGJ6WFsE\ngP322y/YHBPCMTxAHDTLMT3t27eP2nXt2jXYO+20U7Dnz58ftWvIgNemxkMPPRTsZ555JtrGsToc\nwAoAm222WbA544DjgIA4VmSvvfYKNgcrA/G18uGHHwbbZylwLNhjjz0W7GeffTZq179//2CfddZZ\naM5w7E6psUyceQUAL720akH1li1bBvuggw6K2j3//KrF0t98881g8/kG4uBljunxMUIcBM8xPY2R\nfCBEOZk8eXKwL7vssmC/++67UTu+9/ksKh7b2223XbD5HgnE8Tk8djbffPOoHSf/cMJP3759o3ZV\nVVXBHjlyZLB9LNHYsWOD3aZNG9Qn8vQIIYQQIhdo0iOEEEKIXNAk5C0uYAfE7i+WsHxBJq7Ns+WW\nW9b4GSBOi+V9DBgwIGrXkEXsmgJcY4ddsCxnAXEhOpY8AGCHHXYIdocOq2pJsRwJxEUNOS192bJl\nUTuWRzgl2deZYNcwp1/6Yofz5s0L9n//+99o27777oumRlqxPw+nqM6ePTvaxv35r3/9K9r2j3/8\nI9icJst9C8T1O37xi18E20vZLF9yiu5TTz0VteP+5WMYOHBg1G7w4MEQoinx4x//ONgsOflwDr53\n+RpzDEtdLB8D8fMvS+7m7+J2vs4WHy+XDfH7O/7444PNoRL1gTw9QgghhMgFmvQIIYQQIhc0CXnL\nV4RlaSKrcizLYJw15CURXrrg9ddfr/F7AKBPnz61OexmD2fgdOzYMdh+vRWWJ737k9u+//77weas\nLiCWxfgzWW5Xlsh8NgPLJrwPL8OwS7Y5yFtZktbZZ58dbO6nXXbZJWrHlV99P3F1Zc742HrrraN2\nPFZZcuKK6AAwevToGvf3wQcfRO1Yoj7hhBOC7bP7uGLsrbfeGm3zVbuFaAymTp0avWaJiOXfrBCA\nUrMvfTvOymL8s5W/K+uews9nfg74UBHOIvNyer9+/VL3Xxfk6RFCCCFELtCkRwghhBC5QJMeIYQQ\nQuSCJhHT49OIfcp5NbxCLBBrni+//HKwvc7v4wiq4fgEYPV4lLzBcRNAHFvDGq+PmWLdmGN/gDgm\nhGOwvNacFm/BK6kDsYbM+D7m13y8Xif38UkMX4f1XUW0PnjwwQej15yez9Wnzz///KgdlyTg6q5A\nHJfFcTz+mjj22GODfcUVVwTbr7L+ox/9KNhDhgwJdu/evaN2HItw++23B3vMmDFRO16lmis8A8Aj\njzwSbB+r1FzgqrgAcMYZZzTSkdQvfO/28SiVjq/8zuOSSz34exPfM/2zK+0Zx+PBv+bz5uN2OLaI\n77n+Wc3PTI5N4ueFP95777032nbRRRfVeOx1RZ4eIYQQQuQCTXqEEEIIkQuahLzFFWCBOL2N3XZe\n2mAZi6UZL2+lpen5StA777xziUfcPJkzZ070ml2X7Fr1i9jxYp/ezZomafnKvOw25b707lTvNq3p\nWP33Ml4uY5nDS24zZswINi+I2lTwv5UrmF955ZXB5sVZgbjqtV+Ul88JLwz78MMPR+3OOeecYHMq\nupciWEr77ne/G2wvl/GipVxKYMqUKVG72267Ldi+JAXLfSeffDKaEr4CL4+fp59+Otjcx36bv75Z\nnuQx7Kv4ViJNTdJiuBo6EC/qyxWTs6T3rMW3S12km++tPsWc76e8gLevxu9ltmp8qATv3y9YLXlL\nCCGEEKIOaNIjhBBCiFzQJOQtdq0CsWusR48ewfauNJa72I3nF59Mk1W8TLPFFlvU5rCbHQsXLoxe\ns+zBFZO9bJC26JyHpSl/7nn/3F9e+kyrXlpVVZX6vXxM/tj5mHzmmT/GpoaXrRjua79IJ0tJ/rxy\n1eojjjgi2E8++WTU7swzz6xx/74/L7nkkmDzfcDLmHfffXewx40bhzRGjBgR7K222ira9tprr6V+\nrhLh+5uXg3kbL8DqZZ+ZM2cG2597zqRjWcVLIgzfg/39OE1y8u/zb2GbMwP9/rt27Rpt42Nk6W/o\n0KFRux/84Ac1HlNDwwvoetmZQwL4fPjzxudj1qxZ0TaWJPmZ2a1bt6gdZ6+yhOzDAfj+wOEi48eP\nj9rx2OZ7sw9L4Owwf31NmzYt2IMGDcLaIk+PEEIIIXKBJj1CCCGEyAWa9AghhBAiFzSJmB7WAoFY\nh+SVz72GPHHixBptTrED4uqurF37KpR5X4XZp8VyXEWvXr2C/dJLL0Xt3nnnnWB36dIl2sapjxyz\n4VMk+bt4hW2ftsl6OKeb+77jeBzexyuvvBK122OPPYLt0yyz4oSaAq+//nr0mnV2jqHyMXAcS7B4\n8eJoG8cLcHzPv//976gdj2mOF/DXB4/vyy67LNh8TXm+/e1vB3uHHXaItn3/+98P9je/+c1o26GH\nHpq6z0oka3VrXmme49T8OOC+9CvX87jgdj4eIy0tOavqLpO1IjjfH3xJEr6P++uQKwtzSZK33nor\nalcpMT1cOsBXhef7Ip8r3/8cx+Njl/gc+Ocpw+eNrxW/2gFfD1zKxcf+PPfcc8EePnx4sH0MGvez\nj5V84okngq2YHiGEEEKIEtGkRwghhBC5oEnIW97lzVUf2W3n3ansoj/44IODPXbs2KgdL5rI7kNe\n6M1/bx7xbm0+V9xHPuVwwoQJwfZyA6ehsjs9y+XN+ErL/DmWKj0sn3Fqpq8Y7WU2xle3bWr4ccDj\n56CDDgr2fffdF7XjccXVYoG470eNGhVs3+8M9yFXggbi/mBJa88994za8cKZRx55ZLCvuuqqqB1L\nZFOnTo22Pf7446nH2FikSUJALDEsWLAg2sbVlFny9bICS7ZevuWxxDb3vz9Glkf8d6V9Jqt6Ml+T\n/p7A49un2/N1yc8IL5H5MdBY8HXpnzNXX311sF988cVgc8VzIJbzvYTFstDf/va3YPtznyYz+WPi\nezWXvpg9e3bUjqutc1/6hYV5f37M7r///ign8vQIIYQQIhdo0iOEEEKIXKBJjxBCCCFyQZOI6fFp\ncKwpczqf12tZyz7uuOOCzSW/gVhPZG3Yr8buU93zhk8TZg2Z42L8eeN+8Ss0c5osxxT4uAGOrWF9\n2Zem53YcU+Dje3gphZ49ewbbp0uyDu3je/xK302Nf/3rX9Hrww47LNinnnpqsDk2BwDmzp0b7C23\n3DLaxn3I18eFF14YtePV2DlOwceV8PIC3/nOd4J97rnnRu04LfvOO+8Mto/b4XTggQMHRts6dOiA\nSsPH9HCaMo8dH9PDY4Tvg77EA8dw+BRojs/JirHjUhZZMTgcP8K2/15+zfcLH3/Cx+fjORn+jb7s\nhk/TrwR4iRb/muNxfve730XteCxyiRYgjl3ie6tfvoO3Zd0/eTX2Rx99NNg+7pNjy3j1+FNOOSVq\nd95556GhkKdHCCGEELlAkx4hhBBC5IImIW95lyy73dj96eUtdg1zWp13k/I+2GXq8ZJL3vArkPMq\n1SwX7b777lE7diF7uYFX4WUXtZfB+NyzC9X3edpK7b7vuB1v8+34u7y8VWpafVNhxx13DDZX2Pbp\npXwe+vbtG217/vnng82SMktdQFyplfvpwAMPjNpxFVe+/lhiA4BJkyYFm+Uy345T3bt37x5t49XZ\n+XsbE399s/TDpSC4qjwAjBkzJth8H/QyEF/fPsU8bUXvrBWy+TN+fHC7UqWptM8D8Vj193R+ZnBl\n4enTp0ftvNzVWGStTs+/mytIe3mL77NeduYxxjKuL/nB1xtLzf5+zMeYdQ3Nnz8/2B07dgx2lpzl\nr4dy32fl6RFCCCFELtCkRwghhBC5oEnKW+xaYxenz6Zhl5x39zFpVXzr283W1PDng6vv8rnv169f\n1O6xxx4LdqnZEr6iKPclZ2xx9h4Qu2uz3Ob+mqrGS1h8vD4rLW0fTQVeHBSIF1flRWN9n3E2yZQp\nU1L3yZLLHXfcEbVj+YizIsePHx+146rXnMXp++nvf/97sNkt79uxDOKz1zizq1LkrayKzJwNs/fe\ne0fb0ioje7mMqxV7qYfvd3zevITB8ktWpeWsfaS149/hjz1r8U1+zfvLknMaEz7GrD5nsqQ5f+/j\nccD3T98PfBy8f19lnzMC+f6QtTBpqdT3c1aeHiGEEELkAk16hBBCCJELNOkRQgghRC5oEjE9vhIy\na+8c0+PTzTnmgiuU+lgMTrlLW1kYqJs+2ZzwOnFazIzXf7lcgI8PSYuL8Vo7x1rxtqxV0FmT9n2Z\nVn7AVyh9+eWXg+2PneMDOPbExxlVEpy66ldp5pWYf/zjHwebU02BeOVkn8p6wAEHBJvPiV+Nncsa\nPPTQQ8HmFFcAePXVV4N91FFHBfu1116L2i1fvjzYJ554YrA5fR2IY/s4LR8AbrrppmCff/75webS\nDA1NXe85afEePvYlKy6Et/nPMTwe2S51lfWsfafF5vh9ZG0rdSX5SqHUPvf3zqzPcV9mxQ9xO+4/\nP375Ppl1DfH9wT8X0vDHVO7nrjw9QgghhMgFmvQIIYQQIhc0CXnLpy+z1MEyhV8UjStxMptsskn0\nmmUxlku8654XI80jWSmtLDlmSU6+j1gi431kyVHsovYSG18rvM2nqjKcftmtW7doW1bKOh8T76OS\n5S2Wi/w55uv71ltvDfYxxxwTtauqqgo2LwIKxFIQS1D7779/1I7Tz7nPdtlll6jd6NGjazxeX56C\n++Kcc84JNlewBYA5c+YE+6yzzoq2cfVm/l6/AGSlwPc3L+2zJMnyvb+XpoUKAOlVgrPkBpZE/P2C\npa9ySBZ8TF5KS5PZ/OLV/h5fCZRD3slKxef9Z0lTaVWXgfhesc0226R+L9+DS03Fr+8wEnl6hBBC\nCJELNOkRQgghRC5oEvKWd0mmReZ7F5yXsarxLl4mawHTPMLZPh7OHuDKrj5Kn12cnEXnSVvszsMZ\nHT6jiiUolkOy9uevG4aP3UtpfK2UumhiY8OyEEs9WXz729+OXrOUx/0OxBlb7Kb2Y5GlGc4M4Qwy\nIL6WOAuQFywF4kqwLPX4Y3/44YeD7d3tAwcODDb/xsag+tiyMptGjBgR7HvuuSfaxuOA+yFLfsha\nSDRLcuB2adWZ/Wsem16aSpPSshbi9Pd0Hu9p31vTdzcl/LVRqnzElPqM8/2fJpn60Aau/lyqbKXs\nLSGEEEKIMqBJjxBCCCFygSY9QgghhMgFTULQ9Dosp9mxJunTD0tdBTstboPTOfPKu+++G+ws/Tur\nsqlPL077HGvUWX2XdRwcH1Lq8WatJpyV0poVC1SpdO/ePdhez580aVKwhwwZEuxDDjkkavfAAw8E\ne+nSpdE2Tvvmc+nj8hhOf/VVkjmOp0+fPsGeMGFC1I5/y+233x7sM844I2rHafS+jAGnefvYooam\nOo7BxzPw/Ynjdvr27Ru1O/bYY4PNVah97Fla3CMQj8FSY3oYn0bP55v3V46YDR9bxuOWU/H9eK7E\nisylUtdj57GSFQfE/ZIVP8T92hTOpzw9QgghhMgFmvQIIYQQIhc0CXnLu9Y222yzYHu3JpMmb3i3\nPrdjl2wlVutsTLzrks9jVhmARYsWBbtnz57RtnKnetclBZVlzLQq3sDq6fEsh/hK05XKO++8E+xx\n48ZF28aPHx9sPid+HKxcuTLYo0aNirax5MeyAldtBYBnnnkm2HzOv/Wtb0XtOHX89ddfD/bgwYOj\ndiylsSTrK2wvXrw42L4cA1eN9tdpQ1MtH3jpJ00+2H777aPXBx98cLBZCvQVePm69RXHeVupEhQf\nnx/bfL/ge3pdS4OwrOLHH29jmW7JkiWpx9vUKPX81hWWsHwfpZUzyJLLKqUEjDw9QgghhMgFmvQI\nIYQQIhc0CXnLSycsYbC85bNp0lyyXgJh1zi77bIWqcwj3hXMLuUsKZCro/rsqFIph7s2DXYT+6yx\nLbbYItgzZsxI3UdTuVb42vfn9G9/+1uw33777dR98G/lbCi/f66SzNIUEGd5sYTlF/Xt0KFDsFma\n8tlKLHc98sgjqcfO8vW///3vaNuf/vSnYLdv3z51Hw3B2mY0ceVzrorLVZwBYP78+cF+8803o218\n7rOqm6dVJvdyRqlZQ0xWlldWhicfb1b156yK85WOP4dZ8lbaufcS2drKfb7PeX+Vco+Up0cIIYQQ\nuUCTHiGEEELkAk16hBBCCJELmkRMj4e1XE7BXbhwYdSuS5cuNX7e65bz5s0LNqfZshaeV1i/9+eN\nX2fF9HCabFZcEOvLpeq//nvTUuB9HAJr3hzn4WNZslLxmaZSnZnHTteuXaNtU6ZMCbZPMWeWL18e\nbH++uSI2r1r+wgsvRO34PA8fPjzYPk6BxyDHX/ixzeOWx7OHv9dXieaYrUMPPTR1Hw1JqTEXnKYP\nxLErfA1z1W0A2GGHHYLt0/unT58ebO4XH3fFx8jtfFwJH1NWOnTab8yKN/HnKe1+5EsR9OjRI3Wf\nlU7WauQ+/onPMW/z5zStL7P2l1V+gD9XKWU95OkRQgghRC7QpEcIIYQQuaBJylu8gOVDDz0UbHat\nA+lpn34hPHbjsluUU27zCktTWYv1ZVVWZhnBt0srEeAXKeXj4FRaLz9x32ZV62b4uuFqw0DpFZ4r\nJR1zTXCl5Xbt2kXbWLbYZZddUvfBKeZVVVXRNk7x5/PvF+/lVGm+PvzY5NT2V199Ndhbb7111O62\n224LNqesjxw5MmrHY9pLQizB7bXXXjV+pqHJknS4ujAvzArEC45y+rrvr/vvvz/YfpHV3r17B3vB\nggXB9ueNKznzeMySt3hbbWQaJkt+ScMfU6mLUjckpabze0qV/0pNSy/13JeKrwbeWMjTI4QQQohc\noEmPEEIIIXJBk5S3OBOHMzq8+yzNneZlD3bXsuvPZ7fkEZ/1xHA0fpbLlDMm/KKGDLuas/bH/Veq\nhOWzCvi7uP99xWg+9mnTpkXb2GVc7oVT6wt273PGExDLhqecckrqPqZOnRpslkAAoE+fPsHm8ePd\n4bzIKEtdb7zxRtTuscceCzZXSfYSC8vcrVq1Sj32IUOGBPuBBx6ItnHF50pxxftswv/85z/BZknV\nV7y+4IILgr106dJg+8wrliP9d/E5ZinUnxuW2bIkpzR5q67Vp7OkZ75/cKYfZygClZNRxJR6PrwU\nzNRVMkzrl6wKz1nwcVSKlChPjxBCCCFygSY9QgghhMgFmvQIIYQQIhc0yZgeJq3SJJAeZ+H1Tq6m\nyzpxlmaaFz744INge02Wz09W5WKOsfAxBd27dw826+s+BZw1Zd6Hj0vhY8xaUTwt7sP/Ro49yYoV\naSoxPRwXw7EYAHDSSScFmyv1ejiOzqe2DxgwINjct2PHjo3a7bzzzsHmeJQnnngiasfxKFy5mVeE\nB+IxnRWnwauz+7ggLpMwe/bsYPuVyeubjz/+OMSejB8/Ptp2wAEHBNtXF2ZOOOGEYPPq6b7KPKfj\nc2wVALz88svB5nivoUOHRu3OPPPMYPN49PGAfH/m/qprinbWPjimh+9T/trYcsst6/TdlUBWXI3f\nlnUvZEq9j2XtI41Sq9vXN/L0CCGEECIXaNIjhBBCiFzQJOUtlhnYRe1dbmlubi9hsEuPqw5Xijuu\nMWGXvz9vaVWSPSwpDB48ONrWv3//ko6DXfScZlsOtt9++2B79y7/fr+oKLetlBTnNcHVjzfffPNo\nG/dhVskA/pyXB7iv+Xrx106/fv2CzQuEzpw5M2rHlZHfeuutYM+ZMydqx32RVWaBqw57SYSPgxcy\nbmhatWoVUus5xb42HHzwweU8pDrhpWex9nD1eD/2ssZAmhzlQ0JKrdacRtZ9sK6lCcqNPD1CCCGE\nyAWa9AghhBAiF2jSI4QQQohc0CRjejj9jjVIr9GnlSn3adOshXKqX6krbOcFrwtzjAundns4tfa9\n995LbcfbvLbMMRwc05UVn5V1fPw5Xmma43uAOH7Ix/usrf7dGHB6+CabbBJt4/i4LP09rcQ/EMfE\nPf3008F+7bXXonb77LNPsHnc9urVK2rH/Xb99dcH26dec6xDVpwY96e/D/Bv5tiiI488MnV/QjQW\nPi2dX/t7dV3iU3l/WWUF0koReCplyQ95eoQQQgiRCzTpEUIIIUQuaJL6TZq84eWotOqSPq2OXX9s\nZ6W+5gVO2fbng92fWZU8d9xxx2Cffvrp0TaWM7JS0VnmYHnFr4peKpdddlmwR40aFWy/CjOvPM3V\ngQGgc+fOwS511eHGZu7cucHebrvtUtstWrQo2N26dYu28RgZPXp0tI3PCa+YzpV/gTg9nis8e3mR\n5TiuGLzhhhtG7XgbX7MeluN8SjX3L6fUC1EpZK18zvdgPz4YlqD8s5DvrWxn3d/5OPwx8f61yroQ\nQgghRAOiSY8QQgghckGTlLdmzZoVbHaNe9Ikh6qqqui1zwSphiu05hXOxvHnml2eHTt2TN0HuzV5\ncUIA+NWvfhXsU045JdheqlywYEGwV6xYUePxAXHV3oULFwbbS1O8fy9pMbzwqa+Ayt/tqzVXKpwh\nxwtKAvHvOfnkk1P3ceuttwb70ksvjbZNnjw52Hy+/OKYY8aMqfGYuPI2EMtsLHHuvffeUbvDDjss\n2KeeemrqsTM+o4UlLa5cLURDkpUBxds+/fTTaBvfq/w++FrPynjOkqoYlq2yvpfb8X27MZGnRwgh\nhBC5QJMeIYQQQuQCTXqEEEIIkQuaZEzPyJEjg/3YY48F28d3nHDCCTV+/qqrrope33XXXcHmOJUj\njjhirY6zOdC1a9dgc3wFEKcns52FT0m+5ZZbgs3xHD49nmOGOI3ZV/lkvbpHjx7BPuCAA6J2vNp2\nFmmp8v67s2KaKolzzz23Rrs2cDrs5ZdfvtbHdN111631PuqCj2kSohLIqobOVcQ333zzaNsXX3wR\nbF/Kg0tvZJViSUtn9zGLad/lU+U33njjYPvq7Y2FPD1CCCGEyAWa9AghhBAiF1hWetxqjc3eAbBw\njQ1FOemRJEmHNTerHerLRkP92XxQXzYvyt6f6stGI7UvazXpEUIIIYRoqkjeEkIIIUQu0KRHCCGE\nELmgoic9ZtjMDFOL/5aYYTG9rowlW0VJmOGrYr/NNMM9Zmi1hvbPmGFY0V5ghvYNc6RijZh9BbOp\nMJsJs3tgltmXMHsGZsOK9gKYqS+bIDSGp5lhshl2aexjyit1fTaaoacZZqZsu9QM+6RsO9EMXdx7\nx5jhQjPs0ZSuhYqe9CQJ3k0SDE4SDAZwI4D/rX6dJPjcrGHrDJmhxZpbiRQ+KfZbfwCfAzi9sQ8I\nAMxgZpU9DiqQT5Akg5EkFdWXMDOYqS/rj+oxPAjALwBc0dgHlFfW9Gys4z4vShI84d8vPvdOBOAL\n7RwA4D8A9gA06ak3zHCLGf5ghqcBXGmGwWZ40QzTzXC/GTYttmNPQXszLCja25lhQnFGPN0MWxff\nP47ev6l6gmOGD4sz4PEAhjfKj25+jAXQu/gXwiPVb5rhejOcmPVBM5xb9BbNNMPZxfeuNMMZ1OYS\nM5xXtM83w0vFvv5N8b2eZphjhpEAJgPoVvZfmB/GAugNsz1gFvoSZtfD7MTMT5qdW/QWzYTZ2cX3\nroTZGdTmEpidV7TPh9lLMJsOs98U3+sJszkwU182LJsAWA4AZmhthieL3p8ZZgirv5rh12Z42Qz/\nNcO/zPCzRjvinJH2rAPQwgx/McMsMzxuhpbF9reY4ciivcAMF5nhOQDHABgG4PbivlqawQAMBvAe\nCn/0nFPctpsZehSvh+nF/7vT/m80w1gzzDXDwQ18SgA0wUlPkT4A9kkSnAfgHwAuSBIMBDADwMVr\n+OzpAP5UnCEPA/CmGbYFcBSAbxTf/wrAscX2GwGYmSTYKUnwXNl/Sc4oeucORKGvavvZoQBOArAT\ngJ0BnGqGIQDuRKH/qvkugHvMsB+ArQHsiMIAHWqG3YtttgHwjyTBkCRRSmmdMKtzX8Jstb6EWWpf\nwmy1voRZ1JdIkiFIEvVl/dGy+GB7GcBfAVxWfP9TAIcnCbYHsCeAa4oe1GEAjgAwBMC3UbjfioZj\ntWdd8f2tAfxfkmA7ACtQ6KOa+DRJsGuS4J8AJgI4tuhJ+gSFPp2WJJiP2NM0FsD1KNxbBwK4HcC1\ntM+eAEYA+CaAG80Ql3BuAJrqpOeeJMFXZmgDoG2SYEzx/VuB8FBLYxyAX5rhAgA9ih24N4ChAF4y\nw9Ti617F9l8BuK/cPyCHtCye24kA3gDwtzrsY1cA9ycJPkoSfAhgFIDdkgRTAHQ0QxczDAKwPEnw\nBoD9iv+moOAF6AuEv3YWJgleXKtflF9awmwqytCXSJKPkCShL5EkUwB0hFkXmA0CsBxJssa+RJKo\nL+ufanmrLwrSxj+Kf/EbgMvNMB3AEwC6AuiEQh8/mCT4JEnwAYCHG+vAc0pNzzoAmJ8kmFq0J6Ew\nEamJu1LeBwr9Pzpl23AAdxTt21C4Dqq5O0nwdZLgVQDzUBjHDUqTXHsLwEcltPkSqyZ1YTaZJLij\nKFV9E8BjZjgFhUF7a5LgFzXs59MkwVc1vC9qxyfFvzgCZlEfAVjjrD99URrgXgBHAtgcBW9Bdfsr\nkgQ3ue/tidKuIVEzhZgexqxB+hJJEvUlzHpCfdngJAnGWSG5oAOAg4r/D00SfFEMJdgQ2X0syowZ\nDscqpeOUlGfdPAC8kNZXQEHeqoGscbUf0j1EniTFrul1vdNUPT0AgCTBSgDLzbBb8a3jgeD1WYCC\n9wYo3EABAGboBWBekuBaAA8BGAjgSQBHmqFjsU07M6xarVLUFwsB9DPDBkWv3d5raP8sgG+ZoZUZ\nNgJwOAoxJUDh4Xg0Cn19b/G9xwCcbIbWAGCGrtV9LMrOQgD9YLYBzEruS5i1glnJfQmzwuqGZl1h\npr5sJMzQF0ALAO8CaANgWXHCsycQ7p3PATjEDBsWx+A3G+do80GS4H4KZp6Y8qyrKx8A2BgAivfq\ndZME7/ptRV5AYfwChTARDgv5jhnWMcNWKKgpr6zFMdWJpurpYU5AQRtshcIs9qTi+78HcLcZjgfw\nFLU/CsBxZvgCwBIAlyYJ3jPDrwA8boVMni8A/BgqH16vJAkWmeFuANMBvIqCdJHVfrIZbgEwofjW\nX4vSFpIEs8ywMYDFSYK3i+89XozXGmeFvzk/BHAcIM9d2UmSRTAruS+RJJNhdguoL4vSFpAks2C2\nMYDFSJK3i+89DrNtAYxDoTPVlw1PtUQNFLw4JxTDDG4H8LAZJgKYCuBlAEgSvGSGhwBMQ+FeOhHA\nygY/6vyy2rMOhQD0unALCs/ZTwBcA0RZXg8DuLcYwP4TAGcBuNkM5wN4B6ueyUBhkjMGBfnz9CTB\np3U8njqjZSiEEELUC2ZonST4sPhH6bMAfpgkmNzYxyXqjhn+isIfnLWKoyv+wfpIkgTvbaPQHDw9\nQgghKpM/m6EfCjE+t2rC0/RJEpzS2MewNsjTI4QQQohc0KQDmYUQQgghSkWTHiGEEELkAk16hBBC\nCJELNOkRQgghRC6oVfZW+/btk549e9bTocR88cUXwX7jjTeibR9//HGNn2nRIl4Eff311w/2Rx+t\nKi7ZunXrqN3XX3+9xn0DwKabbhrs7t27p7YrJwsWLEBVVVXZK5s2ZF+KVUyaNKkqSZIO5d6v+rPh\n0dhsXtTH2KyUvly0aFGwv/rqqxptAPjss1XFmtddd9X0gJ+l/nNmq4ZAr169UAlk9WWtJj09e/bE\nxIkTy3NUa2DJkiXBPuOMM6JtU6asqnvG2Wc8KQGArl27Bnv8+PHB3nXXXaN2PNGZOnVq6jEdccSq\nqtsjR45MbVdOhg2rnzX6GrIvxSrMrF4KXqo/Gx6NzeZFfYzNSunLc845J9jvv/9+sN97772o3YIF\nC4LNz9Nu3bpF7T788MNgs7Ph7rvvXutjLQdZfSl5SwghhBC5oF6KE7L3hV1fnmXLlgX76quvjrbd\ndNOqdQXZYwPEclenTp2C/eWXX0bt9tlnn2A/8sgjwfYz77vuWrWY7KxZs4JdVVUVtePPDRgwINg7\n7LBD1O6aa64Jtvc+CSGEEPXJypXxah+LFy8O9sYbr1omq02bNlE7lqfefvvtYK9YsSJq9+mnq1aP\nePHFVYWZOYwEADbaaKNaHHXDIE+PEEIIIXKBJj1CCCGEyAWa9AghhBAiF5QtpqfUOB6OIr/zzjtT\nP8PaYqtWraJtrBNyKp2P6bn88suD/eijjwb7v//9b9SuQ4dVmW28b9Y+gdVT4qt54IEHotdjx44N\n9j333BNtGzx4cI37EEIIIcqBL/OydOnSYPPzdIMNNojarbfeesHm5x2XkAHi53XHjh2D/fLLL0ft\nhg4dWpvDbhDk6RFCCCFELtCkRwghhBC5oGzyVpqk9ctf/jJ6PXr06GBzZWQvJTF+W5cuXYK9ySab\nBLtt27ZRu6222irY7777brBHjBgRtZs/f36wOTVv+PDhUTvexsWZfFo6S2RnnXVWtO32228PNhd8\nYnkQyJYIhRBCiDT4eQfEkla7du2C7Vcg4IrM/Nxdvnx51I6fuxxW8vzzz0ftJG8JIYQQQjQSmvQI\nIYQQIhfUWd7yC5VxpPdbb70V7Pvvvz9qxwt1ckVmL+/wIqDetcbR4h988EGwP/nkk6gdy2fsjvNZ\nWCwl7b777jUeHxAvwLb55psH21erZPmNzwUA/OAHPwj2448/XuMxCCGEEHXFP09ZquJtXFkZANZZ\nZ5UfhJ/BHn7+8/PY768SkadHCCGEELlAkx4hhBBC5AJNeoQQQgiRC+oc0+OrH3OczF//+tdVX7Bu\n/BUcu8LpcasdGH2O08P95zg93OuYkydPDjbH4PiVYDnOiNPXOV4IiH8zV6jkeCEg1jt9Ojvvk3+H\nr4wphBBC1AW/yjo/G/nZ5Z/BLVu2DPaCBQuCveGGG0bteHV2jgPyz9ZKRJ4eIYQQQuQCTXqEEEII\nkQvqLG9lyTHjxo0LNi9gBsQyGMtCLFMBwDvvvBNsdrkBsczE6ey+cjO74Fgu22yzzaJ2nJrHn+HK\nlUDsxlu0aFGwfao8/2Z/nrhS5h/+8Idg/+IXv4AQQgixtvjUcf8crsbLW3PmzAn222+/Hey99947\nasfPZB/qUunI0yOEEEKIXKBJjxBCCCFyQdkWHJ06dWqw2bXmFwFlmWnevHnB7tu3b9SOJSjvPmMZ\ni7f5KtFckdlXYU6D9+GlKc4822KLLYLtM9TYleij2du3bx/ssWPHBlvylhCiMcmqsl8Xvvvd70av\nWUr5zne+E+zBgwdH7Xr27BlsllGyjofDDYD4eTR37txgn3feeZnH3Fzw54pDOLgfXn755ajdDjvs\nEOxBgwYF22cyp4W3rO010xDI0yOEEEKIXKBJjxBCCCFygSY9QgghhMgFZYvpGTVqVLBZh11//fWj\ndqwnduvWLditWrWK2n3++eep+2DdkGN6fFoeV6X0+2e4QiVXrvz444+jdrx//l6vY/I2X5GZfxd/\nr6gbTzzxRPR61113DTZXEfXVuuuyqn25Yx6aKnwu/Xnlsg5pnwHi88/xBmmfB4Ann3wy2Lvssku0\nje85/rvSjsNfA7wt6ziaK1m/ma/9rOue7+m+5MeWW24Z7GuuuSZ1H9wPPIZ9aRB+LvjUa46z5Hif\nYcOGRe1GjBiRehxNGd9H77//frD5ucNVl4E45onH5Z///OeoXVrMbdbK7JVC/ka2EEIIIXKJJj1C\nCCGEyAVlk7fefPPNYHPKul+ojCUndncuXrw4ascuTr+g51tvvRVsTkv37bx7Ne2YuKozu3H951nu\nYvehlz34ta+MyYunrlixItg+JdBXl84bt912W/Sa0/u5Wvfzzz8ftbv11luDfeCBBwa7LnKWJ8ut\nnyWbNDf492VJRFkyE3+OZZWnn346anfFFVcEm8eVl7cuu+yyYLOLvdQ+88fB1dJ/8pOfRO3Sqts2\ndXxflippHXHEEcE+/PDDg+1DCh599NFg8/3O3485BIBZtmxZ9JqPyUtzLHeynOMX4myuZEmVLAv6\na5lXRuCUdR4PQPrzzz+DKxF5eoQQQgiRCzTpEUIIIUQu0KRHCCGEELmgbDE9DOuEHC8DxHEyrOvy\n6uZAdhor646cqsgrmAPpJcx9qjin83Xq1CnY7733XtSOj5H1Za+Ldu7cOdg+zZKXr+Df8corr0Tt\nfGpl3hg5cmT0esmSJcHecccdg33ddddF7TgWaNq0acE+44wzonZ10Z59qXuON+FYrX/84x+13ndT\nJStuJyuuYPz48cE++uijg+1LPHA/dezYMdi+fD6TFX+SFY90//33B/s3v/lNjd8LAMcdd1zq/psT\naefx5JNPjl7/7Gc/C/bw4cNT93fWWWcFe8aMGcGeOXNm1I63cdzjtttuG7XjWCAfA7nffvsFm+PE\npk+fHrU79NBDU4+3KePLvDD8vPJlWbp27RpsPt8+FZ2fmZyynlUaplKQp0cIIYQQuUCTHiGEEELk\ngrLJW6+++mqwO3ToEGwv/XC6Oa847itqsizk3dDs1mSJyKeHc3oiHwfLXgDQo0ePYHO6pD92TpVO\nS7kFgLfffjvY22yzTbSNfzNXw6yqqkJDUy1N+JR7hl3cXsrg13WtTvzSSy8F+4Ybbgi2X3mZV6tf\nunRpsP11w/s44IADgv3YY49F7Vg22WuvvYLt3eQPPvhgsH0/s9zJ14NPu95zzz3RXMlKz+cU5bPP\nPju1XVYFd74P8DU2e/bsqN3f//73YJ900knBZjc8EF9vvp9YYt5uu+2C7eXK5iRvlVpq4dxzzw32\n6NGjo20333xzSd/VpUuXGu3999+/pM/XFb7P3n777dG2X/3qV/X63Y2Fl6P42ZhVQXnAgAE17s8/\nM/lzfA1J3hJCCCGEqBA06RFCCCFELiibvMVSEmcvbbDBBlE7zl7i7CifeZXlJmNJg12y3jXOUgXv\nz8tgaYuMshvQ74O/10sim2++ebD972I5jmW6xqgUWv0beHG+UtrXlhdeeCHYzzzzTLTtwgsvDPZO\nO+0U7GOOOSZqN2fOnGBzP/jsLZZDtt5662CzrAjECyOyhOWrvvICpj7DkPud5beHH344atdQ8la1\nyzkra6oui3HWpt+vvfbaYLOk1b9//6gdjwPO4vTjgLMpWX7izwPAX/7yl2Dz4oivv/561I7vHV5C\n5XHL9xLONANWZRI2xoLBLEWznVVdOquCdFbfXnzxxcHmbDmugl/qsdZ0jNV4abzU640XoP3nP/8Z\nbbvzzjuD3bZt22D77ECfodtcyOpzlqa8vMXnivH3Pu5bHjdNYaHeyj9CIYQQQogyoEmPEEIIIXKB\nJj1CCCGEyAVli+lhnZe1W451AOL08D322CPYrMkDcexPz549o21c/Zb37+OHWCvmz/jV03kfvXv3\nDraPdeF9vPHGG8HeYYcdonacsj9q1Kho22abbRZsji2aP38+GgtfpZTjHvi3+fgkjnXglHtOKQfi\n6sonnHBCtO23v/1tSftgrZjjPHwqJa+ynlZB278+88wzgz1r1qyoHceE+OuGj5dtvoYakupx53X6\ntFgKr7+n6fFTp06NXvOKyzwOgHjV+2984xvB9vcBHpvcn/PmzYvaLV68ONgcp5D1Gzl+y1cP5uP1\n1eJ5HHC8nY/dqb5GfGxgQ8Dnra5lIhg+37/4xS+ibVx6gyv1cnV0IB6DvM3HgdQFLjHAq7kD8f15\n3333jbZxGjyPYR8vlFXZuynjr1n/bKzGr3Cfho+X9fGu1WTFDFYK8vQIIYQQIhdo0iOEEEKIXFBn\necu7z9j9xW5on4LKLmWuhOtT7LIWCOVF6Dh11bur2c3NrlCWqTy8zX8v74Pde37RNpZBslIH+di9\nTNMQVLsiL7nkkuh9/t2c4umrRvMx86KRXvLgxf84PRmIU8w5DdLLTCwL8rXh3axbbbVVsFkaGTRo\nUNSOj/G8884Ltk/H7dWrV7C9RMauYU63v+CCC9AYrK3cwZWH/+///i/YXvbi8efd47vttluNx+PH\nHMuXXjZMg68Vv5Avl8lgyWbcuHFRO/4un7K+yy67BJsXwfT3geqFZ7l6e0PB9zgeI/5YeNyOHTs2\n2HPnzo3acWVrlrAAYOjQocFeuHBhsP1Y4n1yyvNpp50WtbvxxhtRCnzu+d7hrzW+f957773RNi4n\nwdXXuao+sLpU11zw1wNLsTwusxYmZXyoR1pZgbqWNWlI5OkRQgghRC7QpEcIIYQQuaDO8paXOjhS\nnyPFvfusY8eOwWYXrJeBWLbwVTM5u4i/l92YQOzSY/ecz/zgffDx+owT/l0s2/n98W/hDAggzkTj\n39/QlUE/+ugjvPjiiwBWz0JhF3DaYqxALDE88cQTNX4eAPr27RvsBx54INrGGRns1vZZAJxFdvfd\ndwf7wAMPjNq9+eabwa6WIYDVr1deiPKwww4L9vbbbx+146xCX811woQJweYMNT4GoOGlS5afgHix\nVf7dTz31VNSOK9zymNh2222jdlzN2p9XlrH42vHyFl9zLNn4Ssssg3GWl/+NnP3IY5iP1R+Tvzfx\n+WBJy0uo1RJOObKn1oSvNPz73/8+2CzH+arwvAA0Hz/LtQDw4x//ONi8yCoQXze8P3/eOFuR7+/3\n339/1I4XheWFPr0cfNBBBwWb5RJfLZjvP5wxC8T3apbF/HOGz2FzwkvSfO74vJWaYeevr7Rq5P5Z\nWInI0yOEEEKIXKBJjxBCCCFygSY9QgghhMgFdY7p8fEurPGxnRWrwinAPu07K52bY3dYn/Tfxbou\n74/Tn4H02B8P66S8P7+aMP8WH6PA+jLHQ2Sl0dcHX3zxRYg36tOnT7SN9VvWaHl1cyDuP477GDBg\nQNRuzJgxwebV0oH4XHGcxpZbbhm147gbXrXdnzeONzjkkEOCzSnNALDNNtsEm2MSvObP18qCBQui\nbfvss0+wOWWWY4kA4Fvf+hYaknPPPTd6zSncvNq5r7DNFZQ5ndunCnP6stf2+TWXJ/AxFxyrw+OH\nY478/vh686UFeNzy9/qxybE/Pi6IY3R4DPh7QnVcTFqV23LyyCOPRK/5HsS/JSt2ieNWfLwdp5v7\ndHauhszXir/383nkWDwfA8cp8Q899FCw77vvvqgdn1d/r2b4uvSp0nzdcCyiP3Yfc9lc8DGRfH74\nfPj7Yhq8kjoQ3wPSvqdSkadHCCGEELlAkx4hhBBC5II6y1s+NY1djSzpePcZu7l9ijnD7mUvYWRV\nOWbYFcwuPU55B9KrP/u0P94HH59PXe3evXuwuUKw3ydXjPau4PqmTZs2ITX0qquuirZx2ilLOr5y\nbr9+/YLN0iIv8Ofb+TRWvlbYDe/ThKvT64FYPhs9enTUjt26fK35Y+eKzJyO62UTdn/z7wDiNFB2\n9/qK1JMmTUJ98+GHH4bKu74EQVqVai91cKo9/x6+ToG4z/x1y2MrS/7xKbBpn+HxwrZPm+Z+ZxnF\nu9v53PCixn4f3M5Xla8e36VWs60tS5cuxR//+EcAwCuvvBJt4xRjHnNvvfVW1I7vVc8991ywsxZq\n9fdZfs33d78PDj9g25e44PPIfekl0qznAsOyo6/QzbImnzN/3TXGorENgX/GMdwvXrZKw5cL4Hsm\nX0NacFQIIYQQokLQpEcIIYQQuaDO8lZWlVne5rOXeBu7TL2cwRkY3nXJ7jl2XXo5it2m7Db37lR2\nhbI73Ls+2WXIrmV/fCyreDdj2u8vVbIrF2YWvp+lIwC45ZZbgs2VhtmlCcTnlCUt7/5++OGHg+2z\nTFgyYtmKqzMDcXVY7hcvX7DU2KVLl2D7LBjmRz/6UbC9FMVVl0eMGBFt42w2liF8ts/Pf/7z1O8u\nFx9//DGmTp0KYPVzx+eBz5eX8vg1Z7n4jCqWPXwGFGfjcV9kVYjlbf4+wGOTrys/Nvl3cTt/ffA2\n/11pmSdevq6WX+qrIvMmm2wSMgOr+7QazrDi4/fnN+3e6scm/2Z/3fK9le9P/jzxPY6Pz0v7w4cP\nDzZnB37ve9+L2v3whz8MNt9XvPTJ93F/7Hy8bHt5yy+y2lzwzzjuM7ZLXew3a8WEtOdnpSJPjxBC\nCCFygSY9QgghhMgFmvQIIYQQIhfUOabHa3wc38DbODYHSNeQfeVN3sarKwPplVN9/Azr3GmxAUAc\nh8DH4eNP+Ni5nY+NYD3dp7uy1srt0latrU+q0wu9DnviiSfWaHu4IuzNN98cbJ+qyteAj8XgeCiu\nDO219mHDhgX7mGOOCbavEs3757gUX02ZK0NzP/sYtH333TfY/ndxOvtpp51Wow2UvpLx2tCxY0f8\n5Cc/AbD6mOPqt5yK7uNx+HrksePHC48zH0vC45bHnI+5SEtF9/BYL3UFZ1+FOQ1/D+Nj5JgFf//J\nOt5y0LJly1A5m+PrgLi6Npd/4NgzAJg9e3awufJ7VuykPx9pMVm+ej6Px+effz7YWdWUs+Bq3UuW\nLAm2738fa8bw8Q4cODDYPt6N72F5gc9NVmwuU2qsTtaKBpWCPD1CCCGEyAWa9AghhBAiF9TZF+Ur\nL3IKN6fI+sUK2bXG7lSuAgzEad9ZaacsZ2RJHYx3f3PFWf5d7GYFYlcgu/F8KiW7kL28xVIHyyON\nIW+tbXohLwp62WWXre3h1Cu80CYAHHzwwTW2O/rooxvicOqVX/7yl6mvWU6cPn161I7HC9u+JANf\nN/7a5zHIY8S70dMW9PRp4Hy/4O/17dIqwWZVIPayNFcCrtTU2+rFTgHgvPPOq/XnWeoC4urh/jfz\nvYrPja/QXW4uvvjiYPNCpzvvvHPUjq89v8gvS2vczl/L/r7QXMi6frNKP5RKmoTs5dNKRJ4eIYQQ\nQuQCTXqEEEIIkQvqLG9lLSSaVSmTJR2uyuozrzjzxrvq0tyV3pXNx8FZOT5Dh12oLLllueR5336B\nTf4cS0D+GPm76prpIERt4Gtzp512asQjEY2Bl6bqW6qqC2nScxY9e/Ys/4E0YfyzMC2T2T8L0yg1\nY7G+FuEtJ/L0CCGEECIXaNIjhBBCiFygSY8QQgghckHZyidyZU+uYOrjYjiFPSuVkNNJ/TbWDXkf\nPm2Vv5urz3rdkeNzOBXWVx5lOOaI00iBOFbJp6zzCtWc3uer/QohhBB1waeicxwPPydLrcjsy8Gk\n4Vexr0Tk6RFCCCFELtCkRwghhBC5oGzyFstWnH6etQAZpxmyXATE1ZC7d+8ebeO0d0779lUieRsf\nh198kl13vsIsw2n5vKDfNttsE7Vj96Ff1DFtwdW0irJCCCFEbfAVy/nZyKEUHIqRhX8u8rOVw0NK\nXRS4MZGnRwghhBC5QJMeIYQQQuQCTXqEEEIIkQvKFtPTrVu3YL/11lvB9mXOObWd8ctapGmGQJxm\nx9qlT7/jdHHe5leCZb2Tv8u342PnlHqO9QFindRrq5xKyPrnkCFDIIQQQqwtfukmfsbxkhIdO3Ys\naX8+XpafXRybqmUohBBCCCEqBE16hBBCCJELyiZvHXjggcG++eabU9ulrSbetWvX6DXLTF4S422c\nfudXamdXG2/z1SVZguLUce+qY2nq/fffD/b222+PNFj2A+LUP3Yz9uvXL3UfQgghRKn41dPT0sq7\ndOlS0v586Ajvj5+tm2yySa2OszGQp0cIIYQQuUCTHiGEEELkgrLJWwMGDAg2y0Xvvfde1I6znphB\ngwZFrx9++OFg+8wuht14fnFPdrvx9/p27KrjTCxfhZKzuXgfW2yxRerxcaVqf0y8v1IrYwohhBBZ\n+MrIvOIBZ1uVukCofxbyYtwcYuIXB69E5OkRQgghRC7QpEcIIYQQuUCTHiGEEELkgrLF9Gy66abB\n7tu3b7B9TE9a5eHdd989ev3iiy8Gm/VDAGjXrl2wOe27U6dOUTtOOeeKkj5lndPUOaWeV3r3++vf\nv3/q9zJDhw6NXi9btqzG7/LHJIQQQtQFXyqlT58+wV6+fHmwt9pqq5L2t8suu0SvX3jhhWAvWbIk\n2L17967VcTYG8vQIIYQQIhdo0iOEEEKIXGAs7ayxsdk7ABbW3+GIGuiRJEmHNTerHerLRkP92XxQ\nXzYvyt6f6stGI7UvazXpEUIIIYRoqkjeEkIIIUQu0KRHCCGEELmgYic9ZvjKDFPNMNMM95ghM6fb\nDM+YYVjRXmCG9g1zpKIumOFCM8wyw/RiP+9Uhn2Ga2Bt2og1Ux/9R/vewwyPlGt/ohaYXQizWTCb\nDrOpMFv7fjV7BmbZY66UNqIkzLBZcUxONcMSMyym1+uveQ/Nm7LV6akHPkkSDAYAM9wO4HQAf2jU\nIyociwGwJMHXa2wsasQMwwEcDGD7JMFnxQlq7gdjU6GS+88M6yYJvlxzS7EaZqFfkSSfwaxi+lWU\nTpLgXSA8Oy8B8GGS4PfV2xt6jJihRZLgqzW3bBgq1tPjGAugt/8L0AzXm+HErA+a4dyit2imGc4u\nvnelGc6gNpeY4byifb4ZXir+Bfub4ns9zTDHDCMBTAbQrYavEqXTGUBVkuAzAEgSVCUJ3jLDRcVz\nP9MMfy5OMKu9M1eaYYIZ5ppht+L7Lc1wZ7Gv7gLQsvoLzHCDGSYWvRG/aYwf2YxJ678FZviNGSab\nYYYZ+gKAGTYyw83Fvp1ihsOK7/c0w9hi+8lm2MV/kRl2KH6mlxmGmmGMGSaZ4TEzdC62ecYMl5th\nDICfNtxpaHZ0BlCFJCmsipwkVUiSt2B2EcxegtlMmP0ZZgag2jtzJcwmwGwuzHYrvt8SZncWvUXR\nuITZDTCbWPQmaVw2EGa4xQx/MMPTAK40w2AzvFi8d95vhk2L7VgxaW+GBUV7u+L9d2rxM1sX3z+O\n3r/JDC2K739ohkvNMB7A8Eb50SlU/KTHDOsCOBDAjDp8diiAkwDsBGBnAKeaYQiAOwEcRU2/C+Ae\nM+wHYGsAO6IwUx5qhupS0dsA+EeSYEiSKAVxLXkcQLfiBGakGUYU378+SbBDkqA/CjfKg+kz6yYJ\ndgRwNoCLi+/9CMDHSYKBAH4LgMtfX5gkGAZgIIARZhhYj78nb6T1H1CYDG0P4AYAPyu+dyGAp5IE\nOwDYE8DVZtgIwDIA+xbbHwXgWv6S4iToRgCHAVgE4DoARyYJhgK4GYU+r6ZtkmBEkuCacv/YHPE4\ngG7FCcxImIVxiSTZAUlS47hEktQ4LpEkNY5LJEkYlzDTuGw4+gDYJ0lwHoB/ALigeO+cgVV9l8bp\nAP5UVF+GAXjTDNuiMG6/UXz/KwDHFttvBGBmkmCnJMFzZf8la0Ely1stzTC1aI8F8Ddg9b8E18Cu\nAO5PEnwEAGYYBWC3JMG1Zuhohi4AOgBYniR4wwxnAdgPwJTi51ujMAl6A8DCJMGLq3+FqC1Jgg+L\nE9LdUHgI3mWGnwP4wAz/D0ArAO0AzALwcPFjo4r/TwLQs2jvjuKDMkkw3QzT6Wu+a4YfonCNdwbQ\nD4i2izqS0X9A3E/fLtr7ATjULEyCNgTQHcBbAK43CzfMVbXygW0B/BnAfkUvUn8A/QH8t+hnaAHg\nbWp/V9l+YF5Jkg9hFvUrzH4O4AOY1WlcIkmmwywalzDTuGwc7kkSfGWGNij8kTCm+P6tAO5Zw2fH\nAbjQDFsAGJUkeNUMe6MwoX2pOCZbovCHDFAYz/eV+weUg0qe9ISYnmrM8CVi79SGa9iHZWy7F8CR\nADZHwfNT3f6KJMFN7nt7AoWJkygPRY33GQDPmGEGgNNQ+OtvWJJgkRW0aO7fz4r/f4X4ul2t0JQZ\ntkTBy7BDkmC5GW7Bmq8VUQtq6L8Tiptq6icDcESS4BXeR7GPlwIYhMK4/pQ2v41Cnw1BYXJkAGYl\nSaqrXOOzHCRJ6FeYReMSSbIIZpegjuMSZmFcIkmWw+wWaFw2JKWMEX7Ghr5JEtxRlKq+CeAxM5yC\nwpi8NUnwixr282klxfEwFS9vORYC6GeGDYqz1b3X0P5ZAN8yQ6uiO/1wFLxGQGGiczQKE597i+89\nBuBkM7QGADN0NUPHcv+IvGOGbao14SKDgfBArCqe/yNL2NWzKLpTi56Aalf5JigM8JVm6ISCPCrK\nREr/ZUm+jwH4ia2K0apedbgNgLeLSQHHo+C9qWYFCjfYy82wBwrXRwcrBFHDDOuZYbu1/zUiYLYN\nzFLHJcxqPS5hVuO4hJnGZSORJFgJYLkVYyNRGHvVXp8FWCVHhr42Qy8A85IE1wJ4CIU+fRLAkdXP\nSDO0M0OP+v8Fa0cle3pWo+gBuBsFd+irWCVDpbWfXPwrf0Lxrb8mSeEzSYJZZtgYwOIkKbjJkwSP\nF3XKcUV33YcAjgMqc8bahGkN4DoztEXhL4vXAPwQhQfdDBQG3ksl7OcGAH8vylpTUeznJME0M0xB\nwQ0/D8DzZT16kdZ/B6e0vwzAHwFML058FhTbjgRwnxm+A+BpuL9EkwRLzXAIgNEATkbhJnxt8Q+e\ndYv7nFXG35V3WgO4DmZtUYZxWZS1pqL6/psk02CmcVkZnADgRiuUgpmHQuwrAPwewN1mOB7AU9T+\nKADHmeELAEsAXJokeM8MvwLwuBnWAfAFgB+jwpfd0DIUQgghhMgFTU3eEkIIIYSoE5r0CCGEECIX\naNIjhBBCiFygSY8QQgghcoEmPUIIIYTIBZr0CCGEECIXaNIjhBBCiFygSY8QQgghcsH/BzwLFt53\n6jpOAAAAAElFTkSuQmCC\n",
             "text/plain": [
               "\u003cFigure size 1000x1000 with 25 Axes\u003e"
             ]
@@ -948,7 +978,7 @@
         "class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',\n",
         "               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']\n",
         "\n",
-        "def plot(images, labels):\n",
+        "def plot(images, predictions, true_labels):\n",
         "  plt.figure(figsize=(10,10))\n",
         "  for i in range(25):\n",
         "      plt.subplot(5,5,i+1)\n",
@@ -956,10 +986,33 @@
         "      plt.yticks([])\n",
         "      plt.grid(False)\n",
         "      plt.imshow(images[i], cmap=plt.cm.binary)\n",
-        "      plt.xlabel(class_names[labels[i]])\n",
+        "      color = 'b' if predictions[i] == true_labels[i] else 'r'\n",
+        "      plt.xlabel(class_names[predictions[i]], color=color)\n",
         "  plt.show()\n",
         "\n",
-        "plot(test_images, result_labels)"
+        "plot(test_images, predictions, true_labels)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Dm8BxJOm_4rG"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "(10000,)"
+            ]
+          },
+          "execution_count": 42,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "predictions.shape"
       ]
     },
     {
@@ -1017,7 +1070,6 @@
     "colab": {
       "collapsed_sections": [],
       "name": "overview.ipynb",
-      "provenance": [],
       "toc_visible": true
     },
     "kernelspec": {
diff --git a/tensorflow/lite/g3doc/tutorials/model_maker_object_detection.ipynb b/tensorflow/lite/g3doc/tutorials/model_maker_object_detection.ipynb
index a888ef3fe3a3cd..96a79921521eac 100644
--- a/tensorflow/lite/g3doc/tutorials/model_maker_object_detection.ipynb
+++ b/tensorflow/lite/g3doc/tutorials/model_maker_object_detection.ipynb
@@ -119,7 +119,6 @@
       },
       "outputs": [],
       "source": [
-        "!pip install -q tensorflow==2.5.0\n",
         "!pip install -q --use-deprecated=legacy-resolver tflite-model-maker\n",
         "!pip install -q pycocotools"
       ]
@@ -438,34 +437,23 @@
         "  original_image = img\n",
         "  resized_img = tf.image.resize(img, input_size)\n",
         "  resized_img = resized_img[tf.newaxis, :]\n",
+        "  resized_img = tf.cast(resized_img, dtype=tf.uint8)\n",
         "  return resized_img, original_image\n",
         "\n",
         "\n",
-        "def set_input_tensor(interpreter, image):\n",
-        "  \"\"\"Set the input tensor.\"\"\"\n",
-        "  tensor_index = interpreter.get_input_details()[0]['index']\n",
-        "  input_tensor = interpreter.tensor(tensor_index)()[0]\n",
-        "  input_tensor[:, :] = image\n",
-        "\n",
-        "\n",
-        "def get_output_tensor(interpreter, index):\n",
-        "  \"\"\"Returns the output tensor at the given index.\"\"\"\n",
-        "  output_details = interpreter.get_output_details()[index]\n",
-        "  tensor = np.squeeze(interpreter.get_tensor(output_details['index']))\n",
-        "  return tensor\n",
-        "\n",
-        "\n",
         "def detect_objects(interpreter, image, threshold):\n",
         "  \"\"\"Returns a list of detection results, each a dictionary of object info.\"\"\"\n",
+        "\n",
+        "  signature_fn = interpreter.get_signature_runner()\n",
+        "\n",
         "  # Feed the input image to the model\n",
-        "  set_input_tensor(interpreter, image)\n",
-        "  interpreter.invoke()\n",
+        "  output = signature_fn(images=image)\n",
         "\n",
         "  # Get all outputs from the model\n",
-        "  boxes = get_output_tensor(interpreter, 0)\n",
-        "  classes = get_output_tensor(interpreter, 1)\n",
-        "  scores = get_output_tensor(interpreter, 2)\n",
-        "  count = int(get_output_tensor(interpreter, 3))\n",
+        "  count = int(np.squeeze(output['output_0']))\n",
+        "  scores = np.squeeze(output['output_1'])\n",
+        "  classes = np.squeeze(output['output_2'])\n",
+        "  boxes = np.squeeze(output['output_3'])\n",
         "\n",
         "  results = []\n",
         "  for i in range(count):\n",
diff --git a/tensorflow/lite/kernels/cast.cc b/tensorflow/lite/kernels/cast.cc
index c0665e20961809..db8ae0a7226ade 100644
--- a/tensorflow/lite/kernels/cast.cc
+++ b/tensorflow/lite/kernels/cast.cc
@@ -89,6 +89,9 @@ TfLiteStatus copyToTensor(TfLiteContext* context, const FromT* in,
     case kTfLiteUInt8:
       copyCast(in, out->data.uint8, num_elements);
       break;
+    case kTfLiteInt8:
+      copyCast(in, out->data.int8, num_elements);
+      break;
     case kTfLiteFloat32:
       copyCast(in, GetTensorData<float>(out), num_elements);
       break;
@@ -125,6 +128,8 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       return copyToTensor(context, input->data.i16, output, num_elements);
     case kTfLiteUInt8:
       return copyToTensor(context, input->data.uint8, output, num_elements);
+    case kTfLiteInt8:
+      return copyToTensor(context, input->data.int8, output, num_elements);
     case kTfLiteFloat32:
       return copyToTensor(context, GetTensorData<float>(input), output,
                           num_elements);
diff --git a/tensorflow/lite/kernels/cast_test.cc b/tensorflow/lite/kernels/cast_test.cc
index 2f24cc18dfa292..447a795e0331cb 100644
--- a/tensorflow/lite/kernels/cast_test.cc
+++ b/tensorflow/lite/kernels/cast_test.cc
@@ -231,5 +231,21 @@ TEST(CastOpModel, CastInt32ToUInt32) {
               ElementsAreArray({100, 200, 300, 400, 500, 600}));
 }
 
+TEST(CastOpModel, CastUInt8ToInt8) {
+  CastOpModel m({TensorType_UINT8, {2, 3}}, {TensorType_INT8, {2, 3}});
+  m.PopulateTensor<uint8_t>(m.input(), {10, 20, 30, 40, 50, 60});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<int8_t>(m.output()),
+              ElementsAreArray({10, 20, 30, 40, 50, 60}));
+}
+
+TEST(CastOpModel, CastInt8ToUInt8) {
+  CastOpModel m({TensorType_INT8, {2, 3}}, {TensorType_UINT8, {2, 3}});
+  m.PopulateTensor<int8_t>(m.input(), {10, 20, 30, 40, 50, 60});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<uint8_t>(m.output()),
+              ElementsAreArray({10, 20, 30, 40, 50, 60}));
+}
+
 }  // namespace
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index edfd5ff86f2db6..cfb345f432deed 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -149,6 +149,13 @@ config_setting(
     },
 )
 
+config_setting(
+    name = "ios_sim_arm64",
+    values = {
+        "cpu": "ios_sim_arm64",
+    },
+)
+
 config_setting(
     name = "k8",
     values = {
@@ -242,6 +249,7 @@ selects.config_setting_group(
         ":ios_armv7",
         ":ios_arm64",
         ":ios_arm64e",
+        ":ios_sim_arm64",
         ":darwin_arm64",
         ":raspberry_pi_with_neon",
     ] + tflite_extra_arm_config_settings(),
diff --git a/tensorflow/lite/kernels/random_standard_normal_custom.cc b/tensorflow/lite/kernels/random_standard_normal_custom.cc
index 1fc8313ab6db37..1564dc7601ace0 100644
--- a/tensorflow/lite/kernels/random_standard_normal_custom.cc
+++ b/tensorflow/lite/kernels/random_standard_normal_custom.cc
@@ -55,7 +55,7 @@ void Free(TfLiteContext* context, void* buffer) {
 }
 
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  // TODO(b/111309333): Handle optional seed input.
+  // The seed/seed2 attributes are not handled in this custom op implementation.
   TF_LITE_ENSURE(context, NumInputs(node) == 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
 
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index 23d24ccf3555f1..f1b6e2a8f02d0f 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -170,7 +170,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
              /* max_version = */ 2);
   AddBuiltin(BuiltinOperator_CAST, Register_CAST(),
              /* min_version = */ 1,
-             /* max_version = */ 2);
+             /* max_version = */ 3);
   AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(),
              /* min_version = */ 1,
              /* max_version = */ 5);
diff --git a/tensorflow/lite/kernels/register_ref.cc b/tensorflow/lite/kernels/register_ref.cc
index b80c628748a61b..3ea72085e1ac6c 100644
--- a/tensorflow/lite/kernels/register_ref.cc
+++ b/tensorflow/lite/kernels/register_ref.cc
@@ -342,7 +342,7 @@ BuiltinRefOpResolver::BuiltinRefOpResolver() {
              /* max_version = */ 2);
   AddBuiltin(BuiltinOperator_CAST, Register_CAST(),
              /* min_version = */ 1,
-             /* max_version = */ 2);
+             /* max_version = */ 3);
   AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE_REF(),
              /* min_version = */ 1,
              /* max_version = */ 4);
diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
index f041fb1be9c454..182ea4d0f864fb 100644
--- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h
+++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h
@@ -147,6 +147,8 @@ enum {
   ANEURALNETWORKS_RANK = 101,
   ANEURALNETWORKS_BATCH_MATMUL = 102,
   ANEURALNETWORKS_PACK = 103,
+  ANEURALNETWORKS_MIRROR_PAD = 104,
+  ANEURALNETWORKS_REVERSE = 105,
 };
 
 /**
diff --git a/tensorflow/lite/python/analyzer_test.py b/tensorflow/lite/python/analyzer_test.py
index 3666406e4f0dbe..2c9c05cf9c7793 100644
--- a/tensorflow/lite/python/analyzer_test.py
+++ b/tensorflow/lite/python/analyzer_test.py
@@ -48,9 +48,11 @@ def testMlir(self):
           model_path=model_path, experimental_use_mlir=True)
     mlir = mock_stdout.getvalue()
     self.assertIn(
-        'func @main(%arg0: tensor<1x8x8x3xf32>) -> '
-        'tensor<1x8x8x3xf32> attributes '
-        '{tf.entry_function = {inputs = "input", outputs = "output"}}', mlir)
+        'func @main(%arg0: tensor<1x8x8x3xf32> '
+        '{tf_saved_model.index_path = ["a"]}) -> '
+        '(tensor<1x8x8x3xf32> {tf_saved_model.index_path = ["x"]}) attributes '
+        '{tf.entry_function = {inputs = "input", outputs = "output"}, '
+        'tf_saved_model.exported_names = ["serving_default"]}', mlir)
     self.assertIn(
         '%0 = tfl.add %arg0, %arg0 {fused_activation_function = "NONE"} : '
         'tensor<1x8x8x3xf32>', mlir)
diff --git a/tensorflow/lite/python/analyzer_wrapper/model_analyzer.cc b/tensorflow/lite/python/analyzer_wrapper/model_analyzer.cc
index b90068c61fc991..370a4eff4706ac 100644
--- a/tensorflow/lite/python/analyzer_wrapper/model_analyzer.cc
+++ b/tensorflow/lite/python/analyzer_wrapper/model_analyzer.cc
@@ -170,8 +170,8 @@ void dump_model_summary(std::stringstream& out_stream,
                         const ::tflite::Model* model) {
   auto* subgraphs = model->subgraphs();
   out_stream
-      << "Your TFLite model has ‘" << subgraphs->Length()
-      << "’ subgraph(s). In the subgraph description below,\nT# represents the "
+      << "Your TFLite model has '" << subgraphs->Length()
+      << "' subgraph(s). In the subgraph description below,\nT# represents the "
          "Tensor numbers. ";
   if (subgraphs->Length() > 0 && subgraphs->Get(0)->operators()->Length() > 0) {
     const Operator* first_op = subgraphs->Get(0)->operators()->Get(0);
diff --git a/tensorflow/lite/testdata/add.bin b/tensorflow/lite/testdata/add.bin
index b4c02350c09130..5cdbb1930ebc4a 100644
Binary files a/tensorflow/lite/testdata/add.bin and b/tensorflow/lite/testdata/add.bin differ
diff --git a/tensorflow/lite/testdata/add_quantized_int8.bin b/tensorflow/lite/testdata/add_quantized_int8.bin
index ae28b183965f2a..6e64774e80d4d8 100644
Binary files a/tensorflow/lite/testdata/add_quantized_int8.bin and b/tensorflow/lite/testdata/add_quantized_int8.bin differ
diff --git a/tensorflow/lite/testdata/multi_add.bin b/tensorflow/lite/testdata/multi_add.bin
index e5048a32812bbf..399c51c2995bb9 100644
Binary files a/tensorflow/lite/testdata/multi_add.bin and b/tensorflow/lite/testdata/multi_add.bin differ
diff --git a/tensorflow/lite/testdata/multi_add_signature.bin b/tensorflow/lite/testdata/multi_add_signature.bin
deleted file mode 100644
index f8a6d213d35dcb..00000000000000
Binary files a/tensorflow/lite/testdata/multi_add_signature.bin and /dev/null differ
diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD
index 38a05a680bb0a8..f50144a99aa3cf 100644
--- a/tensorflow/lite/testing/BUILD
+++ b/tensorflow/lite/testing/BUILD
@@ -302,7 +302,6 @@ tf_cc_test(
     data = [
         "//tensorflow/lite:testdata/add_quantized_int8.bin",
         "//tensorflow/lite:testdata/multi_add.bin",
-        "//tensorflow/lite:testdata/multi_add_signature.bin",
     ],
     tags = [
         "tflite_not_portable_android",
diff --git a/tensorflow/lite/testing/generate_testspec.cc b/tensorflow/lite/testing/generate_testspec.cc
index e7435e19f4952a..cb390243bce16e 100644
--- a/tensorflow/lite/testing/generate_testspec.cc
+++ b/tensorflow/lite/testing/generate_testspec.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <iostream>
 #include <random>
+#include <string>
+#include <utility>
 
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/lite/testing/join.h"
@@ -28,49 +30,59 @@ namespace tflite {
 namespace testing {
 namespace {
 
+// Generates input name / value pairs according to given shape and distribution.
+// Fills `out` with a pair of string, which the first element is input name and
+// the second element is comma separated values in string.
 template <typename T, typename RandomEngine, typename RandomDistribution>
-void GenerateCsv(const std::vector<int>& shape, RandomEngine* engine,
-                 RandomDistribution distribution, string* out) {
+void GenerateCsv(const string& name, const std::vector<int>& shape,
+                 RandomEngine* engine, RandomDistribution distribution,
+                 std::pair<string, string>* out) {
   std::vector<T> data =
       GenerateRandomTensor<T>(shape, [&]() { return distribution(*engine); });
-  *out = Join(data.data(), data.size(), ",");
+  *out = std::make_pair(name, Join(data.data(), data.size(), ","));
 }
 
+// Generates random values for `input_layer` according to given value types and
+// shapes.
+// Fills `out` with a vector of string pairs, which the first element in the
+// pair is the input name from `input_layer` and the second element is comma
+// separated values in string.
 template <typename RandomEngine>
-std::vector<string> GenerateInputValues(
+std::vector<std::pair<string, string>> GenerateInputValues(
     RandomEngine* engine, const std::vector<string>& input_layer,
     const std::vector<string>& input_layer_type,
     const std::vector<string>& input_layer_shape) {
-  std::vector<string> input_values;
+  std::vector<std::pair<string, string>> input_values;
   input_values.resize(input_layer.size());
   for (int i = 0; i < input_layer.size(); i++) {
     tensorflow::DataType type;
     CHECK(DataTypeFromString(input_layer_type[i], &type));
     auto shape = Split<int>(input_layer_shape[i], ",");
+    const auto& name = input_layer[i];
 
     switch (type) {
       case tensorflow::DT_FLOAT:
-        GenerateCsv<float>(shape, engine,
+        GenerateCsv<float>(name, shape, engine,
                            std::uniform_real_distribution<float>(-0.5, 0.5),
                            &input_values[i]);
         break;
       case tensorflow::DT_UINT8:
-        GenerateCsv<uint8_t>(shape, engine,
+        GenerateCsv<uint8_t>(name, shape, engine,
                              std::uniform_int_distribution<uint8_t>(0, 255),
                              &input_values[i]);
         break;
       case tensorflow::DT_INT32:
-        GenerateCsv<int32_t>(shape, engine,
+        GenerateCsv<int32_t>(name, shape, engine,
                              std::uniform_int_distribution<int32_t>(-100, 100),
                              &input_values[i]);
         break;
       case tensorflow::DT_INT64:
-        GenerateCsv<int64_t>(shape, engine,
+        GenerateCsv<int64_t>(name, shape, engine,
                              std::uniform_int_distribution<int64_t>(-100, 100),
                              &input_values[i]);
         break;
       case tensorflow::DT_BOOL:
-        GenerateCsv<int>(shape, engine,
+        GenerateCsv<int>(name, shape, engine,
                          std::uniform_int_distribution<int>(0, 1),
                          &input_values[i]);
         break;
@@ -90,9 +102,22 @@ bool GenerateTestSpecFromRunner(std::iostream& stream, int num_invocations,
                                 const std::vector<string>& input_layer_shape,
                                 const std::vector<string>& output_layer,
                                 TestRunner* runner) {
+  auto input_size = input_layer.size();
+  if (input_layer_shape.size() != input_size ||
+      input_layer_type.size() != input_size) {
+    fprintf(stderr,
+            "Input size not match. Expected %lu, got %lu input types, %lu "
+            "input shapes.\n",
+            input_size, input_layer_type.size(), input_layer_shape.size());
+    return false;
+  }
+
   stream << "reshape {\n";
-  for (const auto& shape : input_layer_shape) {
-    stream << "  input: \"" << shape << "\"\n";
+  for (int i = 0; i < input_size; i++) {
+    const auto& name = input_layer[i];
+    const auto& shape = input_layer_shape[i];
+    stream << "  input { key: \"" << name << "\" value: \"" << shape
+           << "\" }\n";
   }
   stream << "}\n";
 
@@ -101,7 +126,7 @@ bool GenerateTestSpecFromRunner(std::iostream& stream, int num_invocations,
   for (int i = 0; i < num_invocations; ++i) {
     // Note that the input values are random, so each invocation will have a
     // different set.
-    std::vector<string> input_values = GenerateInputValues(
+    auto input_values = GenerateInputValues(
         &random_engine, input_layer, input_layer_type, input_layer_shape);
     if (input_values.empty()) {
       std::cerr << "Unable to generate input values for the TensorFlow model. "
@@ -112,16 +137,7 @@ bool GenerateTestSpecFromRunner(std::iostream& stream, int num_invocations,
     }
 
     // Run TensorFlow.
-    auto inputs = runner->GetInputs();
-    for (int j = 0; j < input_values.size(); j++) {
-      runner->SetInput(inputs[j], input_values[j]);
-      if (!runner->IsValid()) {
-        std::cerr << runner->GetErrorMessage() << std::endl;
-        return false;
-      }
-    }
-
-    runner->Invoke();
+    runner->Invoke(input_values);
     if (!runner->IsValid()) {
       std::cerr << runner->GetErrorMessage() << std::endl;
       return false;
@@ -129,12 +145,13 @@ bool GenerateTestSpecFromRunner(std::iostream& stream, int num_invocations,
 
     // Write second part of test spec, with inputs and outputs.
     stream << "invoke {\n";
-    for (const auto& value : input_values) {
-      stream << "  input: \"" << value << "\"\n";
+    for (const auto& entry : input_values) {
+      stream << "  input { key: \"" << entry.first << "\" value: \""
+             << entry.second << "\" }\n";
     }
-    auto outputs = runner->GetOutputs();
-    for (int j = 0; j < output_layer.size(); j++) {
-      stream << "  output: \"" << runner->ReadOutput(outputs[j]) << "\"\n";
+    for (const auto& name : output_layer) {
+      stream << "  output { key: \"" << name << "\" value: \""
+             << runner->ReadOutput(name) << "\" }\n";
       if (!runner->IsValid()) {
         std::cerr << runner->GetErrorMessage() << std::endl;
         return false;
diff --git a/tensorflow/lite/testing/kernel_test/BUILD b/tensorflow/lite/testing/kernel_test/BUILD
index 921a58713e1d71..2580ddc594cf36 100644
--- a/tensorflow/lite/testing/kernel_test/BUILD
+++ b/tensorflow/lite/testing/kernel_test/BUILD
@@ -35,8 +35,8 @@ tf_cc_test(
     size = "small",
     srcs = ["util_test.cc"],
     data = [
+        ":testdata/test_input.csv",
         "//tensorflow/lite:testdata/add.bin",
-        "//tensorflow/lite:testdata/test_input.csv",
     ],
     deps = [
         ":util",
@@ -69,6 +69,7 @@ cc_library(
     deps = [
         "//tensorflow/lite:framework",
         "//tensorflow/lite:string",
+        "//tensorflow/lite/c:c_api_types",
         "//tensorflow/lite/c:common",
         "//tensorflow/lite/kernels:builtin_ops",
         "//tensorflow/lite/testing:join",
@@ -81,8 +82,8 @@ cc_test(
     size = "small",
     srcs = ["input_generator_test.cc"],
     data = [
+        ":testdata/test_input.csv",
         "//tensorflow/lite:testdata/multi_add.bin",
-        "//tensorflow/lite:testdata/test_input.csv",
     ],
     deps = [
         ":input_generator",
@@ -96,6 +97,7 @@ cc_library(
     hdrs = ["diff_analyzer.h"],
     deps = [
         "//tensorflow/lite:string",
+        "//tensorflow/lite/c:c_api_types",
         "//tensorflow/lite/c:common",
         "//tensorflow/lite/testing:split",
     ],
@@ -106,7 +108,7 @@ tf_cc_test(
     size = "small",
     srcs = ["diff_analyzer_test.cc"],
     data = [
-        "//tensorflow/lite:testdata/test_input.csv",
+        ":testdata/test_input.csv",
     ],
     deps = [
         ":diff_analyzer",
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer.cc b/tensorflow/lite/testing/kernel_test/diff_analyzer.cc
index 7d6fcc80be17b4..da5f26845fa63b 100644
--- a/tensorflow/lite/testing/kernel_test/diff_analyzer.cc
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer.cc
@@ -16,6 +16,9 @@ limitations under the License.
 
 #include <cmath>
 #include <fstream>
+#include <string>
+
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/testing/split.h"
 
 namespace tflite {
@@ -54,7 +57,7 @@ float CalculateNormalizedL2Norm(const std::vector<float>& base,
 }
 
 TfLiteStatus Populate(const string& filename,
-                      std::vector<std::vector<float>>* tensors) {
+                      std::unordered_map<string, std::vector<float>>* tensors) {
   if (filename.empty()) {
     fprintf(stderr, "Empty input file name.");
     return kTfLiteError;
@@ -63,7 +66,12 @@ TfLiteStatus Populate(const string& filename,
   std::ifstream file(filename);
   string content;
   while (std::getline(file, content, '\n')) {
-    tensors->push_back(Split<float>(content, ","));
+    auto parts = Split<string>(content, ":");
+    if (parts.size() != 2) {
+      fprintf(stderr, "Expected <name>:<value>, got %s", content.c_str());
+      return kTfLiteError;
+    }
+    tensors->insert(std::make_pair(parts[0], Split<float>(parts[1], ",")));
   }
 
   file.close();
@@ -100,12 +108,17 @@ TfLiteStatus DiffAnalyzer::WriteReport(const string& filename) {
               << ","
               << "Normalized Max Diff"
               << "\n";
-  for (int i = 0; i < base_tensors_.size(); i++) {
+  for (const auto& item : base_tensors_) {
+    const auto& name = item.first;
+    if (!test_tensors_.count(name)) {
+      fprintf(stderr, "Missing tensor %s in test tensors.", name.c_str());
+      continue;
+    }
     float l2_error =
-        CalculateNormalizedL2Norm(base_tensors_[i], test_tensors_[i]);
+        CalculateNormalizedL2Norm(base_tensors_[name], test_tensors_[name]);
     float max_diff =
-        CalculateNormalizedMaxDiff(base_tensors_[i], test_tensors_[i]);
-    output_file << l2_error << "," << max_diff << "\n";
+        CalculateNormalizedMaxDiff(base_tensors_[name], test_tensors_[name]);
+    output_file << name << ":" << l2_error << "," << max_diff << "\n";
   }
 
   output_file.close();
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer.h b/tensorflow/lite/testing/kernel_test/diff_analyzer.h
index 6d63775f6b2820..0354e79e1fe0be 100644
--- a/tensorflow/lite/testing/kernel_test/diff_analyzer.h
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer.h
@@ -15,6 +15,8 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_TESTING_KERNEL_TEST_DIFF_ANALYZER_H_
 #define TENSORFLOW_LITE_TESTING_KERNEL_TEST_DIFF_ANALYZER_H_
 
+#include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "tensorflow/lite/c/common.h"
@@ -28,12 +30,17 @@ namespace testing {
 class DiffAnalyzer {
  public:
   DiffAnalyzer() = default;
+  // Reads base and test tensor values from files.
+  // Each file have lines in <name>:<values> format, where name is the signature
+  // output name and value as comma separated value string.
   TfLiteStatus ReadFiles(const string& base, const string& test);
+  // Writes diff report in <name>:<L2 Error>,<Max Diff> format.
   TfLiteStatus WriteReport(const string& filename);
 
  private:
-  std::vector<std::vector<float>> base_tensors_;
-  std::vector<std::vector<float>> test_tensors_;
+  // Mappings from signature output names to its values.
+  std::unordered_map<string, std::vector<float>> base_tensors_;
+  std::unordered_map<string, std::vector<float>> test_tensors_;
 };
 
 }  // namespace testing
diff --git a/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc b/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc
index f6bb3821561c6c..3406cdf5c46b16 100644
--- a/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc
+++ b/tensorflow/lite/testing/kernel_test/diff_analyzer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/testing/kernel_test/diff_analyzer.h"
 
 #include <fstream>
+#include <string>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -27,7 +28,8 @@ namespace {
 
 TEST(DiffAnalyzerTest, ZeroDiff) {
   DiffAnalyzer diff_analyzer;
-  string filename = "tensorflow/lite/testdata/test_input.csv";
+  string filename =
+      "tensorflow/lite/testing/kernel_test/testdata/test_input.csv";
   ASSERT_EQ(diff_analyzer.ReadFiles(filename, filename), kTfLiteOk);
 
   string output_file =
@@ -38,7 +40,7 @@ TEST(DiffAnalyzerTest, ZeroDiff) {
   std::ifstream file(output_file);
   std::getline(file, content);
   std::getline(file, content);
-  ASSERT_EQ(content, "0,0");
+  ASSERT_EQ(content, "a:0,0");
 }
 
 }  // namespace
diff --git a/tensorflow/lite/testing/kernel_test/input_generator.cc b/tensorflow/lite/testing/kernel_test/input_generator.cc
index 048f4cc63e6c2f..249ac8305ce9be 100644
--- a/tensorflow/lite/testing/kernel_test/input_generator.cc
+++ b/tensorflow/lite/testing/kernel_test/input_generator.cc
@@ -14,10 +14,15 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/testing/kernel_test/input_generator.h"
 
+#include <cstdio>
 #include <fstream>
 #include <limits>
 #include <random>
+#include <string>
+#include <unordered_map>
+#include <utility>
 
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/testing/join.h"
@@ -27,6 +32,7 @@ namespace tflite {
 namespace testing {
 
 namespace {
+static constexpr char kDefaultServingSignatureDefKey[] = "serving_default";
 
 template <typename T>
 std::vector<T> GenerateRandomTensor(TfLiteIntArray* dims,
@@ -84,6 +90,11 @@ std::vector<T> GenerateGaussian(TfLiteIntArray* dims, float min, float max) {
 }  // namespace
 
 TfLiteStatus InputGenerator::LoadModel(const string& model_dir) {
+  return LoadModel(model_dir, kDefaultServingSignatureDefKey);
+}
+
+TfLiteStatus InputGenerator::LoadModel(const string& model_dir,
+                                       const string& signature) {
   model_ = FlatBufferModel::BuildFromFile(model_dir.c_str());
   if (!model_) {
     fprintf(stderr, "Cannot load model %s", model_dir.c_str());
@@ -96,6 +107,11 @@ TfLiteStatus InputGenerator::LoadModel(const string& model_dir) {
     fprintf(stderr, "Failed to build interpreter.");
     return kTfLiteError;
   }
+  signature_runner_ = interpreter_->GetSignatureRunner(signature.c_str());
+  if (!signature_runner_) {
+    fprintf(stderr, "Failed to get SignatureRunner.\n");
+    return kTfLiteError;
+  }
 
   return kTfLiteOk;
 }
@@ -109,7 +125,12 @@ TfLiteStatus InputGenerator::ReadInputsFromFile(const string& filename) {
   std::ifstream input_file(filename);
   string input;
   while (std::getline(input_file, input, '\n')) {
-    inputs_.push_back(input);
+    std::vector<string> parts = Split<string>(input, ":");
+    if (parts.size() != 2) {
+      fprintf(stderr, "Expected <name>:<value>, got %s", input.c_str());
+      return kTfLiteError;
+    }
+    inputs_.push_back(std::make_pair(parts[0], parts[1]));
   }
   input_file.close();
   return kTfLiteOk;
@@ -129,7 +150,7 @@ TfLiteStatus InputGenerator::WriteInputsToFile(const string& filename) {
   }
 
   for (const auto& input : inputs_) {
-    output_file << input << "\n";
+    output_file << input.first << ":" << input.second << "\n";
   }
   output_file.close();
 
@@ -138,28 +159,31 @@ TfLiteStatus InputGenerator::WriteInputsToFile(const string& filename) {
 
 // TODO(yunluli): Support more tensor types when needed.
 TfLiteStatus InputGenerator::GenerateInput(const string& distribution) {
-  auto input_tensor_ids = interpreter_->inputs();
-  for (auto id : input_tensor_ids) {
-    auto* tensor = interpreter_->tensor(id);
+  auto input_tensor_names = signature_runner_->input_names();
+  for (const char* name : input_tensor_names) {
+    auto* tensor = signature_runner_->input_tensor(name);
     if (distribution == "UNIFORM") {
       switch (tensor->type) {
         case kTfLiteInt8: {
           auto data = GenerateUniform<int8_t>(
               tensor->dims, std::numeric_limits<int8_t>::min(),
               std::numeric_limits<int8_t>::max());
-          inputs_.push_back(Join(data.data(), data.size(), ","));
+          inputs_.push_back(
+              std::make_pair(name, Join(data.data(), data.size(), ",")));
           break;
         }
         case kTfLiteUInt8: {
           auto data = GenerateUniform<uint8_t>(
               tensor->dims, std::numeric_limits<uint8_t>::min(),
               std::numeric_limits<uint8_t>::max());
-          inputs_.push_back(Join(data.data(), data.size(), ","));
+          inputs_.push_back(
+              std::make_pair(name, Join(data.data(), data.size(), ",")));
           break;
         }
         case kTfLiteFloat32: {
           auto data = GenerateUniform<float>(tensor->dims, -1, 1);
-          inputs_.push_back(JoinDefault(data.data(), data.size(), ","));
+          inputs_.push_back(
+              std::make_pair(name, Join(data.data(), data.size(), ",")));
           break;
         }
         default:
@@ -173,19 +197,22 @@ TfLiteStatus InputGenerator::GenerateInput(const string& distribution) {
           auto data = GenerateGaussian<int8_t>(
               tensor->dims, std::numeric_limits<int8_t>::min(),
               std::numeric_limits<int8_t>::max());
-          inputs_.push_back(Join(data.data(), data.size(), ","));
+          inputs_.push_back(
+              std::make_pair(name, Join(data.data(), data.size(), ",")));
           break;
         }
         case kTfLiteUInt8: {
           auto data = GenerateGaussian<uint8_t>(
               tensor->dims, std::numeric_limits<uint8_t>::min(),
               std::numeric_limits<uint8_t>::max());
-          inputs_.push_back(Join(data.data(), data.size(), ","));
+          inputs_.push_back(
+              std::make_pair(name, Join(data.data(), data.size(), ",")));
           break;
         }
         case kTfLiteFloat32: {
           auto data = GenerateGaussian<float>(tensor->dims, -1, 1);
-          inputs_.push_back(JoinDefault(data.data(), data.size(), ","));
+          inputs_.push_back(
+              std::make_pair(name, Join(data.data(), data.size(), ",")));
           break;
         }
         default:
@@ -202,7 +229,5 @@ TfLiteStatus InputGenerator::GenerateInput(const string& distribution) {
   return kTfLiteOk;
 }
 
-std::vector<string> InputGenerator::GetInputs() { return inputs_; }
-
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/lite/testing/kernel_test/input_generator.h b/tensorflow/lite/testing/kernel_test/input_generator.h
index 6c7ef387678b0c..e7c416e7fcd5a7 100644
--- a/tensorflow/lite/testing/kernel_test/input_generator.h
+++ b/tensorflow/lite/testing/kernel_test/input_generator.h
@@ -16,11 +16,15 @@ limitations under the License.
 #define TENSORFLOW_LITE_TESTING_KERNEL_TEST_INPUT_GENERATOR_H_
 
 #include <memory>
+#include <string>
+#include <utility>
 #include <vector>
 
+#include "tensorflow/lite/c/c_api_types.h"
 #include "tensorflow/lite/c/common.h"
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/model.h"
+#include "tensorflow/lite/signature_runner.h"
 #include "tensorflow/lite/string_type.h"
 
 namespace tflite {
@@ -33,15 +37,19 @@ class InputGenerator {
  public:
   InputGenerator() = default;
   TfLiteStatus LoadModel(const string& model_dir);
+  TfLiteStatus LoadModel(const string& model_dir, const string& signature);
   TfLiteStatus ReadInputsFromFile(const string& filename);
   TfLiteStatus GenerateInput(const string& distribution);
-  std::vector<string> GetInputs();
+  std::vector<std::pair<string, string>> GetInputs() { return inputs_; }
   TfLiteStatus WriteInputsToFile(const string& filename);
 
  private:
   std::unique_ptr<FlatBufferModel> model_;
   std::unique_ptr<Interpreter> interpreter_;
-  std::vector<string> inputs_;
+  // Not owned.
+  SignatureRunner* signature_runner_ = nullptr;
+  // Mapping from input names to csv string values.
+  std::vector<std::pair<string, string>> inputs_;
 };
 
 }  // namespace testing
diff --git a/tensorflow/lite/testing/kernel_test/input_generator_test.cc b/tensorflow/lite/testing/kernel_test/input_generator_test.cc
index 1719aa7bc96985..f6f1248d8e5195 100644
--- a/tensorflow/lite/testing/kernel_test/input_generator_test.cc
+++ b/tensorflow/lite/testing/kernel_test/input_generator_test.cc
@@ -16,6 +16,8 @@ limitations under the License.
 
 #include <fstream>
 #include <map>
+#include <string>
+#include <unordered_map>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -34,16 +36,16 @@ TEST(InputGeneratorTest, LoadModel) {
 
 TEST(InputGeneratorTest, ReadWriteSimpleFile) {
   InputGenerator input_generator;
-  ASSERT_EQ(input_generator.ReadInputsFromFile(
-                "tensorflow/lite/testdata/test_input.csv"),
-            kTfLiteOk);
+  ASSERT_EQ(
+      input_generator.ReadInputsFromFile("tensorflow/lite/testing/"
+                                         "kernel_test/testdata/test_input.csv"),
+      kTfLiteOk);
 
-  std::vector<string> inputs;
   std::string content = "1";
   for (int i = 0; i < 1 * 8 * 8 * 3 - 1; i++) {
     content.append(",1");
   }
-  inputs.push_back(content);
+  std::vector<std::pair<string, string>> inputs = {{"a", content}};
   ASSERT_EQ(input_generator.GetInputs(), inputs);
 
   auto output_filename = ::testing::TempDir() + "/out.csv";
@@ -52,7 +54,9 @@ TEST(InputGeneratorTest, ReadWriteSimpleFile) {
   std::ifstream in(output_filename);
   std::string out;
   std::getline(in, out, '\n');
-  ASSERT_EQ(out, content);
+  std::string expected_out = "a:";
+  expected_out.append(content);
+  ASSERT_EQ(out, expected_out);
 }
 
 TEST(InputGeneratorTest, GenerateUniformInput) {
diff --git a/tensorflow/lite/testdata/test_input.csv b/tensorflow/lite/testing/kernel_test/testdata/test_input.csv
similarity index 82%
rename from tensorflow/lite/testdata/test_input.csv
rename to tensorflow/lite/testing/kernel_test/testdata/test_input.csv
index 33894d3063f35a..e1ae4cd84b8f15 100644
--- a/tensorflow/lite/testdata/test_input.csv
+++ b/tensorflow/lite/testing/kernel_test/testdata/test_input.csv
@@ -1 +1 @@
-1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
\ No newline at end of file
+a:1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
diff --git a/tensorflow/lite/testing/kernel_test/util.h b/tensorflow/lite/testing/kernel_test/util.h
index 882cc015f58f2b..df2506a5ad2899 100644
--- a/tensorflow/lite/testing/kernel_test/util.h
+++ b/tensorflow/lite/testing/kernel_test/util.h
@@ -80,18 +80,9 @@ TfLiteStatus RunKernelTest(const kernel_test::TestOptions& options,
   runner->LoadModel(options.tflite_model);
   runner->AllocateTensors();
   if (!runner->IsValid()) return kTfLiteError;
-  auto input_tensor_ids = runner->GetInputs();
   auto inputs = input_generator.GetInputs();
-  if (inputs.size() != input_tensor_ids.size()) {
-    fprintf(stderr,
-            "Number of input tensors generated doesn't match what the model "
-            "asks for.");
-  }
-  for (int i = 0; i < inputs.size(); i++) {
-    runner->SetInput(input_tensor_ids[i], inputs[i]);
-  }
 
-  runner->Invoke();
+  runner->Invoke(inputs);
 
   if (!options.dump_input_to_file.empty()) {
     TF_LITE_ENSURE_STATUS(
@@ -106,8 +97,8 @@ TfLiteStatus RunKernelTest(const kernel_test::TestOptions& options,
       return kTfLiteError;
     }
 
-    for (auto id : runner->GetOutputs()) {
-      output_file << runner->ReadOutput(id) << "\n";
+    for (const auto& name : runner->GetOutputNames()) {
+      output_file << name << ":" << runner->ReadOutput(name) << "\n";
     }
     output_file.close();
   }
diff --git a/tensorflow/lite/testing/kernel_test/util_test.cc b/tensorflow/lite/testing/kernel_test/util_test.cc
index d9d8391bba158c..59d75931079600 100644
--- a/tensorflow/lite/testing/kernel_test/util_test.cc
+++ b/tensorflow/lite/testing/kernel_test/util_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <fstream>
 #include <memory>
+#include <string>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -30,13 +31,13 @@ TEST(UtilTest, SimpleE2ETest) {
   TestOptions options;
   options.tflite_model = "tensorflow/lite/testdata/add.bin";
   options.read_input_from_file =
-      "tensorflow/lite/testdata/test_input.csv";
+      "tensorflow/lite/testing/kernel_test/testdata/test_input.csv";
   options.dump_output_to_file = ::testing::TempDir() + "/test_out.csv";
   options.kernel_type = "REFERENCE";
   std::unique_ptr<TestRunner> runner(new TfLiteDriver(
       TfLiteDriver::DelegateType::kNone, /*reference_kernel=*/true));
   RunKernelTest(options, runner.get());
-  std::string expected = "3";
+  std::string expected = "x:3";
   for (int i = 0; i < 1 * 8 * 8 * 3 - 1; i++) {
     expected.append(",3");
   }
diff --git a/tensorflow/lite/testing/op_tests/cast.py b/tensorflow/lite/testing/op_tests/cast.py
index ae66d08cf5b1b8..b0a0f6b2ed683b 100644
--- a/tensorflow/lite/testing/op_tests/cast.py
+++ b/tensorflow/lite/testing/op_tests/cast.py
@@ -35,10 +35,26 @@ def make_cast_tests(options):
         "input_dtype": [tf.int32],
         "output_dtype": [tf.float32],
         "input_shape": [[], [1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+    }, {
+        "input_dtype": [tf.int8],
+        "output_dtype": [tf.float32],
+        "input_shape": [[], [1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+    }, {
+        "input_dtype": [tf.float32],
+        "output_dtype": [tf.int8],
+        "input_shape": [[], [1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
     }, {
         "input_dtype": [tf.uint32],
         "output_dtype": [tf.int32],
         "input_shape": [[], [1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+    }, {
+        "input_dtype": [tf.uint8],
+        "output_dtype": [tf.int8],
+        "input_shape": [[], [1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+    }, {
+        "input_dtype": [tf.int8],
+        "output_dtype": [tf.uint8],
+        "input_shape": [[], [1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
     }]
   else:
     test_parameters = [
diff --git a/tensorflow/lite/testing/op_tests/constant.py b/tensorflow/lite/testing/op_tests/constant.py
index c6f5ad93e893cc..763b68346d3776 100644
--- a/tensorflow/lite/testing/op_tests/constant.py
+++ b/tensorflow/lite/testing/op_tests/constant.py
@@ -31,9 +31,7 @@ def make_constant_tests(options):
       "dtype": [tf.float32, tf.int32],
       "input_shape": [[], [1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
       "constant_is_also_output": [True, False],
-      # TODO(b/192473002) investigate if it can be removed for MLIR converter.
-      # This is a regression test for a bug where Toco rejects models with
-      # unread inputs.
+      # Models should not be rejected regardless whether it has unread inputs.
       "has_unread_input": [True, False],
   }]
 
diff --git a/tensorflow/lite/testing/parse_testdata.cc b/tensorflow/lite/testing/parse_testdata.cc
index 166e8b4cb95e0a..5d699a55283435 100644
--- a/tensorflow/lite/testing/parse_testdata.cc
+++ b/tensorflow/lite/testing/parse_testdata.cc
@@ -25,6 +25,8 @@ limitations under the License.
 #include <fstream>
 #include <iostream>
 #include <streambuf>
+#include <utility>
+#include <vector>
 
 #include "tensorflow/lite/error_reporter.h"
 #include "tensorflow/lite/testing/message.h"
@@ -34,6 +36,8 @@ namespace tflite {
 namespace testing {
 namespace {
 
+const char kDefaultSignatureKey[] = "serving_default";
+
 // Fatal error if parse error occurs
 #define PARSE_CHECK_EQ(filename, current_line, x, y)                         \
   if ((x) != (y)) {                                                          \
@@ -245,85 +249,127 @@ TfLiteStatus CheckOutputs(tflite::Interpreter* interpreter,
   return kTfLiteOk;
 }
 
-// Process an 'invoke' message, triggering execution of the test runner, as
+// Processes Protobuf map<string, string> like message.
+// Supports format of
+// field_name {key: "KEY1" value: "VAL1"}
+// field_name {key: "KEY2" value: "VAL2"}
+// field_name {key: "KEY3" value: "VAL3"}
+//
+// for field `map<string, string> field_name = TAG;`
+//
+// Note: The parent of this field should track the ownership of the repeated
+// field. By calling KvMap::Finish() means a new entry is added to the map
+// instead of finish parsing of the whole map.
+class KvMap : public Message, public std::vector<std::pair<string, string>> {
+ public:
+  void SetField(const std::string& name, const std::string& value) override {
+    if (name == "key") {
+      key_ = value;
+    } else if (name == "value") {
+      value_ = value;
+    }
+  }
+  void Finish() override {
+    push_back(std::make_pair(key_, value_));
+    key_.clear();
+    value_.clear();
+  }
+
+ private:
+  string key_;
+  string value_;
+};
+
+// Processes an 'invoke' message, triggering execution of the test runner, as
 // well as verification of outputs. An 'invoke' message looks like:
 //   invoke {
-//     id: xyz
-//     input: 1,2,1,1,1,2,3,4
-//     output: 4,5,6
+//     id: "xyz"
+//     input { key: "a" value: "1,2,1,1,1,2,3,4"}
+//     input { key: "b" value: "1,2,1,1,1,2,3,4"}
+//     output { key: "x" value: "4,5,6"}
+//     output { key: "y" value: "14,15,16"}
+//     output_shape { key: "x" value: "3"}
+//     output_shape { key: "y" value: "1,3"}
 //   }
 class Invoke : public Message {
  public:
-  explicit Invoke(TestRunner* test_runner) : test_runner_(test_runner) {
-    expected_inputs_ = test_runner->GetInputs();
-    expected_outputs_ = test_runner->GetOutputs();
-  }
+  explicit Invoke(TestRunner* test_runner) : test_runner_(test_runner) {}
 
   void SetField(const std::string& name, const std::string& value) override {
     if (name == "id") {
       test_runner_->SetInvocationId(value);
-    } else if (name == "input") {
-      if (parsed_input_count_ >= expected_inputs_.size()) {
-        return test_runner_->Invalidate("Too many inputs");
-      }
-      test_runner_->SetInput(expected_inputs_[parsed_input_count_], value);
-      ++parsed_input_count_;
-    } else if (name == "output") {
-      if (parsed_output_count_ >= expected_outputs_.size()) {
-        return test_runner_->Invalidate("Too many outputs");
-      }
-      test_runner_->SetExpectation(expected_outputs_[parsed_output_count_],
-                                   value);
-      ++parsed_output_count_;
-    } else if (name == "output_shape") {
-      if (parsed_output_shape_count_ >= expected_outputs_.size()) {
-        return test_runner_->Invalidate("Too many output shapes");
-      }
-      test_runner_->SetShapeExpectation(
-          expected_outputs_[parsed_output_shape_count_], value);
-      ++parsed_output_shape_count_;
     }
   }
+
+  Message* AddChild(const std::string& s) override {
+    if (s == "input") {
+      return MaybeInitializeChild(&inputs_);
+    } else if (s == "output") {
+      return MaybeInitializeChild(&expected_outputs_);
+    } else if (s == "output_shape") {
+      return MaybeInitializeChild(&expected_output_shapes_);
+    }
+    return nullptr;
+  }
+
+  // Invokes the test runner and checks expectations.
   void Finish() override {
-    test_runner_->Invoke();
-    test_runner_->CheckResults();
+    using VectorT = std::vector<std::pair<string, string>>;
+    test_runner_->Invoke(inputs_ ? *inputs_ : VectorT());
+    test_runner_->CheckResults(
+        expected_outputs_ ? *expected_outputs_ : VectorT(),
+        expected_output_shapes_ ? *expected_output_shapes_ : VectorT());
   }
 
  private:
-  std::vector<int> expected_inputs_;
-  std::vector<int> expected_outputs_;
-
-  int parsed_input_count_ = 0;
-  int parsed_output_count_ = 0;
-  int parsed_output_shape_count_ = 0;
+  // Checks whether `*child` is initialized and return the message pointer.
+  // Initializes and owns it if it's not initialized.
+  Message* MaybeInitializeChild(KvMap** child) {
+    if (*child == nullptr) {
+      *child = new KvMap;
+      Store(*child);
+    }
+    return *child;
+  }
 
   TestRunner* test_runner_;
+
+  KvMap* inputs_ = nullptr;
+  KvMap* expected_outputs_ = nullptr;
+  KvMap* expected_output_shapes_ = nullptr;
 };
 
 // Process an 'reshape' message, triggering resizing of the input tensors via
 // the test runner. A 'reshape' message looks like:
 //   reshape {
-//     input: 1,2,1,1,1,2,3,4
+//     input { key: "a" value: "1,2,1,1,1,2,3,4"}
+//     input { key: "b" value: "1,2,1,1,1,2,3,4"}
 //   }
 class Reshape : public Message {
  public:
-  explicit Reshape(TestRunner* test_runner) : test_runner_(test_runner) {
-    expected_inputs_ = test_runner->GetInputs();
+  explicit Reshape(TestRunner* test_runner) : test_runner_(test_runner) {}
+
+  Message* AddChild(const std::string& s) override {
+    if (s != "input") return nullptr;
+    if (input_shapes_ == nullptr) {
+      input_shapes_ = new KvMap;
+      Store(input_shapes_);
+    }
+    return input_shapes_;
   }
 
-  void SetField(const std::string& name, const std::string& value) override {
-    if (name == "input") {
-      if (expected_inputs_.empty()) {
-        return test_runner_->Invalidate("Too many inputs to reshape");
-      }
-      test_runner_->ReshapeTensor(*expected_inputs_.begin(), value);
-      expected_inputs_.erase(expected_inputs_.begin());
+  // Reshapes tensors.
+  void Finish() override {
+    if (!input_shapes_) return;
+    for (const auto& item : *input_shapes_) {
+      test_runner_->ReshapeTensor(item.first, item.second);
     }
   }
 
  private:
-  std::vector<int> expected_inputs_;
   TestRunner* test_runner_;
+
+  KvMap* input_shapes_ = nullptr;
 };
 
 // This is the top-level message in a test file.
@@ -334,11 +380,11 @@ class TestData : public Message {
   void SetMaxInvocations(int max) { max_invocations_ = max; }
   void SetField(const std::string& name, const std::string& value) override {
     if (name == "load_model") {
-      test_runner_->LoadModel(value);
+      test_runner_->LoadModel(value, kDefaultSignatureKey);
     } else if (name == "init_state") {
       test_runner_->AllocateTensors();
-      for (int id : Split<int>(value, ",")) {
-        test_runner_->ResetTensor(id);
+      for (const auto& name : Split<string>(value, ",")) {
+        test_runner_->ResetTensor(name);
       }
     }
   }
diff --git a/tensorflow/lite/testing/parse_testdata.h b/tensorflow/lite/testing/parse_testdata.h
index 759af98a62051a..d16de8b517d1ca 100644
--- a/tensorflow/lite/testing/parse_testdata.h
+++ b/tensorflow/lite/testing/parse_testdata.h
@@ -46,27 +46,77 @@ TfLiteStatus FeedExample(tflite::Interpreter* interpreter, const Example&);
 TfLiteStatus CheckOutputs(tflite::Interpreter* interpreter, const Example&);
 
 // Parses a test description and feeds the given test runner with data.
-// The input format is similar to an ASCII proto:
+// The input format is similar to a proto with the following schema:
+//
+// message TestMessage {
+//   // Path to the model to load.
+//   string load_model = 1;
+//   // Names to initialize the tensor with zeros.
+//   string init_state = 2;
+//   message Reshape {
+//     // Name of the input and csv string of shape of it.
+//     map<string, string> input = 1;
+//   }
+//   repeated Reshape reshape = 3;
+//   message Invoke {
+//     // Name of this invoke.
+//     string id = 1;
+//     // Name of the input to the csv string of input value.
+//     map<string, string> input = 2;
+//     // Name of the output to the csv string of expected output value.
+//     map<string, string> output = 3;
+//     // Name of the output to the csv string of expected output shape.
+//     map<string, string> output_shape = 4;
+//   }
+//   repeated Invoke invoke = 4;
+// }
+//
+// An example of the ASCII proto:
 //   // Loads model 'add.bin' from the TestRunner's model directory.
 //   load_model: "add.bin"
 //   // Changes the shape of inputs, provided in the same order they appear
-//   // in the model.
+//   // in the model, or `input_names` if specified.
 //   reshape {
-//     input: "1,224,224,3"
-//     input: "1,3,4,1"
+//     input {
+//       key: "a"
+//       value: "1,224,224,3"
+//     }
+//     input {
+//       key: "b"
+//       value: "1,3,4,1"
+//     }
 //   }
 //   // Fills the given persistent tensors with zeros.
-//   init_state: 0,1,2,3
+//   init_state: "a,b,c,d"
 //   // Invokes the interpreter with the given input and checks that it
 //   // produces the expected output. Inputs and outputs should be specified in
-//   // the order they appear in the model.
+//   // the order they appear in the model, or `input_names` and `output_names`
+//   // if specified.
 //   invoke {
-//     input: "1,2,3,4,56"
-//     input: "0.1,0.2,0.3,4.3,56.4"
-//     output: "12,3,4,545,3,6"
-//     output: "0.01,0.02"
-//     output_shape: "2,3"
-//     output_shape: "1"
+//     input {
+//       key: "a"
+//       value: "1,2,3,4,56"
+//     }
+//     input {
+//       key: "b"
+//       value: "0.1,0.2,0.3,4.3,56.4"
+//     }
+//     output {
+//       key: "x"
+//       value: "12,3,4,545,3,6"
+//     }
+//     output {
+//       key: "y"
+//       value: "0.01,0.02"
+//     }
+//     output_shape {
+//       key: "x"
+//       value: "2,3"
+//     }
+//     output_shape {
+//       key: "y"
+//       value: "1"
+//     }
 //   }
 bool ParseAndRunTests(std::istream* input, TestRunner* test_runner,
                       int max_invocations = -1);
diff --git a/tensorflow/lite/testing/test_runner.h b/tensorflow/lite/testing/test_runner.h
index bb278d59c2cdcb..60c79ce02b3d14 100644
--- a/tensorflow/lite/testing/test_runner.h
+++ b/tensorflow/lite/testing/test_runner.h
@@ -65,55 +65,12 @@ class TestRunner {
       const std::vector<std::pair<string, string>>& expected_outputs,
       const std::vector<std::pair<string, string>>& expected_output_shapes) = 0;
 
-  // The following methods access tensors by index.
-  // DEPRECATED: use methods with signature instead.
-  // TODO(b/205171855): Clean up the functions after no usages.
-  //
-  // Returns the list of input tensors in the loaded model.
-  virtual const std::vector<int>& GetInputs() = 0;
-
-  // Returns the list of output tensors in the loaded model.
-  virtual const std::vector<int>& GetOutputs() = 0;
-
-  // Prepares for a run by resize the given tensor. The given 'id' is
-  // guaranteed to be one of the ids returned by GetInputs().
-  virtual void ReshapeTensor(int id, const string& csv_values) = 0;
+  // Returns the list of output names in the loaded model for given signature.
+  virtual std::vector<string> GetOutputNames() = 0;
 
   // Reserves memory for all tensors.
   virtual void AllocateTensors() = 0;
 
-  // Sets the given tensor to some initial state, usually zero. This is
-  // used to reset persistent buffers in a model.
-  virtual void ResetTensor(int id) = 0;
-
-  // Defines the contents of the given input tensor. The given 'id' is
-  // guaranteed to be one of the ids returned by GetInputs().
-  virtual void SetInput(int id, const string& values_as_string) = 0;
-
-  // Defines what should be expected data for an output tensor after Invoke()
-  // runs.
-  // The given 'id' is guaranteed to be one of the ids returned by
-  // GetOutputs().
-  virtual void SetExpectation(int id, const string& values_as_string) = 0;
-
-  // Defines what should be expected shape for an output tensor after Invoke()
-  // runs.
-  // The given 'id' is guaranteed to be one of the ids returned by
-  // GetOutputs().
-  virtual void SetShapeExpectation(int id, const string& values_as_string) = 0;
-
-  // Runs the model.
-  virtual void Invoke() = 0;
-
-  // Verifies that the contents of all outputs conform to the existing
-  // expectations. Return true if there are no expectations or they are all
-  // satisfied.
-  virtual bool CheckResults() = 0;
-
-  // Reads the value of the output tensor and format it into a csv string.
-  // The given 'id' is guaranteed to be one of the ids returned by GetOutputs().
-  virtual string ReadOutput(int id) = 0;
-
   // Sets the base path for loading models.
   void SetModelBaseDir(const string& path) {
     model_base_dir_ = path;
diff --git a/tensorflow/lite/testing/test_runner_test.cc b/tensorflow/lite/testing/test_runner_test.cc
index 5a19ad6e36bc2b..4f2478b0f71fc6 100644
--- a/tensorflow/lite/testing/test_runner_test.cc
+++ b/tensorflow/lite/testing/test_runner_test.cc
@@ -24,17 +24,7 @@ namespace {
 class ConcreteTestRunner : public TestRunner {
  public:
   void LoadModel(const string& bin_file_path) override {}
-  const std::vector<int>& GetInputs() override { return ids_; }
-  const std::vector<int>& GetOutputs() override { return ids_; }
-  void ReshapeTensor(int id, const string& csv_values) override {}
   void AllocateTensors() override {}
-  void ResetTensor(int id) override {}
-  void SetInput(int id, const string& csv_values) override {}
-  void SetExpectation(int id, const string& csv_values) override {}
-  void SetShapeExpectation(int id, const string& csv_values) override {}
-  string ReadOutput(int id) override { return ""; }
-  void Invoke() override {}
-  bool CheckResults() override { return true; }
   bool CheckFloatSizes(size_t bytes, size_t values) {
     return CheckSizes<float>(bytes, values);
   }
@@ -50,6 +40,7 @@ class ConcreteTestRunner : public TestRunner {
       override {
     return true;
   }
+  std::vector<string> GetOutputNames() override { return {}; }
 
  private:
   std::vector<int> ids_;
diff --git a/tensorflow/lite/testing/tf_driver.cc b/tensorflow/lite/testing/tf_driver.cc
index fadcc6402d6647..62696cc5798d80 100644
--- a/tensorflow/lite/testing/tf_driver.cc
+++ b/tensorflow/lite/testing/tf_driver.cc
@@ -153,19 +153,52 @@ void TfDriver::LoadModel(const string& bin_file_path) {
 }
 
 void TfDriver::ReshapeTensor(const string& name, const string& csv_values) {
-  ReshapeTensor(input_name_to_id_[name], csv_values);
+  if (!IsValid()) return;
+  int id = input_name_to_id_[name];
+  input_shapes_[id] = Split<int64_t>(csv_values, ",");
+  input_tensors_[input_names_[id]] =
+      CreateTensor(input_types_[id], input_shapes_[id]);
+  ResetTensor(name);
 }
+
 void TfDriver::ResetTensor(const std::string& name) {
-  ResetTensor(input_name_to_id_[name]);
+  if (!IsValid()) return;
+  int id = input_name_to_id_[name];
+  auto tensor = input_tensors_[input_names_[id]];
+  switch (input_types_[id]) {
+    case tensorflow::DT_FLOAT: {
+      FillTensorWithZeros<float>(&tensor);
+      break;
+    }
+    case tensorflow::DT_INT32: {
+      FillTensorWithZeros<int32_t>(&tensor);
+      break;
+    }
+    default:
+      Invalidate(absl::StrCat("Unsupported tensor type ", input_types_[id],
+                              tensorflow::DataType_Name(input_types_[id]),
+                              " in ResetInput"));
+      return;
+  }
 }
 string TfDriver::ReadOutput(const string& name) {
-  return ReadOutput(output_name_to_id_[name]);
+  if (!IsValid()) return "";
+  return ReadOutput(output_tensors_[output_name_to_id_[name]]);
 }
 void TfDriver::Invoke(const std::vector<std::pair<string, string>>& inputs) {
+  if (!IsValid()) return;
   for (const auto& input : inputs) {
-    SetInput(input_name_to_id_[input.first], input.second);
+    auto id = input_name_to_id_[input.first];
+    auto tensor = CreateTensor(input_types_[id], input_shapes_[id]);
+    SetInput(input.second, &tensor);
+    input_tensors_[input_names_[id]] = tensor;
+  }
+  auto status = session_->Run({input_tensors_.begin(), input_tensors_.end()},
+                              output_names_, {}, &output_tensors_);
+  if (!status.ok()) {
+    Invalidate(absl::StrCat("TensorFlow failed to run graph:",
+                            status.error_message()));
   }
-  Invoke();
 }
 
 void TfDriver::SetInput(const string& values_as_string,
@@ -206,40 +239,6 @@ void TfDriver::SetInput(const string& values_as_string,
   }
 }
 
-void TfDriver::SetInput(int id, const string& values_as_string) {
-  if (!IsValid()) return;
-  auto tensor = CreateTensor(input_types_[id], input_shapes_[id]);
-  SetInput(values_as_string, &tensor);
-  input_tensors_[input_names_[id]] = tensor;
-}
-
-void TfDriver::ResetTensor(int id) {
-  if (!IsValid()) return;
-  auto tensor = input_tensors_[input_names_[id]];
-  switch (input_types_[id]) {
-    case tensorflow::DT_FLOAT: {
-      FillTensorWithZeros<float>(&tensor);
-      break;
-    }
-    case tensorflow::DT_INT32: {
-      FillTensorWithZeros<int32_t>(&tensor);
-      break;
-    }
-    default:
-      Invalidate(absl::StrCat("Unsupported tensor type ", input_types_[id],
-                              tensorflow::DataType_Name(input_types_[id]),
-                              " in ResetInput"));
-      return;
-  }
-}
-
-void TfDriver::ReshapeTensor(int id, const string& values_as_string) {
-  input_shapes_[id] = Split<int64_t>(values_as_string, ",");
-  input_tensors_[input_names_[id]] =
-      CreateTensor(input_types_[id], input_shapes_[id]);
-  ResetTensor(id);
-}
-
 string TfDriver::ReadOutput(const tensorflow::Tensor& tensor) {
   switch (tensor.dtype()) {
     case tensorflow::DT_FLOAT:
@@ -264,20 +263,5 @@ string TfDriver::ReadOutput(const tensorflow::Tensor& tensor) {
   }
 }
 
-string TfDriver::ReadOutput(int id) {
-  if (!IsValid()) return "";
-  return ReadOutput(output_tensors_[id]);
-}
-
-void TfDriver::Invoke() {
-  if (!IsValid()) return;
-  auto status = session_->Run({input_tensors_.begin(), input_tensors_.end()},
-                              output_names_, {}, &output_tensors_);
-  if (!status.ok()) {
-    Invalidate(absl::StrCat("TensorFlow failed to run graph:",
-                            status.error_message()));
-  }
-}
-
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/lite/testing/tf_driver.h b/tensorflow/lite/testing/tf_driver.h
index ceb7a53658a33b..0d9c6c20d4f3ef 100644
--- a/tensorflow/lite/testing/tf_driver.h
+++ b/tensorflow/lite/testing/tf_driver.h
@@ -58,25 +58,10 @@ class TfDriver : public TestRunner {
       override {
     return true;
   }
-
-  void SetInput(int id, const string& values_as_string) override;
-  void Invoke() override;
-  string ReadOutput(int id) override;
-
-  const std::vector<int>& GetInputs() override { return input_ids_; }
-  const std::vector<int>& GetOutputs() override { return output_ids_; }
-  void ReshapeTensor(int id, const string& values_as_string) override;
-  // Note: ResetTensor only works for input tensor.
-  void ResetTensor(int id) override;
+  std::vector<string> GetOutputNames() override { return output_names_; }
 
   // no-op. SetInput will overwrite existing data .
   void AllocateTensors() override {}
-  // no-op. Tf driver is not supposed to check the results.
-  void SetExpectation(int id, const string& values_as_string) override {}
-  // no-op. Tf driver is not supposed to check the results.
-  void SetShapeExpectation(int id, const string& values_as_string) override {}
-  // tf driver is not supposed to check the results.
-  bool CheckResults() override { return false; }
 
  protected:
   void SetInput(const string& values_as_string, tensorflow::Tensor*);
diff --git a/tensorflow/lite/testing/tf_driver_test.cc b/tensorflow/lite/testing/tf_driver_test.cc
index e62035a04bac2e..1bdb9c8117acd0 100644
--- a/tensorflow/lite/testing/tf_driver_test.cc
+++ b/tensorflow/lite/testing/tf_driver_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/lite/testing/tf_driver.h"
 
 #include <algorithm>
+#include <string>
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
@@ -25,8 +26,6 @@ namespace tflite {
 namespace testing {
 namespace {
 
-using ::testing::ElementsAre;
-
 class TestDriver : public TfDriver {
  public:
   // No need for a full TfDriver. We just want to test the read/write methods.
@@ -88,7 +87,7 @@ TEST(TfDriverTest, ReadingAndWritingValuesStrings) {
             buffer);
 }
 
-TEST(TfDriverTest, SimpleTestBySignature) {
+TEST(TfDriverTest, SimpleTest) {
   std::unique_ptr<TfDriver> runner(
       new TfDriver({"a", "b", "c", "d"}, {"float", "float", "float", "float"},
                    {"1,8,8,3", "1,8,8,3", "1,8,8,3", "1,8,8,3"}, {"x", "y"}));
@@ -111,36 +110,6 @@ TEST(TfDriverTest, SimpleTestBySignature) {
             "0.0109999999,0.0219999999,0.0329999998,0.0439999998");
 }
 
-TEST(TfDriverTest, SimpleTestById) {
-  std::unique_ptr<TfDriver> runner(
-      new TfDriver({"a", "b", "c", "d"}, {"float", "float", "float", "float"},
-                   {"1,8,8,3", "1,8,8,3", "1,8,8,3", "1,8,8,3"}, {"x", "y"}));
-
-  runner->LoadModel(
-      "tensorflow/lite/testdata/multi_add.pb");
-  EXPECT_TRUE(runner->IsValid()) << runner->GetErrorMessage();
-
-  ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
-  ASSERT_THAT(runner->GetOutputs(), ElementsAre(0, 1));
-
-  for (int i : {0, 1, 2, 3}) {
-    runner->ReshapeTensor(i, "1,2,2,1");
-  }
-  ASSERT_TRUE(runner->IsValid());
-
-  runner->SetInput(0, "0.1,0.2,0.3,0.4");
-  runner->SetInput(1, "0.001,0.002,0.003,0.004");
-  runner->SetInput(2, "0.001,0.002,0.003,0.004");
-  runner->SetInput(3, "0.01,0.02,0.03,0.04");
-  runner->ResetTensor(2);
-  runner->Invoke();
-
-  ASSERT_EQ(runner->ReadOutput(0),
-            "0.101000004,0.202000007,0.303000003,0.404000014");
-  ASSERT_EQ(runner->ReadOutput(1),
-            "0.0109999999,0.0219999999,0.0329999998,0.0439999998");
-}
-
 }  // namespace
 }  // namespace testing
 }  // namespace tflite
diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc
index 1ff4bf93f3189b..d2085b4f4772d3 100644
--- a/tensorflow/lite/testing/tflite_driver.cc
+++ b/tensorflow/lite/testing/tflite_driver.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <iterator>
 #include <memory>
 #include <string>
+#include <utility>
 #include <vector>
 
 #include "absl/strings/escaping.h"
@@ -480,29 +481,12 @@ void TfLiteDriver::LoadModel(const string& bin_file_path,
 
   must_allocate_tensors_ = true;
 
-  // The order of inputs and outputs must match the order in "*_tests.txt" and
-  // "*.inputs" files.
-  // TODO(b/192473002): Run the interpreter using signature instead of indexes.
   signature_runner_ = interpreter_->GetSignatureRunner(signature.c_str());
   if (signature_runner_) {
     signature_inputs_ = interpreter_->signature_inputs(signature.c_str());
-    for (const char* name : signature_runner_->input_names()) {
-      inputs_.push_back(signature_inputs_.at(name));
-    }
     signature_outputs_ = interpreter_->signature_outputs(signature.c_str());
-    for (const char* name : signature_runner_->output_names()) {
-      outputs_.push_back(signature_outputs_.at(name));
-    }
-  }
-
-  // Uses the default order when there is no signature.
-  if (inputs_.empty()) {
-    inputs_.insert(inputs_.end(), interpreter_->inputs().begin(),
-                   interpreter_->inputs().end());
-  }
-  if (outputs_.empty()) {
-    outputs_.insert(outputs_.end(), interpreter_->outputs().begin(),
-                    interpreter_->outputs().end());
+  } else {
+    Invalidate("Unable to the fetch signature runner.");
   }
 }
 
@@ -511,7 +495,7 @@ void TfLiteDriver::LoadModel(const string& bin_file_path) {
 }
 
 void TfLiteDriver::ReshapeTensor(const string& name, const string& csv_values) {
-  if (!(IsValid() && signature_runner_)) return;
+  if (!IsValid()) return;
   if (signature_runner_->ResizeInputTensor(
           name.c_str(), testing::Split<int>(csv_values, ",")) != kTfLiteOk) {
     Invalidate("Failed to resize input tensor " + name);
@@ -521,16 +505,16 @@ void TfLiteDriver::ReshapeTensor(const string& name, const string& csv_values) {
 }
 
 void TfLiteDriver::ResetTensor(const std::string& name) {
-  if (!(IsValid() && signature_runner_)) return;
+  if (!IsValid()) return;
   auto* tensor = signature_runner_->input_tensor(name.c_str());
   memset(tensor->data.raw, 0, tensor->bytes);
 }
 
 void TfLiteDriver::Invoke(
     const std::vector<std::pair<string, string>>& inputs) {
-  if (!(IsValid() && signature_runner_)) return;
+  if (!IsValid()) return;
   for (const auto& input : inputs) {
-    SetInput(signature_inputs_[input.first], input.second);
+    SetInput(input.first, input.second);
   }
   if (signature_runner_->Invoke() != kTfLiteOk) {
     Invalidate("Failed to invoke interpreter");
@@ -538,42 +522,69 @@ void TfLiteDriver::Invoke(
 }
 
 string TfLiteDriver::ReadOutput(const string& name) {
-  if (!(IsValid() && signature_runner_)) return "";
+  if (!IsValid()) return "";
   return TensorValueToCsvString(signature_runner_->output_tensor(name.c_str()));
 }
 
 bool TfLiteDriver::CheckResults(
     const std::vector<std::pair<string, string>>& expected_outputs,
     const std::vector<std::pair<string, string>>& expected_output_shapes) {
-  if (!(IsValid() && signature_runner_)) return false;
+  if (!IsValid()) return false;
+  bool success = true;
   for (const auto& output : expected_outputs) {
-    SetExpectation(signature_outputs_[output.first], output.second);
+    SetExpectation(output.first, output.second);
   }
   for (const auto& shape : expected_output_shapes) {
-    SetShapeExpectation(signature_outputs_[shape.first], shape.second);
+    SetShapeExpectation(shape.first, shape.second);
   }
-  return CheckResults();
-}
 
-void TfLiteDriver::ResetTensor(int id) {
-  if (!IsValid()) return;
-  auto* tensor = interpreter_->tensor(id);
-  memset(tensor->data.raw, 0, tensor->bytes);
+  for (const auto& p : expected_output_) {
+    int id = p.first;
+    auto* tensor = interpreter_->tensor(id);
+    if (!p.second->Check(/*verbose=*/false, *tensor)) {
+      // Do not invalidate anything here. Instead, simply output the
+      // differences and return false. Invalidating would prevent all
+      // subsequent invocations from running..
+      std::cerr << "TfLiteDriver: There were errors in invocation '"
+                << GetInvocationId() << "', validating output tensor '" << id
+                << "':" << std::endl;
+      p.second->Check(/*verbose=*/true, *tensor);
+      success = false;
+      SetOverallSuccess(false);
+    }
+  }
+  for (const auto& p : expected_output_shape_) {
+    int id = p.first;
+    auto* tensor = interpreter_->tensor(id);
+    if (!p.second->CheckShape(/*verbose=*/false, *tensor)) {
+      // Do not invalidate anything here. Instead, simply output the
+      // differences and return false. Invalidating would prevent all
+      // subsequent invocations from running..
+      std::cerr << "TfLiteDriver: There were errors in invocation '"
+                << GetInvocationId()
+                << "', validating the shape of output tensor '" << id
+                << "':" << std::endl;
+      p.second->CheckShape(/*verbose=*/true, *tensor);
+      success = false;
+      SetOverallSuccess(false);
+    }
+  }
+  expected_output_.clear();
+  return success;
 }
 
-void TfLiteDriver::ReshapeTensor(int id, const string& csv_values) {
-  if (!IsValid()) return;
-  if (interpreter_->ResizeInputTensor(
-          id, testing::Split<int>(csv_values, ",")) != kTfLiteOk) {
-    Invalidate("Failed to resize input tensor " + std::to_string(id));
-    return;
+std::vector<string> TfLiteDriver::GetOutputNames() {
+  if (!IsValid()) return {};
+  std::vector<string> names;
+  for (const auto* name : signature_runner_->output_names()) {
+    names.push_back(name);
   }
-  must_allocate_tensors_ = true;
+  return names;
 }
 
-void TfLiteDriver::SetInput(int id, const string& csv_values) {
-  if (!IsValid()) return;
-  auto* tensor = interpreter_->tensor(id);
+void TfLiteDriver::SetInput(const string& name, const string& csv_values) {
+  auto id = signature_inputs_[name];
+  auto* tensor = signature_runner_->input_tensor(name.c_str());
   switch (tensor->type) {
     case kTfLiteFloat64: {
       const auto& values = testing::Split<double>(csv_values, ",");
@@ -678,9 +689,10 @@ void TfLiteDriver::SetQuantizationErrorMultiplier(
   quantization_error_multiplier_ = quantization_error_multiplier;
 }
 
-void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
-  if (!IsValid()) return;
-  auto* tensor = interpreter_->tensor(id);
+void TfLiteDriver::SetExpectation(const string& name,
+                                  const string& csv_values) {
+  auto id = signature_outputs_[name];
+  auto* tensor = signature_runner_->output_tensor(name.c_str());
   if (expected_output_.count(id) != 0) {
     Invalidate(absl::StrCat("Overridden expectation for tensor '", id, "'"));
   }
@@ -741,8 +753,9 @@ void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
   }
 }
 
-void TfLiteDriver::SetShapeExpectation(int id, const string& csv_values) {
-  if (!IsValid()) return;
+void TfLiteDriver::SetShapeExpectation(const string& name,
+                                       const string& csv_values) {
+  auto id = signature_outputs_[name];
   if (expected_output_shape_.count(id) != 0) {
     Invalidate(
         absl::StrCat("Overridden shape expectation for tensor '", id, "'"));
@@ -750,59 +763,10 @@ void TfLiteDriver::SetShapeExpectation(int id, const string& csv_values) {
   expected_output_shape_[id].reset(new ShapeExpectation(csv_values));
 }
 
-void TfLiteDriver::Invoke() {
-  if (!IsValid()) return;
-  if (interpreter_->Invoke() != kTfLiteOk) {
-    Invalidate("Failed to invoke interpreter");
-  }
-}
-
-bool TfLiteDriver::CheckResults() {
-  if (!IsValid()) return false;
-  bool success = true;
-  for (const auto& p : expected_output_) {
-    int id = p.first;
-    auto* tensor = interpreter_->tensor(id);
-    if (!p.second->Check(/*verbose=*/false, *tensor)) {
-      // Do not invalidate anything here. Instead, simply output the
-      // differences and return false. Invalidating would prevent all
-      // subsequent invocations from running..
-      std::cerr << "TfLiteDriver: There were errors in invocation '"
-                << GetInvocationId() << "', validating output tensor '" << id
-                << "':" << std::endl;
-      p.second->Check(/*verbose=*/true, *tensor);
-      success = false;
-      SetOverallSuccess(false);
-    }
-  }
-  for (const auto& p : expected_output_shape_) {
-    int id = p.first;
-    auto* tensor = interpreter_->tensor(id);
-    if (!p.second->CheckShape(/*verbose=*/false, *tensor)) {
-      // Do not invalidate anything here. Instead, simply output the
-      // differences and return false. Invalidating would prevent all
-      // subsequent invocations from running..
-      std::cerr << "TfLiteDriver: There were errors in invocation '"
-                << GetInvocationId()
-                << "', validating the shape of output tensor '" << id
-                << "':" << std::endl;
-      p.second->CheckShape(/*verbose=*/true, *tensor);
-      success = false;
-      SetOverallSuccess(false);
-    }
-  }
-  expected_output_.clear();
-  return success;
-}
-
 void TfLiteDriver::ResetLSTMStateTensors() {
   interpreter_->ResetVariableTensors();
 }
 
-string TfLiteDriver::ReadOutput(int id) {
-  return TensorValueToCsvString(interpreter_->tensor(id));
-}
-
 string TfLiteDriver::TensorValueToCsvString(const TfLiteTensor* tensor) {
   int num_elements = 1;
 
diff --git a/tensorflow/lite/testing/tflite_driver.h b/tensorflow/lite/testing/tflite_driver.h
index 0ed7a1db603082..862bec5645feb6 100644
--- a/tensorflow/lite/testing/tflite_driver.h
+++ b/tensorflow/lite/testing/tflite_driver.h
@@ -67,18 +67,9 @@ class TfLiteDriver : public TestRunner {
       const std::vector<std::pair<string, string>>& expected_outputs,
       const std::vector<std::pair<string, string>>& expected_output_shapes)
       override;
+  std::vector<string> GetOutputNames() override;
 
-  const std::vector<int>& GetInputs() override { return inputs_; }
-  const std::vector<int>& GetOutputs() override { return outputs_; }
-  void ReshapeTensor(int id, const string& csv_values) override;
   void AllocateTensors() override;
-  void ResetTensor(int id) override;
-  void SetInput(int id, const string& csv_values) override;
-  void SetExpectation(int id, const string& csv_values) override;
-  void SetShapeExpectation(int id, const string& csv_values) override;
-  void Invoke() override;
-  bool CheckResults() override;
-  string ReadOutput(int id) override;
   void SetThreshold(double relative_threshold, double absolute_threshold);
   void SetQuantizationErrorMultiplier(int quantization_error_multiplier);
 
@@ -86,6 +77,9 @@ class TfLiteDriver : public TestRunner {
   Interpreter::TfLiteDelegatePtr delegate_;
 
  private:
+  void SetInput(const string& name, const string& csv_values);
+  void SetExpectation(const string& name, const string& csv_values);
+  void SetShapeExpectation(const string& name, const string& csv_values);
   void DeallocateStringTensor(TfLiteTensor* t) {
     if (t) {
       free(t->data.raw);
@@ -105,8 +99,6 @@ class TfLiteDriver : public TestRunner {
   class DataExpectation;
   class ShapeExpectation;
 
-  std::vector<int> inputs_;
-  std::vector<int> outputs_;
   std::map<string, uint32_t> signature_inputs_;
   std::map<string, uint32_t> signature_outputs_;
   std::unique_ptr<OpResolver> resolver_;
diff --git a/tensorflow/lite/testing/tflite_driver_test.cc b/tensorflow/lite/testing/tflite_driver_test.cc
index 939fc09572c876..8aebfde2564004 100644
--- a/tensorflow/lite/testing/tflite_driver_test.cc
+++ b/tensorflow/lite/testing/tflite_driver_test.cc
@@ -24,47 +24,14 @@ namespace {
 using ::testing::ElementsAre;
 
 TEST(TfliteDriverTest, SimpleTest) {
-  std::unique_ptr<TestRunner> runner(new TfLiteDriver());
-
-  runner->SetModelBaseDir("tensorflow/lite");
-  runner->LoadModel("testdata/multi_add.bin");
-  ASSERT_TRUE(runner->IsValid());
-
-  ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
-  ASSERT_THAT(runner->GetOutputs(), ElementsAre(5, 6));
-
-  for (int i : {0, 1, 2, 3}) {
-    runner->ReshapeTensor(i, "1,2,2,1");
-  }
-  ASSERT_TRUE(runner->IsValid());
-
-  runner->AllocateTensors();
-
-  runner->SetInput(0, "0.1,0.2,0.3,0.4");
-  runner->SetInput(1, "0.001,0.002,0.003,0.004");
-  runner->SetInput(2, "0.001,0.002,0.003,0.004");
-  runner->SetInput(3, "0.01,0.02,0.03,0.04");
-
-  runner->ResetTensor(2);
-
-  runner->SetExpectation(5, "0.101,0.202,0.303,0.404");
-  runner->SetExpectation(6, "0.011,0.022,0.033,0.044");
-
-  runner->Invoke();
-  ASSERT_TRUE(runner->IsValid());
-
-  ASSERT_TRUE(runner->CheckResults());
-  EXPECT_EQ(runner->ReadOutput(5), "0.101,0.202,0.303,0.404");
-  EXPECT_EQ(runner->ReadOutput(6), "0.011,0.022,0.033,0.044");
-}
-
-TEST(TfliteDriverTest, SimpleTestWithSignature) {
   std::unique_ptr<TestRunner> runner(new TfLiteDriver);
 
   runner->SetModelBaseDir("tensorflow/lite");
-  runner->LoadModel("testdata/multi_add_signature.bin", "serving_default");
+  runner->LoadModel("testdata/multi_add.bin", "serving_default");
   ASSERT_TRUE(runner->IsValid());
 
+  ASSERT_THAT(runner->GetOutputNames(), ElementsAre("x", "y"));
+
   for (const auto& i : {"a", "b", "c", "d"}) {
     runner->ReshapeTensor(i, "1,2,2,1");
   }
@@ -96,32 +63,24 @@ TEST(TfliteDriverTest, SingleAddOpTest) {
   runner->LoadModel("testdata/multi_add.bin");
   ASSERT_TRUE(runner->IsValid());
 
-  ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
-  ASSERT_THAT(runner->GetOutputs(), ElementsAre(5, 6));
-
-  for (int i : {0, 1, 2, 3}) {
+  for (const auto& i : {"a", "b", "c", "d"}) {
     runner->ReshapeTensor(i, "1,2,2,1");
   }
   ASSERT_TRUE(runner->IsValid());
 
   runner->AllocateTensors();
 
-  runner->SetInput(0, "0.1,0.2,0.3,0.4");
-  runner->SetInput(1, "0.001,0.002,0.003,0.004");
-  runner->SetInput(2, "0.001,0.002,0.003,0.004");
-  runner->SetInput(3, "0.01,0.02,0.03,0.04");
-
-  runner->ResetTensor(2);
-
-  runner->SetExpectation(5, "0.101,0.202,0.303,0.404");
-  runner->SetExpectation(6, "0.011,0.022,0.033,0.044");
-
-  runner->Invoke();
+  runner->ResetTensor("c");
+  runner->Invoke({{"a", "0.1,0.2,0.3,0.4"},
+                  {"b", "0.001,0.002,0.003,0.004"},
+                  {"d", "0.01,0.02,0.03,0.04"}});
   ASSERT_TRUE(runner->IsValid());
 
-  ASSERT_TRUE(runner->CheckResults());
-  EXPECT_EQ(runner->ReadOutput(5), "0.101,0.202,0.303,0.404");
-  EXPECT_EQ(runner->ReadOutput(6), "0.011,0.022,0.033,0.044");
+  ASSERT_TRUE(runner->CheckResults(
+      {{"x", "0.101,0.202,0.303,0.404"}, {"y", "0.011,0.022,0.033,0.044"}},
+      /*expected_output_shapes=*/{}));
+  EXPECT_EQ(runner->ReadOutput("x"), "0.101,0.202,0.303,0.404");
+  EXPECT_EQ(runner->ReadOutput("y"), "0.011,0.022,0.033,0.044");
 }
 
 TEST(TfliteDriverTest, AddOpWithNaNTest) {
@@ -133,32 +92,25 @@ TEST(TfliteDriverTest, AddOpWithNaNTest) {
   runner->LoadModel("testdata/multi_add.bin");
   ASSERT_TRUE(runner->IsValid());
 
-  ASSERT_THAT(runner->GetInputs(), ElementsAre(0, 1, 2, 3));
-  ASSERT_THAT(runner->GetOutputs(), ElementsAre(5, 6));
-
-  for (int i : {0, 1, 2, 3}) {
+  for (const auto& i : {"a", "b", "c", "d"}) {
     runner->ReshapeTensor(i, "1,2,2,1");
   }
+
   ASSERT_TRUE(runner->IsValid());
 
   runner->AllocateTensors();
 
-  runner->SetInput(0, "0.1,nan,0.3,0.4");
-  runner->SetInput(1, "0.001,0.002,0.003,0.004");
-  runner->SetInput(2, "0.001,0.002,0.003,0.004");
-  runner->SetInput(3, "0.01,0.02,0.03,nan");
-
-  runner->ResetTensor(2);
-
-  runner->SetExpectation(5, "0.101,nan,0.303,0.404");
-  runner->SetExpectation(6, "0.011,0.022,0.033,nan");
-
-  runner->Invoke();
+  runner->ResetTensor("c");
+  runner->Invoke({{"a", "0.1,nan,0.3,0.4"},
+                  {"b", "0.001,0.002,0.003,0.004"},
+                  {"d", "0.01,0.02,0.03,nan"}});
   ASSERT_TRUE(runner->IsValid());
 
-  ASSERT_TRUE(runner->CheckResults());
-  EXPECT_EQ(runner->ReadOutput(5), "0.101,nan,0.303,0.404");
-  EXPECT_EQ(runner->ReadOutput(6), "0.011,0.022,0.033,nan");
+  ASSERT_TRUE(runner->CheckResults(
+      {{"x", "0.101,nan,0.303,0.404"}, {"y", "0.011,0.022,0.033,nan"}},
+      /*expected_output_shapes=*/{}));
+  EXPECT_EQ(runner->ReadOutput("x"), "0.101,nan,0.303,0.404");
+  EXPECT_EQ(runner->ReadOutput("y"), "0.011,0.022,0.033,nan");
 }
 
 TEST(TfliteDriverTest, AddQuantizedInt8Test) {
@@ -168,23 +120,16 @@ TEST(TfliteDriverTest, AddQuantizedInt8Test) {
   runner->LoadModel("testdata/add_quantized_int8.bin");
   ASSERT_TRUE(runner->IsValid());
 
-  ASSERT_THAT(runner->GetInputs(), ElementsAre(1));
-  ASSERT_THAT(runner->GetOutputs(), ElementsAre(2));
-
-  runner->ReshapeTensor(1, "1,2,2,1");
+  runner->ReshapeTensor("a", "1,2,2,1");
   ASSERT_TRUE(runner->IsValid());
 
   runner->AllocateTensors();
 
-  runner->SetInput(1, "1,1,1,1");
-
-  runner->SetExpectation(2, "0.0117,0.0117,0.0117,0.0117");
-
-  runner->Invoke();
+  runner->Invoke({{"a", "1,1,1,1"}});
   ASSERT_TRUE(runner->IsValid());
 
-  ASSERT_TRUE(runner->CheckResults());
-  EXPECT_EQ(runner->ReadOutput(2), "3,3,3,3");
+  ASSERT_TRUE(runner->CheckResults({{"x", "0.0117,0.0117,0.0117,0.0117"}}, {}));
+  EXPECT_EQ(runner->ReadOutput("x"), "3,3,3,3");
 }
 
 }  // namespace
diff --git a/tensorflow/lite/testing/zip_test_utils.py b/tensorflow/lite/testing/zip_test_utils.py
index 9020addb99d9db..67026d40b77712 100644
--- a/tensorflow/lite/testing/zip_test_utils.py
+++ b/tensorflow/lite/testing/zip_test_utils.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 # ==============================================================================
 """Utils for make_zip tests."""
-import collections
 import functools
 import itertools
 import operator
@@ -79,6 +78,7 @@ def get_test_function(test_function_name):
     tf.int32: (np.int32, "INT32"),
     tf.uint32: (np.uint32, "UINT32"),
     tf.uint8: (np.uint8, "QUANTIZED_UINT8"),
+    tf.int8: (np.int8, "INT8"),
     tf.int16: (np.int16, "QUANTIZED_INT16"),
     tf.int64: (np.int64, "INT64"),
     tf.bool: (np.bool_, "BOOL"),
@@ -116,7 +116,7 @@ def create_tensor_data(dtype, shape, min_value=-100, max_value=100):
     real = (max_value - min_value) * np.random.random_sample(shape) + min_value
     imag = (max_value - min_value) * np.random.random_sample(shape) + min_value
     value = real + imag * 1j
-  elif dtype in (tf.uint32, tf.int32, tf.uint8, tf.int64, tf.int16):
+  elif dtype in (tf.uint32, tf.int32, tf.uint8, tf.int8, tf.int64, tf.int16):
     value = np.random.randint(min_value, max_value + 1, shape)
   elif dtype == tf.bool:
     value = np.random.choice([True, False], size=shape)
@@ -201,6 +201,36 @@ def write_tensor(fp, name, x):
       write_tensor(fp, name, value)
 
 
+class TextFormatWriter(object):
+  """Utility class for writing ProtoBuf like messages."""
+
+  def __init__(self, fp, name=None, parent=None):
+    self.fp = fp
+    self.indent = parent.indent if parent else 0
+    self.name = name
+
+  def __enter__(self):
+    if self.name:
+      self.write(self.name + " {")
+      self.indent += 2
+    return self
+
+  def __exit__(self, *exc_info):
+    if self.name:
+      self.indent -= 2
+      self.write("}")
+    return True
+
+  def write(self, data):
+    self.fp.write(" " * self.indent + data + "\n")
+
+  def write_field(self, key, val):
+    self.write(key + ": \"" + val + "\"")
+
+  def sub_message(self, name):
+    return TextFormatWriter(self.fp, name, self)
+
+
 def write_test_cases(fp, model_name, examples):
   """Given a dictionary of `examples`, write a text format representation.
 
@@ -211,25 +241,47 @@ def write_test_cases(fp, model_name, examples):
     fp: File-like object to write to.
     model_name: Filename where the model was written to, relative to filename.
     examples: Example dictionary consisting of keys "inputs" and "outputs"
+
+  Raises:
+    RuntimeError: Example dictionary does not have input / output names.
   """
 
-  fp.write("load_model: %s\n" % os.path.basename(model_name))
+  writer = TextFormatWriter(fp)
+  writer.write_field("load_model", os.path.basename(model_name))
   for example in examples:
-    fp.write("reshape {\n")
-    for _, value in example["inputs"].items():
-      if value is not None:
-        fp.write("  input: \"" + ",".join(map(str, value.shape)) + "\"\n")
-    fp.write("}\n")
-
-    fp.write("invoke {\n")
-    for _, value in example["inputs"].items():
-      if value is not None:
-        fp.write("  input: \"" + format_result(value) + "\"\n")
-    for _, value in example["outputs"].items():
-      fp.write("  output: \"" + format_result(value) + "\"\n")
-      fp.write("  output_shape: \"" +
-               ",".join([str(dim) for dim in value.shape]) + "\"\n")
-    fp.write("}\n")
+    inputs = []
+    for name in example["inputs"].keys():
+      if name:
+        inputs.append(name)
+    outputs = []
+    for name in example["outputs"].keys():
+      if name:
+        outputs.append(name)
+    if not (inputs and outputs):
+      raise RuntimeError("Empty input / output names.")
+
+    # Reshape message
+    with writer.sub_message("reshape") as reshape:
+      for name, value in example["inputs"].items():
+        with reshape.sub_message("input") as input_msg:
+          input_msg.write_field("key", name)
+          input_msg.write_field("value", ",".join(map(str, value.shape)))
+
+    # Invoke message
+    with writer.sub_message("invoke") as invoke:
+      for name, value in example["inputs"].items():
+        with invoke.sub_message("input") as input_msg:
+          input_msg.write_field("key", name)
+          input_msg.write_field("value", format_result(value))
+      # Expectations
+      for name, value in example["outputs"].items():
+        with invoke.sub_message("output") as output_msg:
+          output_msg.write_field("key", name)
+          output_msg.write_field("value", format_result(value))
+        with invoke.sub_message("output_shape") as output_shape:
+          output_shape.write_field("key", name)
+          output_shape.write_field("value",
+                                   ",".join([str(dim) for dim in value.shape]))
 
 
 def get_input_shapes_map(input_tensors):
@@ -508,14 +560,6 @@ def build_example(label, param_dict_real, zip_path_label):
           report["tflite_converter"] = report_lib.FAILED
           report["tf"] = report_lib.SUCCESS
 
-          # Sorts the lists to make the order of input/output the same as order
-          # of the signature names.
-          # TODO(b/192473002): Remove sorting after TFLiteDriver can run with
-          # signatures.
-          inputs = sorted(inputs, key=lambda x: _normalize_input_name(x.name))
-          outputs = sorted(
-              outputs, key=lambda x: _normalize_output_name(x.name))
-
           # Builds a saved model with the default signature key.
           input_names, tensor_info_inputs = _get_tensor_info(
               inputs, "input_", _normalize_input_name)
@@ -576,12 +620,6 @@ def build_example(label, param_dict_real, zip_path_label):
           zipinfo = zipfile.ZipInfo(zip_path_label + ".bin")
           archive.writestr(zipinfo, tflite_model_binary, zipfile.ZIP_DEFLATED)
 
-          # TODO(b/192473002): Remove sorting after TFLiteDriver can run with
-          # signatures.
-          baseline_input_map = collections.OrderedDict(
-              sorted(baseline_input_map.items()))
-          baseline_output_map = collections.OrderedDict(
-              sorted(baseline_output_map.items()))
           example = {
               "inputs": baseline_input_map,
               "outputs": baseline_output_map
diff --git a/tensorflow/lite/tools/cmake/modules/Findclog.cmake b/tensorflow/lite/tools/cmake/modules/Findclog.cmake
new file mode 100644
index 00000000000000..6ed7e9ad6d29bc
--- /dev/null
+++ b/tensorflow/lite/tools/cmake/modules/Findclog.cmake
@@ -0,0 +1,16 @@
+#
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(clog)
diff --git a/tensorflow/lite/tools/cmake/modules/Findcpuinfo.cmake b/tensorflow/lite/tools/cmake/modules/Findcpuinfo.cmake
new file mode 100644
index 00000000000000..65d8cf25789e47
--- /dev/null
+++ b/tensorflow/lite/tools/cmake/modules/Findcpuinfo.cmake
@@ -0,0 +1,16 @@
+#
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(cpuinfo)
diff --git a/tensorflow/lite/tools/cmake/modules/clog.cmake b/tensorflow/lite/tools/cmake/modules/clog.cmake
new file mode 100644
index 00000000000000..93f10fb73c515f
--- /dev/null
+++ b/tensorflow/lite/tools/cmake/modules/clog.cmake
@@ -0,0 +1,42 @@
+#
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(TARGET clog OR clog_POPULATED)
+  return()
+endif()
+
+include(OverridableFetchContent)
+
+OverridableFetchContent_Declare(
+  clog
+  GIT_REPOSITORY https://github.com/pytorch/cpuinfo
+  # Sync with tensorflow/third_party/clog/workspace.bzl
+  GIT_TAG d5e37adf1406cf899d7d9ec1d317c47506ccb970
+  GIT_PROGRESS TRUE
+  SOURCE_DIR "${CMAKE_BINARY_DIR}/clog"
+)
+OverridableFetchContent_GetProperties(clog)
+if(NOT clog_POPULATED)
+  OverridableFetchContent_Populate(clog)
+endif()
+
+set(CLOG_SOURCE_DIR "${clog_SOURCE_DIR}" CACHE PATH "CLOG source directory")
+set(CLOG_BUILD_TESTS OFF CACHE BOOL "Disable CLOG tests")
+
+add_subdirectory(
+  "${clog_SOURCE_DIR}/deps/clog"
+  "${clog_BINARY_DIR}"
+  EXCLUDE_FROM_ALL
+)
diff --git a/tensorflow/lite/tools/cmake/modules/cpuinfo.cmake b/tensorflow/lite/tools/cmake/modules/cpuinfo.cmake
new file mode 100644
index 00000000000000..71f7f933828048
--- /dev/null
+++ b/tensorflow/lite/tools/cmake/modules/cpuinfo.cmake
@@ -0,0 +1,45 @@
+#
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(TARGET cpuinfo OR cpuinfo_POPULATED)
+  return()
+endif()
+
+include(OverridableFetchContent)
+
+OverridableFetchContent_Declare(
+  cpuinfo
+  GIT_REPOSITORY https://github.com/pytorch/cpuinfo
+  # Sync with tensorflow/third_party/cpuinfo/workspace.bzl
+  GIT_TAG 5916273f79a21551890fd3d56fc5375a78d1598d
+  GIT_PROGRESS TRUE
+  SOURCE_DIR "${CMAKE_BINARY_DIR}/cpuinfo"
+)
+OverridableFetchContent_GetProperties(cpuinfo)
+if(NOT cpuinfo_POPULATED)
+  OverridableFetchContent_Populate(cpuinfo)
+endif()
+
+set(CPUINFO_SOURCE_DIR "${cpuinfo_SOURCE_DIR}" CACHE PATH "CPUINFO source directory")
+set(CPUINFO_BUILD_TOOLS OFF CACHE BOOL "Disable cpuinfo command-line tools")
+set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL "Disable cpuinfo unit tests")
+set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL "Disable cpuinfo cpuinfo mock tests")
+set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL "Disable cpuinfo micro-benchmarks")
+
+add_subdirectory(
+  "${cpuinfo_SOURCE_DIR}"
+  "${cpuinfo_BINARY_DIR}"
+  EXCLUDE_FROM_ALL
+)
diff --git a/tensorflow/lite/tools/cmake/modules/eigen.cmake b/tensorflow/lite/tools/cmake/modules/eigen.cmake
index 505b73fd68cbb0..acd06abe9d1d40 100644
--- a/tensorflow/lite/tools/cmake/modules/eigen.cmake
+++ b/tensorflow/lite/tools/cmake/modules/eigen.cmake
@@ -23,7 +23,7 @@ OverridableFetchContent_Declare(
   eigen
   GIT_REPOSITORY https://gitlab.com/libeigen/eigen
   # Sync with tensorflow/third_party/eigen3/workspace.bzl
-  GIT_TAG cfdb3ce3f018166a2cb0bfa8b18599c914bf447e
+  GIT_TAG 085c2fc5d53f391afcccce21c45e15f61c827ab1
   # It's not currently (cmake 3.17) possible to shallow clone with a GIT TAG
   # as cmake attempts to git checkout the commit hash after the clone
   # which doesn't work as it's a shallow clone hence a different commit hash.
diff --git a/tensorflow/lite/tools/cmake/modules/ruy.cmake b/tensorflow/lite/tools/cmake/modules/ruy.cmake
index 17e4bbefd4a37f..752f527cf1b714 100644
--- a/tensorflow/lite/tools/cmake/modules/ruy.cmake
+++ b/tensorflow/lite/tools/cmake/modules/ruy.cmake
@@ -35,7 +35,7 @@ endif()
 set(RUY_SOURCE_DIR "${ruy_SOURCE_DIR}" CACHE PATH "RUY source directory")
 
 add_subdirectory(
-  "${CMAKE_CURRENT_LIST_DIR}/ruy"
+  "${ruy_SOURCE_DIR}"
   "${ruy_BINARY_DIR}"
   EXCLUDE_FROM_ALL
 )
diff --git a/tensorflow/lite/tools/cmake/modules/ruy/CMakeLists.txt b/tensorflow/lite/tools/cmake/modules/ruy/CMakeLists.txt
deleted file mode 100644
index a117f4afed8cc1..00000000000000
--- a/tensorflow/lite/tools/cmake/modules/ruy/CMakeLists.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-#
-# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-cmake_minimum_required(VERSION 3.16)
-
-project(ruy CXX)
-
-set(CMAKE_CXX_STANDARD 14)  # Some components require C++14.
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-set(RUY_SOURCE_DIR "" CACHE PATH
-  "Directory that contains the RUY project"
-)
-if(NOT RUY_SOURCE_DIR)
-  message(FATAL_ERROR "Must specify source directory")
-endif()
-
-file(GLOB RUY_SOURCES "${RUY_SOURCE_DIR}/ruy/*.*")
-list(FILTER RUY_SOURCES INCLUDE REGEX ".*\\.(c|cc|h)$")
-list(FILTER RUY_SOURCES EXCLUDE REGEX ".*(test)\\.(c|cc|h)$")
-list(FILTER RUY_SOURCES EXCLUDE REGEX ".*/(benchmark|example|test_.*)\.cc$")
-list(FILTER RUY_SOURCES EXCLUDE REGEX ".*/gtest_wrapper\\.h$")
-
-set_source_files_properties(${RUY_SOURCES}  PROPERTIES LANGUAGE CXX)
-
-add_library(ruy ${RUY_SOURCES})
-target_include_directories(ruy PUBLIC "${RUY_SOURCE_DIR}")
-
diff --git a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
index 1852a6f580c7e5..27341b9b2c8213 100644
--- a/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
+++ b/tensorflow/lite/tools/cmake/modules/xnnpack.cmake
@@ -23,7 +23,7 @@ OverridableFetchContent_Declare(
   xnnpack
   GIT_REPOSITORY https://github.com/google/XNNPACK
   # Sync with tensorflow/workspace2.bzl
-  GIT_TAG b4cde5aebb7676fc85825dab737a6d0dc60a0e23
+  GIT_TAG 0d6a1194ff36f59c79089336d7a3b446c3c6a39d
   GIT_PROGRESS TRUE
   PREFIX "${CMAKE_BINARY_DIR}"
   SOURCE_DIR "${CMAKE_BINARY_DIR}/xnnpack"
diff --git a/tensorflow/lite/tools/delegates/README.md b/tensorflow/lite/tools/delegates/README.md
index a9ea6e3d3470e5..dc5bd540e653c4 100644
--- a/tensorflow/lite/tools/delegates/README.md
+++ b/tensorflow/lite/tools/delegates/README.md
@@ -32,11 +32,11 @@ TFLite delegate.
     choice of each delegate. \
     This option is currently supported by the Hexagon and CoreML delegate.
 *   `delegate_serialize_dir`: `string` (default="") \
-    Directory to be used by delegates for serializing any model data.
-    This allows the delegate to save data into this directory to reduce init
-    time after the first run. Currently supported by NNAPI delegate with
-    specific backends on Android. Note that delegate_serialize_token is also
-    required to enable this feature.
+    Directory to be used by delegates for serializing any model data. This
+    allows the delegate to save data into this directory to reduce init time
+    after the first run. Currently supported by GPU (OpenCL) and NNAPI delegate
+    with specific backends on Android. Note that delegate_serialize_token is
+    also required to enable this feature.
 *   `delegate_serialize_token`: `string` (default="") \
     Model-specific token acting as a namespace for delegate serialization.
     Unique tokens ensure that the delegate doesn't read inapplicable/invalid
diff --git a/tensorflow/lite/tools/delegates/hexagon_delegate_provider.cc b/tensorflow/lite/tools/delegates/hexagon_delegate_provider.cc
index 2124bec2447c88..5fa1324374ecc3 100644
--- a/tensorflow/lite/tools/delegates/hexagon_delegate_provider.cc
+++ b/tensorflow/lite/tools/delegates/hexagon_delegate_provider.cc
@@ -18,10 +18,6 @@ limitations under the License.
 #include "tensorflow/lite/tools/delegates/delegate_provider.h"
 #include "tensorflow/lite/tools/evaluation/utils.h"
 
-#if !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__))
-#define TFLITE_ENABLE_HEXAGON
-#endif
-
 #if defined(TFLITE_ENABLE_HEXAGON)
 #include "tensorflow/lite/delegates/hexagon/hexagon_delegate.h"
 #endif
diff --git a/tensorflow/lite/tools/evaluation/utils.cc b/tensorflow/lite/tools/evaluation/utils.cc
index 855f048c782c1a..11e066dc59bf1a 100644
--- a/tensorflow/lite/tools/evaluation/utils.cc
+++ b/tensorflow/lite/tools/evaluation/utils.cc
@@ -137,16 +137,16 @@ TfLiteDelegatePtr CreateGPUDelegate() {
 
 TfLiteDelegatePtr CreateHexagonDelegate(
     const std::string& library_directory_path, bool profiling) {
-#if !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__))
+#if TFLITE_ENABLE_HEXAGON
   TfLiteHexagonDelegateOptions options = {0};
   options.print_graph_profile = profiling;
   return CreateHexagonDelegate(&options, library_directory_path);
 #else
   return CreateNullDelegate();
-#endif  // defined(__arm__)
+#endif  // TFLITE_ENABLE_HEXAGON
 }
 
-#if !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__))
+#if TFLITE_ENABLE_HEXAGON
 TfLiteDelegatePtr CreateHexagonDelegate(
     const TfLiteHexagonDelegateOptions* options,
     const std::string& library_directory_path) {
diff --git a/tensorflow/lite/tools/evaluation/utils.h b/tensorflow/lite/tools/evaluation/utils.h
index 190c9e3f203bdd..18590efc54d5aa 100644
--- a/tensorflow/lite/tools/evaluation/utils.h
+++ b/tensorflow/lite/tools/evaluation/utils.h
@@ -31,7 +31,7 @@ limitations under the License.
 #include "tensorflow/lite/delegates/gpu/delegate.h"
 #endif
 
-#if !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__))
+#if TFLITE_ENABLE_HEXAGON
 #include "tensorflow/lite/delegates/hexagon/hexagon_delegate.h"
 #endif
 
@@ -77,7 +77,7 @@ TfLiteDelegatePtr CreateGPUDelegate(TfLiteGpuDelegateOptionsV2* options);
 
 TfLiteDelegatePtr CreateHexagonDelegate(
     const std::string& library_directory_path, bool profiling);
-#if !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__))
+#if TFLITE_ENABLE_HEXAGON
 TfLiteDelegatePtr CreateHexagonDelegate(
     const TfLiteHexagonDelegateOptions* options,
     const std::string& library_directory_path);
diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc
index babe9b2f07c39a..f871f5de87f032 100644
--- a/tensorflow/lite/tools/versioning/op_version.cc
+++ b/tensorflow/lite/tools/versioning/op_version.cc
@@ -802,8 +802,11 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) {
       }
       return 2;
     case BuiltinOperator_CAST:
-      if (op_sig.inputs.at(0).type == kTfLiteUInt32 ||
-          op_sig.outputs.at(0).type == kTfLiteUInt32) {
+      if (op_sig.inputs.at(0).type == kTfLiteInt8 ||
+          op_sig.outputs.at(0).type == kTfLiteInt8) {
+        return 3;
+      } else if (op_sig.inputs.at(0).type == kTfLiteUInt32 ||
+                 op_sig.outputs.at(0).type == kTfLiteUInt32) {
         return 2;
       }
       return 1;
diff --git a/tensorflow/lite/tools/versioning/runtime_version.cc b/tensorflow/lite/tools/versioning/runtime_version.cc
index a43e6bce22d94b..1a8b2b5c53b1a7 100644
--- a/tensorflow/lite/tools/versioning/runtime_version.cc
+++ b/tensorflow/lite/tools/versioning/runtime_version.cc
@@ -95,6 +95,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code,
               {{BuiltinOperator_BATCH_TO_SPACE_ND, 3}, "2.3.0"},
               {{BuiltinOperator_CAST, 1}, "1.5.0"},
               {{BuiltinOperator_CAST, 2}, "2.7.0"},
+              {{BuiltinOperator_CAST, 3}, "2.8.0"},
               {{BuiltinOperator_CONCATENATION, 1}, "1.5.0"},
               {{BuiltinOperator_CONCATENATION, 2}, "1.14.0"},
               {{BuiltinOperator_CONCATENATION, 3}, "2.3.0"},
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
index 3ab58977fed736..a01fdb7565647d 100644
--- a/tensorflow/opensource_only.files
+++ b/tensorflow/opensource_only.files
@@ -117,7 +117,6 @@ third_party/clang_toolchain/download_clang.bzl
 third_party/codegen.BUILD
 third_party/common.bzl
 third_party/compute_library/BUILD
-third_party/compute_library/LICENSE
 third_party/coremltools.BUILD
 third_party/cub.BUILD
 third_party/curl.BUILD
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 7ed75141bb5ffe..4dfdf0da6faccb 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2489,7 +2489,6 @@ py_library(
         "//tensorflow/python/ops/linalg/sparse",
         "//tensorflow/python/ops/ragged",
         "//tensorflow/python/ops/structured",
-        "//tensorflow/python/training/experimental:loss_scaling_gradient_tape",
         "//tensorflow/python/util",
     ],
 )
@@ -3006,9 +3005,9 @@ cuda_py_test(
     size = "medium",
     srcs = ["ops/nn_fused_batchnorm_deterministic_test.py"],
     python_version = "PY3",
+    shard_count = 4,
     tags = [
         "no_rocm",
-        "notsan",  # TODO(b/206640843): flaky in tsan.
     ],
     deps = [
         ":client_testlib",
@@ -4499,11 +4498,6 @@ alias(
     actual = "//tensorflow/python/training/experimental:mixed_precision_global_state",
 )
 
-alias(
-    name = "loss_scaling_gradient_tape",
-    actual = "//tensorflow/python/training/experimental:loss_scaling_gradient_tape",
-)
-
 alias(
     name = "tf_optimizer",
     actual = "//tensorflow/python/grappler:tf_optimizer",
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index e5dee13e8909c8..17e7a8614eb11b 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -29,7 +29,7 @@
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2021, 11, 29)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2021, 12, 6)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None
 
diff --git a/tensorflow/python/compiler/tensorrt/trt_convert_test.py b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
index 8592ce1c73017b..4e358bbd173a5e 100644
--- a/tensorflow/python/compiler/tensorrt/trt_convert_test.py
+++ b/tensorflow/python/compiler/tensorrt/trt_convert_test.py
@@ -27,7 +27,6 @@
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.compiler.tensorrt import trt_convert
-from tensorflow.python.compiler.tensorrt import utils as trt_utils
 from tensorflow.python.eager import def_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -39,7 +38,7 @@
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_resource_variable_ops
-from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import builder
@@ -481,7 +480,7 @@ def _InputFn():
 
   @test_util.run_v2_only
   def testTrtGraphConverter_ShapeOp_Int32InputOutput_v2(self):
-    """Testing ShapeOp and int32 values as engine input and outpu."""
+    """Testing ShapeOp and int32 values as engine input and output."""
 
     class ShapeOpModel(tracking.AutoTrackable):
 
@@ -497,8 +496,7 @@ def run(self, x):
         # Add an OP that is not supported by TF-TRT. This allows TF-TRT to build
         # two engines. The first engine produces an int32 output and the second
         # engines has an int32 input and an int32 output.
-        q = nn_ops.data_format_vec_permute(
-            q_shape, src_format="NHWC", dst_format="NCHW")
+        q = math_ops.cumsum(q_shape)
         q = q * 2
         return array_ops.identity(q, name="output")
 
@@ -880,14 +878,6 @@ def testTrtGraphConverter_StaticOp(self):
 
   @test_util.run_v2_only
   def testTrtGraphConverter_AllowEngineNativeSegmentExecution(self):
-
-    # This test will not work anymore with TRT >= 8. TensorRT does not
-    # preallocate anymore the max_workspace_size_bytes, but rather allocates as
-    # it needs up to this value.
-    # TODO: update the unittest to make this TRTEngine creation fail with TRT8.
-    if trt_utils.is_linked_tensorrt_version_greater_equal(8, 0, 0):
-      return
-
     np_input1, np_input2 = self._RandomInput([4, 1, 1])
 
     # Create a model and save it.
@@ -899,12 +889,13 @@ def testTrtGraphConverter_AllowEngineNativeSegmentExecution(self):
     def _InputFn():
       yield np_input1, np_input2
 
-    # Run TRT conversion and request an unreasonably large workspace.
+    # Run TRT conversion
     converter = self._CreateConverterV2(
-        input_saved_model_dir, max_workspace_size_bytes=10 << 40)
+        input_saved_model_dir, max_workspace_size_bytes=1 << 20)
     converter.convert()
 
     os.environ["TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION"] = "False"
+    os.environ["TF_TRT_ABORT_CUDA_ENGINE_BUILD"] = "True"
     with self.assertRaisesRegex(
         errors.AbortedError,
         r"User disallowed engine native segment execution"):
@@ -913,6 +904,7 @@ def _InputFn():
       finally:
         # Always reset the environment variable.
         os.environ["TF_TRT_ALLOW_ENGINE_NATIVE_SEGMENT_EXECUTION"] = "True"
+        os.environ["TF_TRT_ABORT_CUDA_ENGINE_BUILD"] = "False"
 
     converter.build(input_fn=_InputFn)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/service/BUILD b/tensorflow/python/data/experimental/kernel_tests/service/BUILD
index 3677b5e0725802..614c0f1f7e061e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/service/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/service/BUILD
@@ -80,7 +80,6 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
-        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_array_ops",
@@ -152,6 +151,27 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "metadata_test",
+    size = "medium",
+    srcs = ["metadata_test.py"],
+    shard_count = 2,
+    deps = [
+        ":test_base",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python/compat",
+        "//tensorflow/python/data/experimental/ops:data_service_ops",
+        "//tensorflow/python/data/experimental/ops:distribute",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:def_function",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 tf_py_test(
     name = "multi_device_test",
     size = "small",
diff --git a/tensorflow/python/data/experimental/kernel_tests/service/data_service_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/service/data_service_ops_test.py
index ac0121340e5ccc..866b8b4a9532bf 100644
--- a/tensorflow/python/data/experimental/kernel_tests/service/data_service_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/service/data_service_ops_test.py
@@ -14,12 +14,10 @@
 # ==============================================================================
 """Tests for tf.data service ops."""
 import time
-from unittest import mock
 
 from absl.testing import parameterized
 
 from tensorflow.core.protobuf import service_config_pb2
-from tensorflow.python.compat import compat
 from tensorflow.python.data.experimental.kernel_tests.service import test_base as data_service_test_base
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import data_service_ops
@@ -38,12 +36,10 @@
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_spec
-from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import tensor_array_ops
@@ -80,82 +76,6 @@ def testDistributeCompression(self, compression):
         num_elements, cluster, compression=compression)
     self.assertDatasetProduces(ds, list(range(num_elements)))
 
-  @combinations.generate(
-      combinations.times(test_base.default_test_combinations(),
-                         combinations.combine(compression=[None, "AUTO"])))
-  def testFromDatasetIdOmitsCompression(self, compression):
-    cluster = data_service_test_base.TestCluster(
-        num_workers=1, data_transfer_protocol="grpc")
-    dataset = dataset_ops.Dataset.from_tensor_slices(
-        list("abcdefghijklmnopqrstuvwxyz"))
-    def to_upper(x):
-      return script_ops.numpy_function(
-          func=lambda x: x.decode("utf-8").upper(), inp=[x], Tout=dtypes.string)
-    dataset = dataset.map(to_upper, num_parallel_calls=dataset_ops.AUTOTUNE)
-    with mock.patch.object(compat, "forward_compatible", return_value=True):
-      dataset_id = data_service_ops.register_dataset(
-          cluster.dispatcher.target, dataset=dataset, compression=compression)
-      dataset = data_service_ops.from_dataset_id(
-          processing_mode=ShardingPolicy.OFF,
-          service=cluster.dispatcher.target,
-          dataset_id=dataset_id,
-          element_spec=dataset.element_spec)
-      self.assertDatasetProduces(dataset, list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
-
-  # Eager-only as querying `element_spec` is only supported in the eager mode.
-  @combinations.generate(
-      combinations.times(test_base.eager_only_combinations(),
-                         combinations.combine(compression=[None, "AUTO"])))
-  def testFromDatasetIdOmitsElementSpecAndCompression(self, compression):
-    cluster = data_service_test_base.TestCluster(
-        num_workers=1, data_transfer_protocol="grpc")
-    dataset = dataset_ops.Dataset.from_tensor_slices(
-        list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
-    with mock.patch.object(compat, "forward_compatible", return_value=True):
-      dataset_id = data_service_ops.register_dataset(
-          cluster.dispatcher.target, dataset=dataset, compression=compression)
-      dataset = data_service_ops.from_dataset_id(
-          processing_mode=ShardingPolicy.OFF,
-          service=cluster.dispatcher.target,
-          dataset_id=dataset_id)
-      self.assertDatasetProduces(dataset, list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
-
-  def _testCompressionMismatch(self, dataset):
-    cluster = data_service_test_base.TestCluster(
-        num_workers=1, data_transfer_protocol="grpc")
-    with mock.patch.object(compat, "forward_compatible", return_value=False):
-      dataset_id = data_service_ops._register_dataset(
-          cluster.dispatcher.target, dataset=dataset, compression=None)
-      # `compression` is "AUTO" by default.
-      dataset = data_service_ops._from_dataset_id(
-          processing_mode=ShardingPolicy.OFF,
-          service=cluster.dispatcher.target,
-          dataset_id=dataset_id,
-          element_spec=dataset.element_spec)
-      with self.assertRaises(errors.InvalidArgumentError):
-        self.getDatasetOutput(dataset)
-
-  @combinations.generate(
-      combinations.times(test_base.default_test_combinations()))
-  def testCompressionDtypeMismatch(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices(
-        list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
-    self._testCompressionMismatch(dataset)
-
-  @combinations.generate(
-      combinations.times(test_base.default_test_combinations()))
-  def testCompressionShapeMismatch(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([[1, 2], [3, 4]])
-    self._testCompressionMismatch(dataset)
-
-  # Only test eager mode since nested datasets are not allowed in graph mode.
-  @combinations.generate(
-      combinations.times(test_base.eager_only_combinations()))
-  def testCompressionVariantMismatch(self):
-    # Use a nested dataset as an example of a variant.
-    dataset = dataset_ops.Dataset.from_tensors(dataset_ops.Dataset.range(10))
-    self._testCompressionMismatch(dataset)
-
   @combinations.generate(test_base.default_test_combinations())
   def testDistributeInvalidCompression(self):
     cluster = data_service_test_base.TestCluster(num_workers=1)
@@ -933,61 +853,6 @@ def testVariables(self, use_resource):
     self.assertDatasetProduces(
         ds, list(range(10, 13)), requires_initialization=True)
 
-  @combinations.generate(test_base.graph_only_combinations())
-  def testElementSpecGraphMode(self):
-    cluster = data_service_test_base.TestCluster(
-        num_workers=1, work_dir=NO_WORK_DIR, fault_tolerant_mode=False)
-    num_elements = 10
-    ds = dataset_ops.Dataset.range(num_elements)
-    dataset_id = data_service_ops.register_dataset(cluster.dispatcher_address(),
-                                                   ds)
-    with self.assertRaisesRegex(
-        ValueError, "In graph mode `element_spec` must be provided manually."):
-      ds = data_service_ops.from_dataset_id("parallel_epochs",
-                                            cluster.dispatcher_address(),
-                                            dataset_id)
-
-  @combinations.generate(test_base.eager_only_combinations())
-  def testFromDatasetIdDoesntRequireElementSpec(self):
-    cluster = data_service_test_base.TestCluster(
-        num_workers=1,
-        work_dir=NO_WORK_DIR,
-        fault_tolerant_mode=False,
-        data_transfer_protocol="grpc")
-    num_elements = 10
-    ds = dataset_ops.Dataset.range(num_elements)
-
-    dataset_id = data_service_ops.register_dataset(cluster.dispatcher_address(),
-                                                   ds)
-    ds = data_service_ops.from_dataset_id("parallel_epochs",
-                                          cluster.dispatcher_address(),
-                                          dataset_id)
-    self.assertDatasetProduces(ds, list(range(num_elements)))
-
-  @combinations.generate(test_base.eager_only_combinations())
-  def testElementSpecMixedMode(self):
-    cluster = data_service_test_base.TestCluster(
-        num_workers=1, work_dir=NO_WORK_DIR, fault_tolerant_mode=False)
-    num_elements = 10
-    ds = dataset_ops.Dataset.range(num_elements)
-
-    @def_function.function
-    def get_dataset_id():
-      return data_service_ops.register_dataset(cluster.dispatcher_address(), ds)
-
-    dataset_id = get_dataset_id()
-    dataset_id_val = tensor_util.constant_value(dataset_id)
-
-    with self.assertRaisesRegex(
-        ValueError, "Failed to fetch element spec for dataset id " +
-        str(dataset_id_val) + " from tf.data service. If the "
-        "dataset was registered in graph mode or inside a "
-        "tf.function, the `element_spec` must be specified as "
-        "an argument to `from_dataset_id`."):
-      ds = data_service_ops.from_dataset_id("parallel_epochs",
-                                            cluster.dispatcher_address(),
-                                            dataset_id)
-
   @combinations.generate(test_base.default_test_combinations())
   def testNoShardingPolicy(self):
     cluster = data_service_test_base.TestCluster(num_workers=1)
@@ -996,12 +861,6 @@ def testNoShardingPolicy(self):
         dataset, cluster=cluster, processing_mode=ShardingPolicy.OFF)
     self.assertDatasetProduces(dataset, list(range(20)))
 
-  @combinations.generate(test_base.default_test_combinations())
-  def testCardinality(self):
-    cluster = data_service_test_base.TestCluster(num_workers=1)
-    dataset = self.make_distributed_range_dataset(10, cluster)
-    self.assertEqual(self.evaluate(dataset.cardinality()), dataset_ops.UNKNOWN)
-
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/service/metadata_test.py b/tensorflow/python/data/experimental/kernel_tests/service/metadata_test.py
new file mode 100644
index 00000000000000..485d67cedb5e02
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/service/metadata_test.py
@@ -0,0 +1,269 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.data service metadata."""
+
+import functools
+from unittest import mock
+
+from absl.testing import parameterized
+
+from tensorflow.python.compat import compat
+from tensorflow.python.data.experimental.kernel_tests.service import test_base as data_service_test_base
+from tensorflow.python.data.experimental.ops import data_service_ops
+from tensorflow.python.data.experimental.ops import distribute
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import combinations
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import script_ops
+from tensorflow.python.platform import test
+
+
+def _cardinality_test_combinations():
+  """Generate test combinations for data service cardinality tests.
+
+  We test only V2 combinations for the infinite and 0 cases because the `map`
+  transformation for compression makes the cardinality unknown in TF1.
+
+  Returns:
+    test combinations.
+  """
+
+  def _reduce_cases_to_combinations(result, case):
+    name, dataset_fn, sharding_policy, expected_result = case
+    return result + combinations.combine(
+        dataset_fn=combinations.NamedObject(name, dataset_fn),
+        sharding_policy=sharding_policy,
+        expected_result=expected_result)
+
+  def _cases_to_combinations(cases):
+    return functools.reduce(_reduce_cases_to_combinations, cases, [])
+
+  def _infinite_dataset_with_hint_shard():
+    return (dataset_ops.Dataset.range(10).shard(distribute.SHARD_HINT,
+                                                distribute.SHARD_HINT).repeat())
+
+  def _empty_dataset_with_hint_shard():
+    return (dataset_ops.Dataset.range(0).shard(distribute.SHARD_HINT,
+                                               distribute.SHARD_HINT))
+
+  v2_only_cases = [
+      ("NoShardingInfinite", lambda: dataset_ops.Dataset.range(10).repeat(),
+       data_service_ops.ShardingPolicy.OFF, dataset_ops.INFINITE),
+      ("DynamicShardingInfinite", lambda: dataset_ops.Dataset.range(5).repeat(),
+       data_service_ops.ShardingPolicy.DYNAMIC, dataset_ops.INFINITE),
+      ("DataShardingInfinite", lambda: dataset_ops.Dataset.range(10).repeat(),
+       data_service_ops.ShardingPolicy.DATA, dataset_ops.INFINITE),
+      ("NoShardingZero", lambda: dataset_ops.Dataset.range(0),
+       data_service_ops.ShardingPolicy.OFF, 0),
+      ("DynamicShardingZero", lambda: dataset_ops.Dataset.range(0),
+       data_service_ops.ShardingPolicy.DYNAMIC, 0),
+      ("DataShardingZero", lambda: dataset_ops.Dataset.range(0),
+       data_service_ops.ShardingPolicy.DATA, 0),
+      ("FileOrDataShardingZero", lambda: dataset_ops.Dataset.range(0),
+       data_service_ops.ShardingPolicy.FILE_OR_DATA, 0),
+      ("HintShardingZero", _empty_dataset_with_hint_shard,
+       data_service_ops.ShardingPolicy.HINT, dataset_ops.UNKNOWN),
+  ]
+  v1_and_v2_cases = [
+      ("Finite", lambda: dataset_ops.Dataset.range(10),
+       data_service_ops.ShardingPolicy.OFF, dataset_ops.UNKNOWN),
+      ("FileOrDataShardingUnknown",
+       lambda: dataset_ops.Dataset.range(10).repeat(),
+       data_service_ops.ShardingPolicy.FILE_OR_DATA, dataset_ops.UNKNOWN),
+      ("HintShardingUnknown", _infinite_dataset_with_hint_shard,
+       data_service_ops.ShardingPolicy.HINT, dataset_ops.UNKNOWN),
+  ]
+
+  v2_only_combinations = combinations.times(
+      combinations.combine(tf_api_version=2, mode=["eager", "graph"]),
+      _cases_to_combinations(v2_only_cases))
+  v1_and_v2_combinations = combinations.times(
+      combinations.combine(tf_api_version=[1, 2], mode=["eager", "graph"]),
+      _cases_to_combinations(v1_and_v2_cases))
+  return v2_only_combinations + v1_and_v2_combinations
+
+
+class DataServiceMetadataTest(data_service_test_base.TestBase,
+                              parameterized.TestCase):
+  """Tests propagating data service metadata through tf.data service."""
+
+  @combinations.generate(_cardinality_test_combinations())
+  def testCardinality(self, dataset_fn, sharding_policy, expected_result):
+    cluster = data_service_test_base.TestCluster(num_workers=2)
+    dataset = dataset_fn()
+    dataset = self.make_distributed_dataset(
+        dataset, cluster=cluster, processing_mode=sharding_policy)
+    self.assertEqual(self.evaluate(dataset.cardinality()), expected_result)
+
+  @combinations.generate(_cardinality_test_combinations())
+  def testFromDatasetIdCardinality(self, dataset_fn, sharding_policy,
+                                   expected_result):
+    cluster = data_service_test_base.TestCluster(num_workers=2)
+    dataset = dataset_fn()
+    dataset_id = data_service_ops.register_dataset(
+        cluster.dispatcher.target, dataset=dataset)
+    dataset = data_service_ops.from_dataset_id(
+        processing_mode=sharding_policy,
+        service=cluster.dispatcher.target,
+        dataset_id=dataset_id,
+        element_spec=dataset.element_spec)
+    self.assertEqual(self.evaluate(dataset.cardinality()), expected_result)
+
+  @combinations.generate(test_base.eager_only_combinations())
+  def testFromDatasetIdDoesntRequireElementSpec(self):
+    cluster = data_service_test_base.TestCluster(
+        num_workers=1,
+        work_dir=data_service_test_base.NO_WORK_DIR,
+        fault_tolerant_mode=False,
+        data_transfer_protocol="grpc")
+    num_elements = 10
+    dataset = dataset_ops.Dataset.range(num_elements)
+
+    dataset_id = data_service_ops.register_dataset(cluster.dispatcher_address(),
+                                                   dataset)
+    dataset = data_service_ops.from_dataset_id(
+        processing_mode=data_service_ops.ShardingPolicy.OFF,
+        service=cluster.dispatcher_address(),
+        dataset_id=dataset_id)
+    self.assertDatasetProduces(dataset, list(range(num_elements)))
+
+  @combinations.generate(test_base.graph_only_combinations())
+  def testElementSpecGraphMode(self):
+    cluster = data_service_test_base.TestCluster(
+        num_workers=1,
+        work_dir=data_service_test_base.NO_WORK_DIR,
+        fault_tolerant_mode=False)
+    num_elements = 10
+    dataset = dataset_ops.Dataset.range(num_elements)
+    dataset_id = data_service_ops.register_dataset(cluster.dispatcher_address(),
+                                                   dataset)
+    with self.assertRaisesRegex(
+        ValueError, "In graph mode `element_spec` must be provided manually."):
+      _ = data_service_ops.from_dataset_id(
+          processing_mode=data_service_ops.ShardingPolicy.OFF,
+          service=cluster.dispatcher_address(),
+          dataset_id=dataset_id)
+
+  @combinations.generate(test_base.eager_only_combinations())
+  def testElementSpecMixedMode(self):
+    cluster = data_service_test_base.TestCluster(
+        num_workers=1,
+        work_dir=data_service_test_base.NO_WORK_DIR,
+        fault_tolerant_mode=False)
+    num_elements = 10
+    dataset = dataset_ops.Dataset.range(num_elements)
+
+    @def_function.function
+    def get_dataset_id():
+      return data_service_ops.register_dataset(cluster.dispatcher_address(),
+                                               dataset)
+
+    dataset_id = get_dataset_id()
+    dataset_id_val = tensor_util.constant_value(dataset_id)
+
+    with self.assertRaisesRegex(
+        ValueError,
+        f"Failed to fetch element spec for dataset id {dataset_id_val} from "
+        "tf.data service. If the dataset was registered in graph mode or "
+        "inside a tf.function, the `element_spec` must be specified as an "
+        "argument to `from_dataset_id`."):
+      dataset = data_service_ops.from_dataset_id(
+          processing_mode=data_service_ops.ShardingPolicy.OFF,
+          service=cluster.dispatcher_address(),
+          dataset_id=dataset_id)
+
+  @combinations.generate(
+      combinations.times(test_base.default_test_combinations(),
+                         combinations.combine(compression=[None, "AUTO"])))
+  def testFromDatasetIdOmitsCompression(self, compression):
+    cluster = data_service_test_base.TestCluster(
+        num_workers=1, data_transfer_protocol="grpc")
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        list("abcdefghijklmnopqrstuvwxyz"))
+    def to_upper(x):
+      return script_ops.numpy_function(
+          func=lambda x: x.decode("utf-8").upper(), inp=[x], Tout=dtypes.string)
+    dataset = dataset.map(to_upper, num_parallel_calls=dataset_ops.AUTOTUNE)
+    with mock.patch.object(compat, "forward_compatible", return_value=True):
+      dataset_id = data_service_ops.register_dataset(
+          cluster.dispatcher.target, dataset=dataset, compression=compression)
+      dataset = data_service_ops.from_dataset_id(
+          processing_mode=data_service_ops.ShardingPolicy.OFF,
+          service=cluster.dispatcher.target,
+          dataset_id=dataset_id,
+          element_spec=dataset.element_spec)
+      self.assertDatasetProduces(dataset, list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
+
+  # Eager-only as querying `element_spec` is only supported in the eager mode.
+  @combinations.generate(
+      combinations.times(test_base.eager_only_combinations(),
+                         combinations.combine(compression=[None, "AUTO"])))
+  def testFromDatasetIdOmitsElementSpecAndCompression(self, compression):
+    cluster = data_service_test_base.TestCluster(
+        num_workers=1, data_transfer_protocol="grpc")
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
+    with mock.patch.object(compat, "forward_compatible", return_value=True):
+      dataset_id = data_service_ops.register_dataset(
+          cluster.dispatcher.target, dataset=dataset, compression=compression)
+      dataset = data_service_ops.from_dataset_id(
+          processing_mode=data_service_ops.ShardingPolicy.OFF,
+          service=cluster.dispatcher.target,
+          dataset_id=dataset_id)
+      self.assertDatasetProduces(dataset, list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
+
+  def _testCompressionMismatch(self, dataset):
+    cluster = data_service_test_base.TestCluster(
+        num_workers=1, data_transfer_protocol="grpc")
+    with mock.patch.object(compat, "forward_compatible", return_value=False):
+      dataset_id = data_service_ops._register_dataset(
+          cluster.dispatcher.target, dataset=dataset, compression=None)
+      # `compression` is "AUTO" by default.
+      dataset = data_service_ops._from_dataset_id(
+          processing_mode=data_service_ops.ShardingPolicy.OFF,
+          service=cluster.dispatcher.target,
+          dataset_id=dataset_id,
+          element_spec=dataset.element_spec)
+      with self.assertRaises(errors.InvalidArgumentError):
+        self.getDatasetOutput(dataset)
+
+  @combinations.generate(
+      combinations.times(test_base.default_test_combinations()))
+  def testCompressionDtypeMismatch(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"))
+    self._testCompressionMismatch(dataset)
+
+  @combinations.generate(
+      combinations.times(test_base.default_test_combinations()))
+  def testCompressionShapeMismatch(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices([[1, 2], [3, 4]])
+    self._testCompressionMismatch(dataset)
+
+  # Only test eager mode since nested datasets are not allowed in graph mode.
+  @combinations.generate(
+      combinations.times(test_base.eager_only_combinations()))
+  def testCompressionVariantMismatch(self):
+    # Use a nested dataset as an example of a variant.
+    dataset = dataset_ops.Dataset.from_tensors(dataset_ops.Dataset.range(10))
+    self._testCompressionMismatch(dataset)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/ops/data_service_ops.py b/tensorflow/python/data/experimental/ops/data_service_ops.py
index 7276eaa143d3cb..7c849035a8c768 100644
--- a/tensorflow/python/data/experimental/ops/data_service_ops.py
+++ b/tensorflow/python/data/experimental/ops/data_service_ops.py
@@ -281,6 +281,8 @@ def __init__(self,
     if compat.forward_compatible(2021, 12, 10):
       self._element_spec = element_spec
     else:
+      # If we compress, the data service side dataset will produce scalar
+      # variants.
       self._element_spec = (
           tensor_spec.TensorSpec(shape=(), dtype=dtypes.variant)
           if compression == COMPRESSION_AUTO else element_spec)
@@ -989,9 +991,7 @@ def _legacy_get_element_spec():
     else:
       element_spec = _legacy_get_element_spec()
 
-  # If we compress, the data service side dataset will produce scalar variants.
   compression = _decide_compression(compression, data_transfer_protocol)
-
   dataset = _DataServiceDataset(
       dataset_id=dataset_id,
       processing_mode=processing_mode,
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 162dc869b39291..b8e3fa6ac805c4 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -789,7 +789,7 @@ tf_py_test(
 
 tf_py_test(
     name = "repeat_test",
-    size = "small",
+    size = "medium",
     srcs = ["repeat_test.py"],
     deps = [
         ":checkpoint_test_base",
diff --git a/tensorflow/python/data/kernel_tests/dataset_spec_test.py b/tensorflow/python/data/kernel_tests/dataset_spec_test.py
index b5ca8d27d0ec1f..b0141217d97653 100644
--- a/tensorflow/python/data/kernel_tests/dataset_spec_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_spec_test.py
@@ -51,6 +51,27 @@ def testDatasetSpecInnerSpec(self):
     ds_spec = dataset_ops.DatasetSpec(inner_spec)
     self.assertEqual(ds_spec.element_spec, inner_spec)
 
+  @combinations.generate(test_base.default_test_combinations())
+  def testDatasetSpecTraceType(self):
+    trace_type_1 = dataset_ops.DatasetSpec(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.int32),
+        [5]).__tf_tracing_type__(None)
+    trace_type_2 = dataset_ops.DatasetSpec(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.int32),
+        [5]).__tf_tracing_type__(None)
+
+    self.assertEqual(trace_type_1, trace_type_2)
+    self.assertEqual(hash(trace_type_1), hash(trace_type_2))
+    self.assertTrue(trace_type_1.is_subtype_of(trace_type_2))
+    self.assertTrue(trace_type_2.is_subtype_of(trace_type_1))
+
+    trace_type_3 = dataset_ops.DatasetSpec(
+        tensor_spec.TensorSpec(shape=(), dtype=dtypes.int32),
+        [6]).__tf_tracing_type__(None)
+    self.assertNotEqual(trace_type_1, trace_type_3)
+    self.assertFalse(trace_type_1.is_subtype_of(trace_type_3))
+    self.assertFalse(trace_type_3.is_subtype_of(trace_type_1))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/iterator_test.py b/tensorflow/python/data/kernel_tests/iterator_test.py
index 983fd31b3e60d1..03851b2fe971f8 100644
--- a/tensorflow/python/data/kernel_tests/iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_test.py
@@ -49,6 +49,7 @@
 from tensorflow.python.util import compat
 
 
+@test_util.with_eager_op_as_function
 class IteratorTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @combinations.generate(test_base.graph_only_combinations())
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 6f0e0a30b7f51b..aaf7d541413c17 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -29,7 +29,6 @@
 from tensorflow.core.framework import dataset_options_pb2
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python import tf2
-from tensorflow.python.compat import compat
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import options as options_lib
 from tensorflow.python.data.ops import structured_function
@@ -67,6 +66,7 @@
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.training.tracking import base as tracking_base
 from tensorflow.python.training.tracking import tracking
+from tensorflow.python.types import trace
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import lazy_loader
 from tensorflow.python.util import nest as tf_nest
@@ -368,9 +368,7 @@ def _trace_variant_creation(self):
     output_node_name = output_node_names[0]
 
     file_path_nodes = {}
-    # TODO(b/188455028): Remove this check when
-    # `CapturableResource._map_resources` take an argument to provide the
-    # re-mapped asset tensor object instead of the original eager one.
+    # When building a tf.function, track files as `saved_model.Asset`s.
     if ops.get_default_graph().building_function:
       asset_tracker = self._maybe_track_assets(graph_def)
       for key in asset_tracker:
@@ -643,6 +641,27 @@ def _flat_structure(self):
         "output_types": self._flat_types,
     }
 
+  @property
+  def _common_args(self):
+    """Helper for generating arguments that are common across most dataset ops.
+
+    Most dataset op constructors expect `output_shapes` and `output_types`
+    arguments that represent the flattened structure of an element, as well as a
+    `metadata` argument for additional metadata such as user-defined dataset
+    name. This helper function generates common attributes as a keyword argument
+    dictionary, allowing `Dataset._variant_tensor` implementations to pass
+    `**self._common_args` to the op constructor.
+
+    Returns:
+      A dictionary of keyword arguments that can be passed to a dataset op
+      constructor.
+    """
+    return {
+        "metadata": self._metadata.SerializeToString(),
+        "output_shapes": self._flat_shapes,
+        "output_types": self._flat_types,
+    }
+
   @property
   def _type_spec(self):
     return DatasetSpec(self.element_spec)
@@ -2474,9 +2493,6 @@ def reduce(self, initial_state, reduce_func, name=None):
     metadata = dataset_metadata_pb2.Metadata()
     if name:
       metadata.name = _validate_and_encode(name)
-    kwargs = {}
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = metadata.SerializeToString()
     return structure.from_compatible_tensor_list(
         state_structure,
         gen_dataset_ops.reduce_dataset(
@@ -2486,7 +2502,7 @@ def reduce(self, initial_state, reduce_func, name=None):
             f=reduce_func,
             output_shapes=structure.get_flat_tensor_shapes(state_structure),
             output_types=structure.get_flat_tensor_types(state_structure),
-            **kwargs))
+            metadata=metadata.SerializeToString()))
 
   def get_single_element(self, name=None):
     """Returns the single element of the `dataset`.
@@ -2608,13 +2624,12 @@ def preprocessing_fn(raw_feature):
     metadata = dataset_metadata_pb2.Metadata()
     if name:
       metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = metadata.SerializeToString()
     return structure.from_compatible_tensor_list(
         self.element_spec,
-        gen_dataset_ops.dataset_to_single_element(self._variant_tensor,
-                                                  **kwargs))  # pylint: disable=protected-access
+        gen_dataset_ops.dataset_to_single_element(
+            self._variant_tensor,
+            metadata=metadata.SerializeToString(),
+            **self._flat_structure))  # pylint: disable=protected-access
 
   def unbatch(self, name=None):
     """Splits elements of a dataset into multiple elements.
@@ -4248,6 +4263,35 @@ def to_variant(dataset):
   return dataset._variant_tensor  # pylint: disable=protected-access
 
 
+# TODO(b/202447704): Merge into DatasetSpec.
+class DatasetSpecTraceType(trace.TraceType):
+  """Defines the Tracing Protocol for Dataset objects.
+
+  The default TraceType supplied by TypeSpec does not take into account
+  `element_spec` and therefore reuses concrete functions for cases where
+  the `element_spec` is different.
+  """
+
+  def __init__(self, element_spec, dataset_shape):
+    self._components = (element_spec, tuple(dataset_shape.as_list()))
+
+  def is_subtype_of(self, other):
+    return self == other
+
+  def most_specific_common_supertype(self, others):
+    return None
+
+  def __hash__(self):
+    return hash(DatasetSpecTraceType)
+
+  def __eq__(self, other):
+    if not isinstance(other, trace.TraceType):
+      return NotImplemented
+
+    return isinstance(
+        other, DatasetSpecTraceType) and self._components == other._components
+
+
 @tf_export(
     "data.DatasetSpec",
     v1=["data.DatasetSpec", "data.experimental.DatasetStructure"])
@@ -4328,6 +4372,9 @@ def _to_legacy_output_shapes(self):
   def _to_legacy_output_classes(self):
     return self
 
+  def __tf_tracing_type__(self, _):
+    return DatasetSpecTraceType(self._element_spec, self._dataset_shape)
+
 
 class _NumpyIterator(object):
   """Iterator over a dataset with elements converted to numpy."""
@@ -4392,13 +4439,10 @@ def __init__(self, element, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = {}
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.tensor_dataset(
         self._tensors,
         output_shapes=structure.get_flat_tensor_shapes(self._structure),
-        **kwargs)
+        metadata=self._metadata.SerializeToString())
     super(TensorDataset, self).__init__(variant_tensor)
 
   @property
@@ -4429,15 +4473,11 @@ def __init__(self, element, is_files=False, name=None):
           tensor_shape.Dimension(
               tensor_shape.dimension_value(t.get_shape()[0])))
 
-    kwargs = {
-        "output_shapes": structure.get_flat_tensor_shapes(self._structure)
-    }
-    if compat.forward_compatible(2021, 9, 20):
-      kwargs["is_files"] = is_files
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.tensor_slice_dataset(
-        self._tensors, **kwargs)
+        self._tensors,
+        output_shapes=structure.get_flat_tensor_shapes(self._structure),
+        is_files=is_files,
+        metadata=self._metadata.SerializeToString())
     super(TensorSliceDataset, self).__init__(variant_tensor)
 
   @property
@@ -4525,9 +4565,6 @@ def __init__(self,
     if name:
       self._metadata.name = _validate_and_encode(name)
 
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.generator_dataset(
         structure.to_tensor_list(self._init_structure, self._init_args) +
         self._init_func.function.captured_inputs,
@@ -4536,7 +4573,7 @@ def __init__(self,
         init_func=self._init_func.function,
         next_func=self._next_func.function,
         finalize_func=self._finalize_func.function,
-        **kwargs)
+        **self._common_args)
     super(_GeneratorDataset, self).__init__(variant_tensor)
 
   @property
@@ -4570,11 +4607,9 @@ def __init__(self, datasets, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.zip_dataset(
-        [ds._variant_tensor for ds in nest.flatten(self._datasets)], **kwargs)
+        [ds._variant_tensor for ds in nest.flatten(self._datasets)],
+        **self._common_args)
     super(ZipDataset, self).__init__(variant_tensor)
 
   def _inputs(self):
@@ -4607,13 +4642,10 @@ def __init__(self, input_dataset, dataset_to_concatenate, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     # pylint: disable=protected-access
     variant_tensor = gen_dataset_ops.concatenate_dataset(
         input_dataset._variant_tensor, dataset_to_concatenate._variant_tensor,
-        **kwargs)
+        **self._common_args)
     # pylint: enable=protected-access
     super(ConcatenateDataset, self).__init__(variant_tensor)
 
@@ -4639,13 +4671,10 @@ def __init__(self, input_dataset, count, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.repeat_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         count=self._count,
-        **kwargs)
+        **self._common_args)
     super(RepeatDataset, self).__init__(input_dataset, variant_tensor)
 
 
@@ -4656,11 +4685,11 @@ def __init__(self, *args, **kwargs):
     """See `Dataset.range()` for details."""
     self._parse_args(*args, **kwargs)
     self._structure = tensor_spec.TensorSpec([], self._output_type)
-    kwargs = self._flat_structure
-    if self._metadata.name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.range_dataset(
-        start=self._start, stop=self._stop, step=self._step, **kwargs)
+        start=self._start,
+        stop=self._stop,
+        step=self._step,
+        **self._common_args)
     super(RangeDataset, self).__init__(variant_tensor)
 
   def _parse_args(self, *args, **kwargs):
@@ -4707,20 +4736,17 @@ def __init__(self, input_dataset, filename, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     if tf2.enabled() and (context.executing_eagerly() or ops.inside_function()):
       variant_tensor = gen_dataset_ops.cache_dataset_v2(
           input_dataset._variant_tensor,  # pylint: disable=protected-access
           filename=self._filename,
           cache=gen_dataset_ops.dummy_memory_cache(),
-          **kwargs)
+          **self._common_args)
     else:
       variant_tensor = gen_dataset_ops.cache_dataset(
           input_dataset._variant_tensor,  # pylint: disable=protected-access
           filename=self._filename,
-          **kwargs)
+          **self._common_args)
     super(CacheDataset, self).__init__(input_dataset, variant_tensor)
 
 
@@ -4744,9 +4770,6 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
 
     if (tf2.enabled() and
         (context.executing_eagerly() or ops.inside_function())):
@@ -4757,7 +4780,7 @@ def __init__(self,
           seed2=self._seed2,
           seed_generator=gen_dataset_ops.dummy_seed_generator(),
           reshuffle_each_iteration=self._reshuffle_each_iteration,
-          **kwargs)
+          **self._common_args)
     else:
       variant_tensor = gen_dataset_ops.shuffle_dataset(
           input_dataset._variant_tensor,  # pylint: disable=protected-access
@@ -4765,7 +4788,7 @@ def __init__(self,
           seed=self._seed,
           seed2=self._seed2,
           reshuffle_each_iteration=self._reshuffle_each_iteration,
-          **kwargs)
+          **self._common_args)
     super(ShuffleDataset, self).__init__(input_dataset, variant_tensor)
 
 
@@ -4779,13 +4802,10 @@ def __init__(self, input_dataset, count, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.take_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         count=self._count,
-        **kwargs)
+        **self._common_args)
     super(TakeDataset, self).__init__(input_dataset, variant_tensor)
 
 
@@ -4799,13 +4819,10 @@ def __init__(self, input_dataset, count, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.skip_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         count=self._count,
-        **kwargs)
+        **self._common_args)
     super(SkipDataset, self).__init__(input_dataset, variant_tensor)
 
 
@@ -4821,14 +4838,11 @@ def __init__(self, input_dataset, num_shards, index, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.shard_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         num_shards=self._num_shards,
         index=self._index,
-        **kwargs)
+        **self._common_args)
     super(ShardDataset, self).__init__(input_dataset, variant_tensor)
 
 
@@ -4861,14 +4875,11 @@ def __init__(self, input_dataset, batch_size, drop_remainder, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.batch_dataset_v2(
         input_dataset._variant_tensor,
         batch_size=self._batch_size,
         drop_remainder=self._drop_remainder,
-        **kwargs)
+        **self._common_args)
     super(BatchDataset, self).__init__(input_dataset, variant_tensor)
 
   @property
@@ -4919,16 +4930,13 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.parallel_batch_dataset(
         input_dataset._variant_tensor,
         batch_size=self._batch_size,
         num_parallel_calls=self._num_parallel_calls,
         drop_remainder=self._drop_remainder,
         deterministic=self._deterministic,
-        **kwargs)
+        **self._common_args)
 
     super(ParallelBatchDataset, self).__init__(input_dataset, variant_tensor)
 
@@ -5129,9 +5137,6 @@ def _padded_shape_to_batch_shape(s):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = {}
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.padded_batch_dataset_v2(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         batch_size=self._batch_size,
@@ -5142,7 +5147,7 @@ def _padded_shape_to_batch_shape(s):
         padding_values=nest.flatten(self._padding_values),
         drop_remainder=self._drop_remainder,
         output_shapes=structure.get_flat_tensor_shapes(self._structure),
-        **kwargs)
+        metadata=self._metadata.SerializeToString())
     super(PaddedBatchDataset, self).__init__(input_dataset, variant_tensor)
 
   @property
@@ -5172,16 +5177,13 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.map_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,
         f=self._map_func.function,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
         preserve_cardinality=self._preserve_cardinality,
-        **kwargs)
+        **self._common_args)
     super(MapDataset, self).__init__(input_dataset, variant_tensor)
 
   def _functions(self):
@@ -5227,9 +5229,6 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.parallel_map_dataset_v2(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,
@@ -5238,7 +5237,7 @@ def __init__(self,
         deterministic=self._deterministic,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
         preserve_cardinality=self._preserve_cardinality,
-        **kwargs)
+        **self._common_args)
     super(ParallelMapDataset, self).__init__(input_dataset, variant_tensor)
 
   def _functions(self):
@@ -5268,14 +5267,11 @@ def __init__(self, input_dataset, map_func, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.flat_map_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,
         f=self._map_func.function,
-        **kwargs)
+        **self._common_args)
     super(FlatMapDataset, self).__init__(input_dataset, variant_tensor)
 
   def _functions(self):
@@ -5315,16 +5311,13 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.interleave_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,  # pylint: disable=protected-access
         self._cycle_length,
         self._block_length,
         f=self._map_func.function,
-        **kwargs)
+        **self._common_args)
     super(InterleaveDataset, self).__init__(input_dataset, variant_tensor)
 
   def _functions(self):
@@ -5385,9 +5378,6 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.parallel_interleave_dataset_v4(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._map_func.function.captured_inputs,  # pylint: disable=protected-access
@@ -5398,7 +5388,7 @@ def __init__(self,
         self._num_parallel_calls,
         f=self._map_func.function,
         deterministic=deterministic_string,
-        **kwargs)
+        **self._common_args)
     super(ParallelInterleaveDataset, self).__init__(input_dataset,
                                                     variant_tensor)
 
@@ -5437,14 +5427,11 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.filter_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         other_arguments=self._predicate.function.captured_inputs,
         predicate=self._predicate.function,
-        **kwargs)
+        **self._common_args)
     super(FilterDataset, self).__init__(input_dataset, variant_tensor)
 
   def _functions(self):
@@ -5467,9 +5454,6 @@ def __init__(self, input_dataset, buffer_size, slack_period=None, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     # pylint: disable=protected-access
     # We colocate the prefetch dataset with its input as this collocation only
     # happens automatically in graph mode.
@@ -5478,7 +5462,7 @@ def __init__(self, input_dataset, buffer_size, slack_period=None, name=None):
           input_dataset._variant_tensor,
           buffer_size=self._buffer_size,
           slack_period=slack_period,
-          **kwargs)
+          **self._common_args)
     super(PrefetchDataset, self).__init__(input_dataset, variant_tensor)
 
 
@@ -5513,16 +5497,13 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = gen_dataset_ops.window_dataset(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         size=self._size,
         shift=self._shift,
         stride=self._stride,
         drop_remainder=self._drop_remainder,
-        **kwargs)
+        **self._common_args)
     super(WindowDataset, self).__init__(input_dataset, variant_tensor)
 
   @property
@@ -5541,13 +5522,10 @@ def __init__(self, input_dataset, options, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     with ops.colocate_with(input_dataset._variant_tensor):
       variant_tensor = gen_dataset_ops.options_dataset(
           input_dataset._variant_tensor, options_pb.SerializeToString(),
-          **kwargs)
+          **self._common_args)
     super(_OptionsDataset, self).__init__(input_dataset, variant_tensor)
 
     if self._options_attr:
@@ -5630,12 +5608,9 @@ def __init__(self, input_dataset, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = ged_ops.unbatch_dataset(
         self._input_dataset._variant_tensor,  # pylint: disable=protected-access
-        **kwargs)
+        **self._common_args)
     super(_UnbatchDataset, self).__init__(input_dataset, variant_tensor)
 
   @property
@@ -5660,9 +5635,6 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = ged_ops.group_by_window_dataset(
         self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         self._key_func.function.captured_inputs,
@@ -5671,7 +5643,7 @@ def __init__(self,
         key_func=self._key_func.function,
         reduce_func=self._reduce_func.function,
         window_size_func=self._window_size_func.function,
-        **kwargs)
+        **self._common_args)
     super(_GroupByWindowDataset, self).__init__(input_dataset, variant_tensor)
 
   def _make_window_size_func(self, window_size_func):
@@ -5740,11 +5712,8 @@ def __init__(self, seed=None, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = ged_ops.random_dataset(
-        seed=self._seed, seed2=self._seed2, **kwargs)
+        seed=self._seed, seed2=self._seed2, **self._common_args)
     super(RandomDataset, self).__init__(variant_tensor)
 
   @property
@@ -5986,14 +5955,11 @@ def __init__(self, input_dataset, predicate, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = ged_ops.take_while_dataset(
         self._input_dataset._variant_tensor,  # pylint: disable=protected-access
         other_arguments=self._predicate.function.captured_inputs,
         predicate=self._predicate.function,
-        **kwargs)
+        **self._common_args)
     super(_TakeWhileDataset, self).__init__(input_dataset, variant_tensor)
 
   def _functions(self):
@@ -6017,12 +5983,9 @@ def __init__(self, input_dataset, name=None):
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = ged_ops.unique_dataset(
         self._input_dataset._variant_tensor,  # pylint: disable=protected-access
-        **kwargs)
+        **self._common_args)
     super(_UniqueDataset, self).__init__(input_dataset, variant_tensor)
 
 
@@ -6072,9 +6035,6 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     variant_tensor = ged_ops.snapshot_dataset_v2(
         input_dataset._variant_tensor,  # pylint: disable=protected-access
         path,
@@ -6083,7 +6043,7 @@ def __init__(self,
         compression=compression,
         reader_func=self._reader_func.function,
         shard_func=self._shard_func.function,
-        **kwargs)
+        **self._common_args)
     super(_SnapshotDataset, self).__init__(input_dataset, variant_tensor)
 
   def _functions(self):
@@ -6193,9 +6153,6 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = _validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
     # pylint: disable=protected-access
     if use_default_device is not None:
       variant_tensor = ged_ops.scan_dataset(
@@ -6205,7 +6162,7 @@ def __init__(self,
           f=self._scan_func.function,
           preserve_cardinality=True,
           use_default_device=use_default_device,
-          **kwargs)
+          **self._common_args)
     else:
       variant_tensor = ged_ops.scan_dataset(
           self._input_dataset._variant_tensor,
@@ -6213,7 +6170,7 @@ def __init__(self,
           self._scan_func.function.captured_inputs,
           f=self._scan_func.function,
           preserve_cardinality=True,
-          **kwargs)
+          **self._common_args)
     super(_ScanDataset, self).__init__(input_dataset, variant_tensor)
 
   def _functions(self):
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index ba590f61a901d5..d335c1f6796619 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -17,7 +17,6 @@
 
 from tensorflow.core.framework import dataset_metadata_pb2
 from tensorflow.python import tf2
-from tensorflow.python.compat import compat
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import structured_function
 from tensorflow.python.data.util import convert
@@ -155,14 +154,12 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = dataset_ops._validate_and_encode(name)
-    kwargs = {}
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
-
-    variant_tensor = gen_dataset_ops.text_line_dataset(self._filenames,
-                                                       self._compression_type,
-                                                       self._buffer_size,
-                                                       **kwargs)
+
+    variant_tensor = gen_dataset_ops.text_line_dataset(
+        self._filenames,
+        self._compression_type,
+        self._buffer_size,
+        metadata=self._metadata.SerializeToString())
     super(_TextLineDataset, self).__init__(variant_tensor)
 
   @property
@@ -311,14 +308,10 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = dataset_ops._validate_and_encode(name)
-    kwargs = {}
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
-
-    variant_tensor = gen_dataset_ops.tf_record_dataset(self._filenames,
-                                                       self._compression_type,
-                                                       self._buffer_size,
-                                                       **kwargs)
+
+    variant_tensor = gen_dataset_ops.tf_record_dataset(
+        self._filenames, self._compression_type, self._buffer_size,
+        metadata=self._metadata.SerializeToString())
     super(_TFRecordDataset, self).__init__(variant_tensor)
 
   @property
@@ -368,9 +361,6 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = dataset_ops._validate_and_encode(name)
-    kwargs = self._flat_structure
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
 
     variant_tensor = ged_ops.legacy_parallel_interleave_dataset_v2(
         self._input_dataset._variant_tensor,  # pylint: disable=protected-access
@@ -381,7 +371,7 @@ def __init__(self,
         self._prefetch_input_elements,
         f=self._map_func.function,
         deterministic=self._deterministic,
-        **kwargs)
+        **self._common_args)
     super(ParallelInterleaveDataset, self).__init__(input_dataset,
                                                     variant_tensor)
 
@@ -565,13 +555,15 @@ def __init__(self,
     self._metadata = dataset_metadata_pb2.Metadata()
     if name:
       self._metadata.name = dataset_ops._validate_and_encode(name)
-    kwargs = {}
-    if name or compat.forward_compatible(2021, 9, 30):
-      kwargs["metadata"] = self._metadata.SerializeToString()
 
     variant_tensor = gen_dataset_ops.fixed_length_record_dataset_v2(
-        self._filenames, self._header_bytes, self._record_bytes,
-        self._footer_bytes, self._buffer_size, self._compression_type, **kwargs)
+        self._filenames,
+        self._header_bytes,
+        self._record_bytes,
+        self._footer_bytes,
+        self._buffer_size,
+        self._compression_type,
+        metadata=self._metadata.SerializeToString())
     super(_FixedLengthRecordDataset, self).__init__(variant_tensor)
 
   @property
diff --git a/tensorflow/python/debug/lib/BUILD b/tensorflow/python/debug/lib/BUILD
index 863e02ff9bc56b..578856efb730cd 100644
--- a/tensorflow/python/debug/lib/BUILD
+++ b/tensorflow/python/debug/lib/BUILD
@@ -290,6 +290,7 @@ cuda_py_test(
     python_version = "PY3",
     tags = [
         "no_mac",  # TODO(b/175322370): Detected Infinity or NaN in output 0 of graph op "RealDiv"
+        "no_oss_py310",  # b/209089624
         "no_windows",
     ],
     deps = [
@@ -394,6 +395,7 @@ py_test(
     python_version = "PY3",
     srcs_version = "PY3",
     tags = [
+        "no_oss_py310",  # b/209089616
         "no_windows",
     ],
     deps = [
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index be6e396fc6cef3..01e9d4df0b3745 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -1400,6 +1400,7 @@ distribute_py_test(
     name = "distributed_variable_test",
     size = "medium",
     srcs = ["distributed_variable_test.py"],
+    disable_v2 = True,  # TODO(b/209058825)
     main = "distributed_variable_test.py",
     shard_count = 3,
     tags = [
@@ -1539,6 +1540,7 @@ distribute_py_test(
     main = "distribute_utils_test.py",
     tags = [
         "multi_and_single_gpu",
+        "no_oss_py310",  # b/209158747
     ],
     deps = [
         ":combinations",
diff --git a/tensorflow/python/distribute/collective_util.py b/tensorflow/python/distribute/collective_util.py
index b2a91c05eda94b..f76bb833c0ed1a 100644
--- a/tensorflow/python/distribute/collective_util.py
+++ b/tensorflow/python/distribute/collective_util.py
@@ -92,8 +92,8 @@ def __init__(self,
     Args:
       bytes_per_pack: a non-negative integer. Breaks collective operations into
         packs of certain size. If it's zero, the value is determined
-        automatically. This only applies to all-reduce with
-        `MultiWorkerMirroredStrategy` currently.
+        automatically. This hint is respected by all multi-replica strategies
+        except `TPUStrategy`.
       timeout_seconds: a float or None, timeout in seconds. If not None, the
         collective raises `tf.errors.DeadlineExceededError` if it takes longer
         than this timeout. Zero disables timeout. This can be useful when
diff --git a/tensorflow/python/distribute/coordinator/BUILD b/tensorflow/python/distribute/coordinator/BUILD
index 9fdd25303d0d45..b677823288c901 100644
--- a/tensorflow/python/distribute/coordinator/BUILD
+++ b/tensorflow/python/distribute/coordinator/BUILD
@@ -68,6 +68,7 @@ distribute_py_test(
     tags = [
         "multi_gpu",
         "no_pip",
+        "noasan",  # TODO(b/171040359): Flaky timeout, even if maximum shards
         "notpu",
         "notsan",  # TODO(b/171040359): Flaky timeout, even if maximum shards
     ],
diff --git a/tensorflow/python/distribute/coordinator/cluster_coordinator.py b/tensorflow/python/distribute/coordinator/cluster_coordinator.py
index eb35d94cb967d4..c293a9ff05e728 100644
--- a/tensorflow/python/distribute/coordinator/cluster_coordinator.py
+++ b/tensorflow/python/distribute/coordinator/cluster_coordinator.py
@@ -329,7 +329,7 @@ def _on_watchdog_timeout(self):
   def stop(self):
     with self._queue_lock:
       self._should_process_closures = False
-      self._closures_queued_condition.notifyAll()
+      self._closures_queued_condition.notify_all()
     self._watchdog.stop()
 
   def _cancel_all_closures(self):
@@ -408,9 +408,9 @@ def mark_finished(self):
         raise AssertionError("There is no inflight closures to mark_finished.")
       self._inflight_closure_count -= 1
       if self._inflight_closure_count == 0:
-        self._no_inflight_closure_condition.notifyAll()
+        self._no_inflight_closure_condition.notify_all()
       if self._queue.empty() and self._inflight_closure_count == 0:
-        self._stop_waiting_condition.notifyAll()
+        self._stop_waiting_condition.notify_all()
       self._watchdog.report_closure_done()
 
   def put_back(self, closure):
@@ -426,7 +426,7 @@ def put_back(self, closure):
         self._closures_queued_condition.notify()
       self._inflight_closure_count -= 1
       if self._inflight_closure_count == 0:
-        self._no_inflight_closure_condition.notifyAll()
+        self._no_inflight_closure_condition.notify_all()
 
   def wait(self, timeout=None):
     """Wait for all closures to be finished before returning.
@@ -459,8 +459,8 @@ def mark_failed(self, e):
         self._error = e
       self._inflight_closure_count -= 1
       if self._inflight_closure_count == 0:
-        self._no_inflight_closure_condition.notifyAll()
-      self._stop_waiting_condition.notifyAll()
+        self._no_inflight_closure_condition.notify_all()
+      self._stop_waiting_condition.notify_all()
 
   def done(self):
     """Returns true if the queue is empty and there is no inflight closure.
diff --git a/tensorflow/python/distribute/input_lib_test.py b/tensorflow/python/distribute/input_lib_test.py
index 44b6f63be6fcce..2edb1da3d80ca7 100644
--- a/tensorflow/python/distribute/input_lib_test.py
+++ b/tensorflow/python/distribute/input_lib_test.py
@@ -45,6 +45,7 @@
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util as framework_test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -1511,6 +1512,7 @@ def dataset_fn(ctx):
         input_context=distribution.extended._make_input_context())
 
 
+@framework_test_util.with_eager_op_as_function
 class DistributedIteratorPerDeviceTest(DistributedIteratorTestBase,
                                        parameterized.TestCase):
   """Tests for PER_WORKER and PER_REPLICA's InputOptions variants."""
diff --git a/tensorflow/python/distribute/random_generator_test.py b/tensorflow/python/distribute/random_generator_test.py
index 7763f5b449a5cd..6de3ab4ae7aac6 100644
--- a/tensorflow/python/distribute/random_generator_test.py
+++ b/tensorflow/python/distribute/random_generator_test.py
@@ -22,6 +22,7 @@
 from tensorflow.python.compat import v2_compat
 from tensorflow.python.distribute import combinations as ds_combinations
 from tensorflow.python.distribute import multi_process_runner
+from tensorflow.python.distribute import sharded_variable
 from tensorflow.python.distribute import strategy_combinations
 from tensorflow.python.distribute.coordinator import cluster_coordinator as coordinator_lib
 from tensorflow.python.eager import def_function
@@ -49,18 +50,10 @@ def get_num_local_replicas(strat, values=None):
 
 
 ps_strategies = [
-    strategy_combinations.parameter_server_strategy_fn(
-        "ParameterServer3Worker2PSCPUNoShard",
-        num_workers=3, num_ps=2, variable_partitioner=None),
-    strategy_combinations.parameter_server_strategy_fn(
-        "ParameterServer1Worker2PSCPUNoShard",
-        num_workers=1, num_ps=2, variable_partitioner=None),
-    strategy_combinations.parameter_server_strategy_fn(
-        "ParameterServer3Worker2PS1GPUNoShard",
-        num_workers=3, num_ps=2, required_gpus=1, variable_partitioner=None),
-    strategy_combinations.parameter_server_strategy_fn(
-        "ParameterServer1Worker2PS1GPUNoShard",
-        num_workers=1, num_ps=2, required_gpus=1, variable_partitioner=None),
+    strategy_combinations.parameter_server_strategy_3worker_2ps_cpu,
+    strategy_combinations.parameter_server_strategy_1worker_2ps_cpu,
+    strategy_combinations.parameter_server_strategy_3worker_2ps_1gpu,
+    strategy_combinations.parameter_server_strategy_1worker_2ps_1gpu,
 ]
 all_strategies = (strategy_combinations.all_strategies +
                   strategy_combinations.multiworker_strategies +
@@ -156,7 +149,11 @@ def f():
   @ds_combinations.generate(
       combinations.combine(
           strat=[
-              strategy_combinations.parameter_server_strategy_1worker_2ps_cpu
+              strategy_combinations.parameter_server_strategy_fn(
+                  "ParameterServer1Worker2PSCPUFixedShards",
+                  num_workers=1, num_ps=2,
+                  variable_partitioner=(
+                      sharded_variable.FixedShardsPartitioner(2)))
           ],
           mode=["eager"]))
   def testShardedError(self, strat):
diff --git a/tensorflow/python/distribute/strategy_combinations.py b/tensorflow/python/distribute/strategy_combinations.py
index d2a02a16641d00..675d03e87ed269 100644
--- a/tensorflow/python/distribute/strategy_combinations.py
+++ b/tensorflow/python/distribute/strategy_combinations.py
@@ -205,9 +205,17 @@ def get_cluster_def(num_workers, num_ps):
   }
 
 
+# Due to b/195615322, FixedShardsPartitioner will wrongly partition
+# RNG state, so we use MinSizePartitioner as the default. Maximum RNG
+# state size is int64[3] which is 8 * 3 bytes, so we set
+# min_shard_bytes to 8 * 3 + 1.
+DEFAULT_PARTITIONER = sharded_variable.MinSizePartitioner(
+    min_shard_bytes=8 * 3 + 1, max_shards=2)
+
+
 def _get_ps_strategy_creator(
     num_workers, num_ps, required_gpus=0,
-    variable_partitioner=sharded_variable.FixedShardsPartitioner(2)):
+    variable_partitioner=DEFAULT_PARTITIONER):
 
   def _create_ps_strategy(resolver, variable_partitioner):
     return parameter_server_strategy_v2.ParameterServerStrategyV2(
@@ -447,7 +455,7 @@ def get_or_create():
 
 def parameter_server_strategy_fn(
     name, num_workers, num_ps, required_gpus=0,
-    variable_partitioner=sharded_variable.FixedShardsPartitioner(2)):
+    variable_partitioner=DEFAULT_PARTITIONER):
   return combinations.NamedDistribution(
       name,
       _get_ps_strategy_creator(
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 2488eebee4a2d4..92e988d45043d1 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -425,7 +425,10 @@ cuda_py_test(
     size = "small",
     srcs = ["core_test.py"],
     python_version = "PY3",
-    tags = ["notsan"],  # TODO(b/183962087)
+    tags = [
+        "no_oss_py310",  # TODO(b/209160947)
+        "notsan",  # TODO(b/183962087)
+    ],
     deps = [
         ":context",
         ":core",
@@ -1053,6 +1056,7 @@ tf_xla_py_test(
     python_version = "PY3",
     tags = [
         "no_mac",
+        "no_oss_py310",  # TODO(b/209163877)
         "no_pip",
         "no_tfrt",  # TODO(b/185944215)
         "no_windows",
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 854e95e1d39b73..32c290f0418ed7 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -3158,14 +3158,14 @@ def _define_function_with_shape_relaxation(self, args, kwargs, flat_args,
     # Build a cache key where TensorShapes include only rank information (and
     # not information about the size of each dimension).
     if not any_composite_args:
-      rank_only_cache_key = function_cache.make_cache_key(
+      rank_only_cache_key, _ = function_cache.make_cache_key(
           (args, kwargs), include_tensor_ranks_only=True)
     else:
       # For the rank-only cache key, replace any composite tensors with
       # shape-relaxed TypeSpecs.
       relaxed_args = nest.map_structure(
           _shape_relaxed_type_for_composite_tensor, (args, kwargs))
-      rank_only_cache_key = function_cache.make_cache_key(
+      rank_only_cache_key, _ = function_cache.make_cache_key(
           relaxed_args, include_tensor_ranks_only=True)
 
     arg_specs = [_type_spec_for(x) for x in flat_no_comp]
@@ -3246,9 +3246,11 @@ def _maybe_define_function(self, args, kwargs):
       flat_args, filtered_flat_args = [None], []
 
     if self.input_signature is None:
-      cache_key = function_cache.make_cache_key((args, kwargs))
+      cache_key, cache_key_deletion_observer = function_cache.make_cache_key(
+          (args, kwargs))
     else:
-      cache_key = function_cache.make_cache_key(self.flat_input_signature)
+      cache_key, cache_key_deletion_observer = function_cache.make_cache_key(
+          self.flat_input_signature)
 
     try:
       hash(cache_key)
@@ -3288,7 +3290,8 @@ def _maybe_define_function(self, args, kwargs):
 
           self._function_cache.add_call_context(cache_key.call_context)
           graph_function = self._create_graph_function(args, kwargs)
-          self._function_cache.add(cache_key, graph_function)
+          self._function_cache.add(cache_key, cache_key_deletion_observer,
+                                   graph_function)
 
           return graph_function, filtered_flat_args
 
diff --git a/tensorflow/python/eager/function_cache.py b/tensorflow/python/eager/function_cache.py
index a6dabcd364da88..276c5bb68a3e40 100644
--- a/tensorflow/python/eager/function_cache.py
+++ b/tensorflow/python/eager/function_cache.py
@@ -15,7 +15,7 @@
 """Cache to manage concrete functions and their signatures."""
 
 import collections
-from typing import Sequence, Optional
+from typing import Sequence, Optional, Tuple
 
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function_trace_type
@@ -35,6 +35,8 @@
 _ENCODE_VARIABLES_BY_RESOURCE_ID = True
 # TODO(b/201533914): Remove this flag and related args
 USE_FULL_TRACE_TYPE = True
+# TODO(b/182990542): Enable and remove flag when stable.
+DELETE_WITH_WEAKREF = False
 
 ExecutionContext = collections.namedtuple("ExecutionContext", [
     "parent_graph",
@@ -50,12 +52,14 @@ class FunctionCacheKey(trace.TraceType):
   """The unique key associated with a concrete function.
 
   Attributes:
-    arg_spec: A TraceType corresponding to the function arguments.
-    call_context: The ExecutionContext for when the arg_spec was generated.
+    function_signature: A TraceType corresponding to the function arguments.
+    call_context: The ExecutionContext for when the function_signature was
+      generated.
   """
 
-  def __init__(self, arg_spec: trace.TraceType, call_context: ExecutionContext):
-    self.arg_spec = arg_spec
+  def __init__(self, function_signature: trace.TraceType,
+               call_context: ExecutionContext):
+    self.function_signature = function_signature
     self.call_context = call_context
 
   def is_subtype_of(self, other: trace.TraceType) -> bool:
@@ -66,7 +70,7 @@ def is_subtype_of(self, other: trace.TraceType) -> bool:
       return False
 
     # Functions are contravariant.
-    return other.arg_spec.is_subtype_of(self.arg_spec)
+    return other.function_signature.is_subtype_of(self.function_signature)
 
   def most_specific_common_supertype(
       self, others: Sequence[trace.TraceType]) -> Optional["FunctionCacheKey"]:
@@ -80,8 +84,8 @@ def most_specific_common_subtype(
         self.call_context == other.call_context for other in others):
       return None
 
-    common = self.arg_spec.most_specific_common_supertype(
-        [other.arg_spec for other in others])
+    common = self.function_signature.most_specific_common_supertype(
+        [other.function_signature for other in others])
 
     if common is None:
       return None
@@ -89,7 +93,7 @@ def most_specific_common_subtype(
     return FunctionCacheKey(common, self.call_context)
 
   def __hash__(self) -> int:
-    return hash((self.call_context, self.arg_spec))
+    return hash((self.call_context, self.function_signature))
 
   def __eq__(self, other) -> bool:
     if not isinstance(other, trace.TraceType):
@@ -99,11 +103,12 @@ def __eq__(self, other) -> bool:
       return False
 
     return (self.call_context == other.call_context and
-            self.arg_spec == other.arg_spec)
+            self.function_signature == other.function_signature)
 
   def __repr__(self) -> str:
-    return (f"{type(self).__name__}(arg_spec={repr(self.arg_spec)},"
-            f" call_context={repr(self.call_context)})")
+    return (
+        f"{type(self).__name__}(function_signature={repr(self.function_signature)},"
+        f" call_context={repr(self.call_context)})")
 
 
 class FunctionCache:
@@ -140,7 +145,8 @@ def __init__(self):
         _FunctionGarbageCollector(self._primary),
         _FunctionGarbageCollector(self._dispatch_cache),
         _FunctionGarbageCollector(self.arg_relaxed),
-        _FunctionGarbageCollector(self.arg_relaxed_specs)]
+        _FunctionGarbageCollector(self.arg_relaxed_specs)
+    ]
 
   # Note: Instead of returning any viable function, we can return the most
   # specfic one by maintaining trees of traces where children are more specific
@@ -167,15 +173,30 @@ def lookup(self, key: FunctionCacheKey, use_function_subtyping: bool):
   # self._dispatch_cache.
   def delete(self, key: FunctionCacheKey):
     """Deletes a concrete function given the key it was added with."""
+    if key not in self._primary:
+      return False
+
     del self._primary[key]
 
     for dispatched_key in self._dispatch_cache:
       if self._dispatch_cache[dispatched_key] == key:
         del self._dispatch_cache[dispatched_key]
 
-  def add(self, key: FunctionCacheKey, concrete):
-    """Adds a new concrete function alongside its key."""
+    return True
+
+  def add(self, key: FunctionCacheKey,
+          deletion_observer: function_trace_type.WeakrefDeletionObserver,
+          concrete):
+    """Adds a new concrete function alongside its key.
+
+    Args:
+      key: A FunctionCacheKey object corresponding to the provided `concrete`.
+      deletion_observer: A WeakrefDeletionObserver object for the `key`.
+      concrete: The concrete function to be added to the cache.
+    """
     self._primary[key] = concrete
+    deletion_observer.add_listener(
+        lambda: self.delete(key) if DELETE_WITH_WEAKREF else None)
 
   def clear(self):
     """Removes all concrete functions from the cache."""
@@ -195,7 +216,8 @@ def values(self):
     # arguments. If and when that is implemented, this logic can be revisited.
     primary_functions = set(self._primary.values())
     return list(self._primary.values()) + [
-        v for v in self.arg_relaxed.values() if v not in primary_functions]
+        v for v in self.arg_relaxed.values() if v not in primary_functions
+    ]
 
   def has_call_context(self, call_context: ExecutionContext) -> bool:
     """Checks if an ExcutionContext was observed."""
@@ -225,13 +247,19 @@ def __del__(self):
       pass
 
 
-def make_cache_key(args,
-                   include_tensor_ranks_only: bool = False) -> FunctionCacheKey:
+def make_cache_key(
+    args,
+    include_tensor_ranks_only: bool = False
+) -> Tuple[FunctionCacheKey, function_trace_type.WeakrefDeletionObserver]:
   """Computes the cache key given the function arguments."""
-  arg_spec = function_trace_type.get_arg_spec(
-      args, include_tensor_ranks_only, _ENCODE_VARIABLES_BY_RESOURCE_ID,
+  signature_context = function_trace_type.SignatureContext(
+      include_tensor_ranks_only)
+  function_signature = function_trace_type.make_function_signature(
+      args, signature_context, _ENCODE_VARIABLES_BY_RESOURCE_ID,
       USE_FULL_TRACE_TYPE)
-  return FunctionCacheKey(arg_spec, _make_execution_context())
+  return FunctionCacheKey(
+      function_signature,
+      _make_execution_context()), signature_context.deletion_observer
 
 
 def _make_execution_context() -> ExecutionContext:
@@ -265,8 +293,7 @@ def _make_execution_context() -> ExecutionContext:
   strategy_stack = default_graph._distribution_strategy_stack
   uses_distribution_strategy = (
       strategy_stack and
-      strategy_stack[-1].strategy.extended._retrace_functions_for_each_device
-  )
+      strategy_stack[-1].strategy.extended._retrace_functions_for_each_device)
   if executing_eagerly:
     colocation_stack = ()
     if uses_distribution_strategy:
@@ -275,8 +302,8 @@ def _make_execution_context() -> ExecutionContext:
       device_functions = ()
   else:
     colocation_stack = tuple(default_graph._colocation_stack.peek_objs())
-    if (uses_distribution_strategy
-        or func_graph_module.device_stack_has_callable(
+    if (uses_distribution_strategy or
+        func_graph_module.device_stack_has_callable(
             default_graph._device_function_stack)):
       # Putting the device in the cache key ensures that call-site device
       # annotations are respected.
diff --git a/tensorflow/python/eager/function_cache_test.py b/tensorflow/python/eager/function_cache_test.py
index 57e29da9b977b0..751adaaf9de0d5 100644
--- a/tensorflow/python/eager/function_cache_test.py
+++ b/tensorflow/python/eager/function_cache_test.py
@@ -22,6 +22,11 @@
 from tensorflow.python.platform import test
 
 
+class DummyClass:
+  """Helps test Weakref deletion."""
+  pass
+
+
 class MockSubtypeOf2(function_trace_type.GenericType):
 
   def is_subtype_of(self, other):
@@ -42,14 +47,14 @@ class FunctionCacheTest(test.TestCase):
   def testConcreteFunctionDictRetainsInsertedKeys(self):
     cache = function_cache.FunctionCache()
 
-    key_1 = function_cache.make_cache_key(1)
+    key_1, deletion_observer_1 = function_cache.make_cache_key(1)
     self.assertIsNone(cache.lookup(key_1, False))
 
-    key_2 = function_cache.make_cache_key(2)
-    key_3 = function_cache.make_cache_key(3)
+    key_2, deletion_observer_2 = function_cache.make_cache_key(2)
+    key_3, _ = function_cache.make_cache_key(3)
 
-    cache.add(key_1, "test_1")
-    cache.add(key_2, "test_2")
+    cache.add(key_1, deletion_observer_1, "test_1")
+    cache.add(key_2, deletion_observer_2, "test_2")
 
     self.assertEqual(cache.lookup(key_1, False), "test_1")
     self.assertEqual(cache.lookup(key_2, False), "test_2")
@@ -58,12 +63,12 @@ def testConcreteFunctionDictRetainsInsertedKeys(self):
   def testClearRemovesAllConcreteFunctions(self):
     cache = function_cache.FunctionCache()
 
-    key_1 = function_cache.make_cache_key(1)
-    key_2 = function_cache.make_cache_key(2)
-    key_3 = function_cache.make_cache_key(3)
+    key_1, deletion_observer_1 = function_cache.make_cache_key(1)
+    key_2, deletion_observer_2 = function_cache.make_cache_key(2)
+    key_3, _ = function_cache.make_cache_key(3)
 
-    cache.add(key_1, "test_1")
-    cache.add(key_2, "test_2")
+    cache.add(key_1, deletion_observer_1, "test_1")
+    cache.add(key_2, deletion_observer_2, "test_2")
 
     self.assertEqual(cache.lookup(key_1, False), "test_1")
     self.assertEqual(cache.lookup(key_2, False), "test_2")
@@ -77,17 +82,18 @@ def testClearRemovesAllConcreteFunctions(self):
 
   def testDeleteRemovesConcreteFunctions(self):
     cache = function_cache.FunctionCache()
-    key_1 = function_cache.make_cache_key(1)
-    cache.add(key_1, "test_1")
+    key_1, deletion_observer_1 = function_cache.make_cache_key(1)
+    cache.add(key_1, deletion_observer_1, "test_1")
     self.assertEqual(cache.lookup(key_1, False), "test_1")
     cache.delete(key_1)
     self.assertIsNone(cache.lookup(key_1, False))
 
-    key_2 = MockSubtypeOf2(3)
-    cache.add(key_2, "test_2")
+    key_2 = function_cache.FunctionCacheKey(MockSubtypeOf2(2), None)
+    cache.add(key_2, function_trace_type.WeakrefDeletionObserver(),
+              "test_2")
     self.assertEqual(cache.lookup(key_2, False), "test_2")
 
-    key_3 = MockSubtypeOf2(2)
+    key_3 = function_cache.FunctionCacheKey(MockSubtypeOf2(3), None)
     self.assertEqual(cache.lookup(key_3, True), "test_2")
 
     cache.delete(key_2)
@@ -141,6 +147,54 @@ def testFunctionCacheKeyRespectsSupertype(self):
         function_cache.FunctionCacheKey(MockSupertypes2With3(3), ctx))
     self.assertIsNone(key_a.most_specific_common_subtype([key_b]))
 
+  def testWeakRefDeletionAlsoDeletesConcreteFunction(self):
+    if not function_cache.DELETE_WITH_WEAKREF:
+      self.skipTest("Weakref-Based Deletion is disabled")
+
+    dummy_object = DummyClass()
+    key, deletion_observer = function_cache.make_cache_key(dummy_object)
+
+    cache = function_cache.FunctionCache()
+    cache.add(key, deletion_observer, "testing")
+    self.assertEqual(cache.lookup(key, False), "testing")
+
+    del dummy_object
+    self.assertIsNone(cache.lookup(key, False))
+
+  def testMultipleObjectsWeakRefDeletion(self):
+    if not function_cache.DELETE_WITH_WEAKREF:
+      self.skipTest("Weakref-Based Deletion is disabled")
+
+    dummy_object_1 = DummyClass()
+    dummy_object_2 = DummyClass()
+    key, deletion_observer = function_cache.make_cache_key(
+        (dummy_object_1, dummy_object_2))
+
+    cache = function_cache.FunctionCache()
+    cache.add(key, deletion_observer, "testing")
+    self.assertEqual(cache.lookup(key, False), "testing")
+
+    del dummy_object_1
+    self.assertIsNone(cache.lookup(key, False))
+
+    del dummy_object_2
+    self.assertIsNone(cache.lookup(key, False))
+
+  def testObjectDeletedDuringFunctionCallDoesntAddConcreteFunction(self):
+    if not function_cache.DELETE_WITH_WEAKREF:
+      self.skipTest("Weakref-Based Deletion is disabled")
+
+    def second(o):
+      return function_cache.make_cache_key(o)
+
+    def first():
+      return second(DummyClass())
+
+    key, deletion_observer = first()
+    cache = function_cache.FunctionCache()
+    cache.add(key, deletion_observer, "testing")
+    self.assertIsNone(cache.lookup(key, False))
+
 
 class FunctionCacheBenchmark(test.Benchmark):
 
@@ -160,7 +214,7 @@ def benchmarkCacheHit50thKeyMiss(self):
       keys.append(function_cache.make_cache_key(args))
 
     for key in keys[:-1]:
-      cache.add(key, "testing")
+      cache.add(*key, "testing")
 
     iterations = 10000
     subtyping_time = timeit.timeit(
@@ -199,7 +253,7 @@ def benchmarkCacheHit50thKeyEqual(self):
       keys.append(function_cache.make_cache_key(args))
 
     for key in keys:
-      cache.add(key, "testing")
+      cache.add(*key, "testing")
 
     iterations = 10000
     subtyping_time = timeit.timeit(
@@ -231,20 +285,23 @@ def benchmarkCacheHit50thKeyKnownSubtype(self):
     num_total_checks = 50
 
     keys = []
-    for i in range(num_total_checks-1):
+    for i in range(num_total_checks - 1):
       args = []
       for j in range(args_per_call):
         args.append(array_ops.zeros([i, j]))
       keys.append(function_cache.make_cache_key(args))
 
     for key in keys:
-      cache.add(key, "testing")
-    cache.add(MockSubtypeOf2(3), "testing")
-    cache.lookup(MockSubtypeOf2(2), True)
+      cache.add(*key, "testing")
+    cache.add(
+        function_cache.FunctionCacheKey(MockSubtypeOf2(2), None),
+        function_trace_type.WeakrefDeletionObserver(), "testing")
+    cache.lookup(function_cache.FunctionCacheKey(MockSubtypeOf2(3), None), True)
 
     iterations = 10000
+    lookup_key = function_cache.FunctionCacheKey(MockSubtypeOf2(2), None)
     subtyping_time = timeit.timeit(
-        lambda: cache.lookup(MockSubtypeOf2(2), True), number=iterations)
+        lambda: cache.lookup(lookup_key, True), number=iterations)
 
     self.report_benchmark(
         name="cache_hit_50th_key_known_subtype",
@@ -264,7 +321,7 @@ def benchmarkCacheHit50thKeyUnknownSubtype(self):
     num_total_checks = 50
 
     keys = []
-    for i in range(num_total_checks-1):
+    for i in range(num_total_checks - 1):
       args = []
       for j in range(args_per_call):
         args.append(array_ops.zeros([i, j]))
@@ -273,15 +330,19 @@ def benchmarkCacheHit50thKeyUnknownSubtype(self):
     def setup():
       cache.clear()
       for key in keys:
-        cache.add(key, "testing")
-      cache.add(MockSubtypeOf2(3), "testing")
+        cache.add(*key, "testing")
+      cache.add(
+          function_cache.FunctionCacheKey(MockSubtypeOf2(3), None),
+          function_trace_type.WeakrefDeletionObserver(), "testing")
 
     iterations = 10000
-    subtyping_time = sum(timeit.repeat(
-        stmt=lambda: cache.lookup(MockSubtypeOf2(2), True),
-        setup=setup,
-        repeat=iterations,
-        number=1))
+    lookup_key = function_cache.FunctionCacheKey(MockSubtypeOf2(2), None)
+    subtyping_time = sum(
+        timeit.repeat(
+            stmt=lambda: cache.lookup(lookup_key, True),
+            setup=setup,
+            repeat=iterations,
+            number=1))
 
     self.report_benchmark(
         name="cache_hit_50th_key_unknown_subtype",
@@ -292,5 +353,6 @@ def setup():
             "value": subtyping_time / iterations * 1000
         }])
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index be3da584f1f62d..740d05444816a3 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -1949,14 +1949,15 @@ def __hash__(self):
         return 42
 
     def func(foo):
-      del foo
-      return
+      return constant_op.constant([id(foo)])
 
     defined = function.defun(func)
-    defined(Foo())
+    foo_1 = Foo()
+    defined(foo_1)
     self.assertLen(total_function_cache(defined), 1)
 
-    defined(Foo())
+    foo_2 = Foo()
+    defined(foo_2)
     self.assertLen(total_function_cache(defined), 2)
 
   def testCacheTensorDtypeCollision(self):
diff --git a/tensorflow/python/eager/function_trace_type.py b/tensorflow/python/eager/function_trace_type.py
index 19e35d84543981..15b27064515d53 100644
--- a/tensorflow/python/eager/function_trace_type.py
+++ b/tensorflow/python/eager/function_trace_type.py
@@ -14,7 +14,7 @@
 # ==============================================================================
 """Utitiles for Cache Key generation based on Function Trace Type."""
 
-from typing import Dict, Hashable, Optional, Sequence, Tuple, Type
+from typing import Dict, Hashable, Optional, Sequence, Tuple, Type, Callable
 
 from tensorflow.python import pywrap_tfe
 from tensorflow.python.eager import core
@@ -23,10 +23,43 @@
 from tensorflow.python.util import _pywrap_utils
 
 
+class WeakrefDeletionObserver:
+  """An observer for the event of deleting a weakref.
+
+  This allows users of FunctionTraceType to be notified when an instance which
+  depends on a weakref becomes invalid by the deletion of the weakref. In
+  particular, tf.function caches can use this mechanism to clear the cache of
+  keys that are no longer valid.
+
+  We use the observer pattern and not just basic callbacks because the keys
+  are typically created before they are used by the cache.
+  """
+
+  def __init__(self):
+    self._triggered = False
+    self._callables = []
+
+  def add_listener(self, on_delete: Callable[[], None]):
+    if self._triggered:
+      on_delete()
+    else:
+      self._callables.append(on_delete)
+
+  def weakref_deleted(self):
+    self._triggered = True
+    for c in self._callables:
+      c()
+
+  def __call__(self, _):
+    """Call handler for convenience of use with weakref."""
+    self.weakref_deleted()
+
+
 class SignatureContext(trace.TracingContext):
   """Container for variables and flags shared across signature tracing."""
 
   def __init__(self, include_tensor_ranks_only=False):
+    self._deletion_observer = WeakrefDeletionObserver()
     self._include_tensor_ranks_only = include_tensor_ranks_only
     self._global_to_local_id = {}
 
@@ -44,6 +77,11 @@ def get_local_id(self, local_id):
   def include_tensor_ranks_only(self):
     return self._include_tensor_ranks_only
 
+  @property
+  def deletion_observer(self):
+    """Returns a functor which invalidates the current key when called."""
+    return self._deletion_observer
+
 
 class GenericType(trace.TraceType):
   """Represents an arbitrary Python object."""
@@ -77,10 +115,12 @@ def __repr__(self):
     return f"{self.__class__.__name__}(obj={self._object!r})"
 
 
-# TODO(b/182990542): Trigger function cache to remove associated concrete
-# function at the deletion of referrant.
 class WeakrefType(GenericType):
-  """Represents weakref of an arbitrary Python object."""
+  """Represents weakref of an arbitrary Python object.
+
+  When a function argument is a custom class, instead of making a copy of it
+  just for the sake of function cache, a weakref is instead kept to save memory.
+  """
 
   def __eq__(self, other):
     if not isinstance(other, trace.TraceType):
@@ -271,27 +311,29 @@ def __repr__(self):
 _pywrap_utils.RegisterType("DictType", DictType)
 
 
-def get_arg_spec(inputs, include_tensor_ranks_only,
-                 encode_variables_by_resource_id, use_full_trace_type):
+def make_function_signature(
+    function_args,
+    signature_context: SignatureContext,
+    encode_variables_by_resource_id,
+    use_full_trace_type) -> trace.TraceType:
   """Returns the trace type specification of a function's arguments.
 
   Args:
-    inputs: Tuple/List/Dict structure containing the function arguments
-    include_tensor_ranks_only: If Tensors should be considered by rank
+    function_args: Tuple/List/Dict structure containing the function arguments
+    signature_context: The SignatureContext to be shared during protocol calls.
     encode_variables_by_resource_id: If Variables should be considered by
       resource id
     use_full_trace_type: Uses the TraceType protocol wherever possible.
 
   Returns:
-    A TraceType object representing the function arguments.
+    A TraceType object representing all the given inputs.
   """
 
-  signature_context = SignatureContext(include_tensor_ranks_only)
   try:
-    encoding = pywrap_tfe.TFE_Py_EncodeArg(inputs, signature_context,
-                                           include_tensor_ranks_only,
-                                           encode_variables_by_resource_id,
-                                           use_full_trace_type)
+    encoding = pywrap_tfe.TFE_Py_EncodeArg(
+        function_args, signature_context,
+        signature_context.include_tensor_ranks_only,
+        encode_variables_by_resource_id, use_full_trace_type)
     if use_full_trace_type:
       return encoding
     else:
diff --git a/tensorflow/python/eager/function_trace_type_test.py b/tensorflow/python/eager/function_trace_type_test.py
index 698ce7ac551854..aba448925a39e9 100644
--- a/tensorflow/python/eager/function_trace_type_test.py
+++ b/tensorflow/python/eager/function_trace_type_test.py
@@ -57,6 +57,11 @@ class DummyGenericClass:
   pass
 
 
+def make_function_signature_with_context(inputs):
+  return function_trace_type.make_function_signature(
+      inputs, function_trace_type.SignatureContext(), True, True)
+
+
 class CacheKeyGenerationTest(test.TestCase, parameterized.TestCase):
 
   @combinations.generate(combinations.combine(mode=['eager']))
@@ -65,14 +70,14 @@ def testIteratorAliasing(self):
     it2 = iter(dataset_ops.DatasetV2.from_tensor_slices([1, 2, 3]))
 
     self.assertEqual(
-        function_trace_type.get_arg_spec((it1, it1), False, False, True),
-        function_trace_type.get_arg_spec((it2, it2), False, False, True))
+        make_function_signature_with_context((it1, it1)),
+        make_function_signature_with_context((it2, it2)))
     self.assertEqual(
-        function_trace_type.get_arg_spec((it1, it2), False, False, True),
-        function_trace_type.get_arg_spec((it2, it1), False, False, True))
+        make_function_signature_with_context((it1, it2)),
+        make_function_signature_with_context((it2, it1)))
     self.assertNotEqual(
-        function_trace_type.get_arg_spec((it1, it1), False, False, True),
-        function_trace_type.get_arg_spec((it1, it2), False, False, True))
+        make_function_signature_with_context((it1, it1)),
+        make_function_signature_with_context((it1, it2)))
 
   @combinations.generate(combinations.combine(mode=['graph', 'eager']))
   def testIteratorTypesImplementTracing(self):
@@ -88,26 +93,23 @@ def testCompositeAndSpec(self):
     spec = ragged_tensor.RaggedTensorSpec([2, None], dtypes.int32)
 
     self.assertEqual(
-        function_trace_type.get_arg_spec(composite_tensor, False, False, True),
-        function_trace_type.get_arg_spec(spec, False, False, True))
+        make_function_signature_with_context(composite_tensor),
+        make_function_signature_with_context(spec))
 
   @combinations.generate(combinations.combine(mode=['graph', 'eager']))
   def testVariableAliasing(self):
     v1 = resource_variable_ops.ResourceVariable([1])
     v2 = resource_variable_ops.ResourceVariable([1])
     v3 = resource_variable_ops.ResourceVariable([1])
-    all_unique = function_trace_type.get_arg_spec((v1, v2, v3), False, True,
-                                                  True)
-    all_same = function_trace_type.get_arg_spec((v1, v1, v1), False, True, True)
+    all_unique = make_function_signature_with_context((v1, v2, v3))
+    all_same = make_function_signature_with_context((v1, v1, v1))
     self.assertNotEqual(all_unique, all_same)
 
     v3 = resource_variable_ops.ResourceVariable([2])
     v4 = resource_variable_ops.ResourceVariable([2])
     v5 = resource_variable_ops.ResourceVariable([2])
-    all_unique_again = function_trace_type.get_arg_spec((v3, v4, v5), False,
-                                                        True, True)
-    all_same_again = function_trace_type.get_arg_spec((v4, v4, v4), False, True,
-                                                      True)
+    all_unique_again = make_function_signature_with_context((v3, v4, v5))
+    all_same_again = make_function_signature_with_context((v4, v4, v4))
     self.assertEqual(all_unique, all_unique_again)
     self.assertEqual(all_same, all_same_again)
 
@@ -155,9 +157,7 @@ def testTensorShapeUnknown(self):
   def testAttrsCacheKeyGeneration(self):
     if attr is None:
       self.skipTest('attr module is unavailable.')
-
-    trace_a = function_trace_type.get_arg_spec(
-        TestAttrsClass(1, 2), False, False, True)
+    trace_a = make_function_signature_with_context(TestAttrsClass(1, 2))
     expected = function_trace_type.AttrsType(
         TestAttrsClass, (function_trace_type.GenericType(1),
                          function_trace_type.GenericType(2)))
@@ -166,11 +166,10 @@ def testAttrsCacheKeyGeneration(self):
 
   @combinations.generate(combinations.combine(mode=['graph', 'eager']))
   def testTupleEquality(self):
-    trace_a = function_trace_type.get_arg_spec((1, 2, 3, 4), False, False, True)
-    trace_b = function_trace_type.get_arg_spec((1, 2, 2, 4), False, False, True)
-    trace_c = function_trace_type.get_arg_spec((1, 2, 3), False, False, True)
-    trace_d = function_trace_type.get_arg_spec((1, 2, 3, 4), False, False, True)
-
+    trace_a = make_function_signature_with_context((1, 2, 3, 4))
+    trace_b = make_function_signature_with_context((1, 2, 2, 4))
+    trace_c = make_function_signature_with_context((1, 2, 3))
+    trace_d = make_function_signature_with_context((1, 2, 3, 4))
     self.assertNotEqual(trace_a, trace_b)
     self.assertNotEqual(trace_a, trace_c)
     self.assertNotEqual(trace_b, trace_c)
@@ -178,11 +177,10 @@ def testTupleEquality(self):
 
   @combinations.generate(combinations.combine(mode=['graph', 'eager']))
   def testListEquality(self):
-    trace_a = function_trace_type.get_arg_spec([1, 2, 3, 4], False, False, True)
-    trace_b = function_trace_type.get_arg_spec([1, 2, 2, 4], False, False, True)
-    trace_c = function_trace_type.get_arg_spec([1, 2, 3], False, False, True)
-    trace_d = function_trace_type.get_arg_spec([1, 2, 3, 4], False, False, True)
-
+    trace_a = make_function_signature_with_context([1, 2, 3, 4])
+    trace_b = make_function_signature_with_context([1, 2, 2, 4])
+    trace_c = make_function_signature_with_context([1, 2, 3])
+    trace_d = make_function_signature_with_context([1, 2, 3, 4])
     self.assertNotEqual(trace_a, trace_b)
     self.assertNotEqual(trace_a, trace_c)
     self.assertNotEqual(trace_b, trace_c)
@@ -190,11 +188,10 @@ def testListEquality(self):
 
   @combinations.generate(combinations.combine(mode=['graph', 'eager']))
   def testDictEquality(self):
-    trace_a = function_trace_type.get_arg_spec({1: 2, 3: 4}, False, False, True)
-    trace_b = function_trace_type.get_arg_spec({1: 2, 3: 2}, False, False, True)
-    trace_c = function_trace_type.get_arg_spec({1: 2, 3: 0}, False, False, True)
-    trace_d = function_trace_type.get_arg_spec({3: 4, 1: 2}, False, False, True)
-
+    trace_a = make_function_signature_with_context({1: 2, 3: 4})
+    trace_b = make_function_signature_with_context({1: 2, 3: 2})
+    trace_c = make_function_signature_with_context({1: 2, 3: 0})
+    trace_d = make_function_signature_with_context({3: 4, 1: 2})
     self.assertNotEqual(trace_a, trace_b)
     self.assertNotEqual(trace_a, trace_c)
     self.assertNotEqual(trace_b, trace_c)
@@ -203,8 +200,8 @@ def testDictEquality(self):
   @combinations.generate(combinations.combine(mode=['graph', 'eager']))
   def testComplexStruct(self):
     struct = {(1, 2, 3): {(1, 2): {12: 2}}, (3, 2, 3): (2, {2: 3})}
-    trace_a = function_trace_type.get_arg_spec(struct, False, False, True)
-    trace_b = function_trace_type.get_arg_spec(struct, False, False, True)
+    trace_a = make_function_signature_with_context(struct)
+    trace_b = make_function_signature_with_context(struct)
     self.assertEqual(trace_a, trace_b)
     self.assertTrue(trace_a.is_subtype_of(trace_b))
     self.assertTrue(trace_b.is_subtype_of(trace_a))
@@ -214,32 +211,31 @@ class CacheKeyMemoryTest(test.TestCase):
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
   def testGeneric(self):
-    function_trace_type.get_arg_spec(1, False, True, True)
-    function_trace_type.get_arg_spec(DummyGenericClass(), False, True, True)
+    make_function_signature_with_context(1)
+    make_function_signature_with_context(DummyGenericClass())
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
   def testTensor(self):
     tensor = array_ops.zeros([10])
-    function_trace_type.get_arg_spec(tensor, False, True, True)
+    make_function_signature_with_context(tensor)
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
   def testTuple(self):
-    function_trace_type.get_arg_spec((1, 2, 3), False, True, True)
+    make_function_signature_with_context((1, 2, 3))
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
   def testDict(self):
-    function_trace_type.get_arg_spec({1: 1, 2: 2, 3: 3}, False, True, True)
+    make_function_signature_with_context({1: 1, 2: 2, 3: 3})
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
   def testList(self):
-    function_trace_type.get_arg_spec([1, 2, 3], False, True, True)
+    make_function_signature_with_context([1, 2, 3])
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
   def testAttrs(self):
     if attr is None:
       self.skipTest('attr module is unavailable.')
-
-    function_trace_type.get_arg_spec(TestAttrsClass(1, 2), False, True, True)
+    make_function_signature_with_context(TestAttrsClass(1, 2))
 
 
 class CacheKeyGenerationBenchmark(test.Benchmark):
@@ -251,7 +247,7 @@ def benchmarkTensor(self):
       tensors.append(array_ops.zeros(s))
 
     def encode_tensors(tensors):
-      function_trace_type.get_arg_spec(tensors, False, False, True)
+      make_function_signature_with_context(tensors)
 
     iterations = 100000
     t = timeit.timeit(lambda: encode_tensors(tensors), number=iterations)
@@ -271,7 +267,7 @@ def benchmarkTensorSpec(self):
       tensor_specs.append(tensor_spec.TensorSpec(s, dtypes.int32))
 
     def encode_tensor_specs(tensor_specs):
-      function_trace_type.get_arg_spec(tensor_specs, False, False, True)
+      make_function_signature_with_context(tensor_specs)
 
     iterations = 100000
     t = timeit.timeit(
@@ -293,7 +289,7 @@ def benchmarkVariable(self):
     ]
 
     def encode_variables(var_list):
-      function_trace_type.get_arg_spec(var_list, False, False, True)
+      make_function_signature_with_context(var_list)
 
     iterations = 10000
     t = timeit.timeit(lambda: encode_variables(var_list), number=iterations)
@@ -313,7 +309,7 @@ def benchmarkKerasModel(self):
     model = keras.Model(inputs=inputs, outputs=outputs)
 
     def encode_model(model):
-      function_trace_type.get_arg_spec(model, False, False, True)
+      make_function_signature_with_context(model)
 
     iterations = 100000
     t = timeit.timeit(lambda: encode_model(model), number=iterations)
@@ -360,7 +356,7 @@ def benchmarkNestedStruct(self):
     struct = {(1, 2, 3): {(1, 2): {12: 2}}, (3, 2, 3): (2, {2: 3})}
 
     def encode_struct(struct):
-      function_trace_type.get_arg_spec(struct, False, False, True)
+      make_function_signature_with_context(struct)
 
     iterations = 100000
     t = timeit.timeit(lambda: encode_struct(struct), number=iterations)
@@ -411,11 +407,9 @@ def __hash__(self):
 
     object_a = CustomUnequable()
     object_b = CustomUnequable()
-
-    trace_a_1 = function_trace_type.get_arg_spec(object_a, False, True, True)
-    trace_a_2 = function_trace_type.get_arg_spec(object_a, False, True, True)
-    trace_b = function_trace_type.get_arg_spec(object_b, False, True, True)
-
+    trace_a_1 = make_function_signature_with_context(object_a)
+    trace_a_2 = make_function_signature_with_context(object_a)
+    trace_b = make_function_signature_with_context(object_b)
     self.assertEqual(trace_a_1, trace_a_2)
 
     with self.assertRaises(ValueError):
@@ -440,7 +434,7 @@ def __eq__(self, o):
     with self.assertRaisesRegex(
         errors.InvalidArgumentError,
         r'could not be represented through the generic tracing type'):
-      function_trace_type.get_arg_spec(obj, False, True, True)
+      make_function_signature_with_context(obj)
 
   def testOrderedCollectionTypeEquality(self):
     collection = function_trace_type.OrderedCollectionType
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 21941579062f5d..fc48d07218ecc6 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -4394,9 +4394,15 @@ tensorflow::StatusOr<PyObject*> MakeAttrsType(PyObject* object,
       tensorflow::swig::GetRegisteredPyObject("AttrsType"), call_args.get());
 }
 
-tensorflow::StatusOr<PyObject*> EncodeGenericObject(PyObject* object) {
-  tensorflow::Safe_PyObjectPtr ref(PyWeakref_NewRef(object, nullptr));
+tensorflow::StatusOr<PyObject*> EncodeGenericObject(PyObject* object,
+                                                    PyObject* context) {
   std::string type_name = "WeakrefType";
+
+  tensorflow::Safe_PyObjectPtr deletion_observer(
+      PyObject_GetAttrString(context, "deletion_observer"));
+  tensorflow::Safe_PyObjectPtr ref(
+      PyWeakref_NewRef(object, deletion_observer.get()));
+
   if (ref == nullptr) {
     // Happens if the type can not be weakly referenceed (such as int).
     // https://docs.python.org/3/library/weakref.html
@@ -4444,7 +4450,7 @@ tensorflow::StatusOr<PyObject*> EncodeTraceType(PyObject* object,
     return MakeAttrsType(object, context);
   }
 
-  return EncodeGenericObject(object);
+  return EncodeGenericObject(object, context);
 }
 
 tensorflow::Status EncodeArgLegacy(PyObject* arg, EncodingContext& context) {
diff --git a/tensorflow/python/eager/pywrap_tfe_test.py b/tensorflow/python/eager/pywrap_tfe_test.py
index dcadba4d2ed554..9e23e0651bdac7 100644
--- a/tensorflow/python/eager/pywrap_tfe_test.py
+++ b/tensorflow/python/eager/pywrap_tfe_test.py
@@ -37,6 +37,7 @@
 from tensorflow.python.ops import resource_variable_ops
 
 
+@test_util.with_eager_op_as_function
 class Tests(test.TestCase):
 
   @test_util.assert_no_new_tensors
@@ -254,12 +255,13 @@ def testSlowPathExecute_VeryLargeOutputs(self):
   @test_util.assert_no_garbage_created
   def testInvalidNumOutputs(self):
     with self.assertRaisesRegex(
-        Exception, r"Value for number_attr\(\) -1 < 0 \[Op:Split\]"):
+        Exception, r"Value for number_attr\(\) -1 < 0 \[Op:Split\]|"
+        r"Value for attr 'num_split' of -1 must be at least minimum 1"):
       array_ops.split(value=[1, 2, 3], num_or_size_splits=-1)
 
     with self.assertRaisesRegex(
         Exception,
-        "Value for attr 'num_split' of 0 must be at least minimum 1"):
+        r"Value for attr 'num_split' of 0 must be at least minimum 1"):
       array_ops.split(value=[1, 2, 3], num_or_size_splits=0)
 
   def testIsFunction(self):
diff --git a/tensorflow/python/framework/BUILD b/tensorflow/python/framework/BUILD
index 98de8ac1beb172..49697552733274 100644
--- a/tensorflow/python/framework/BUILD
+++ b/tensorflow/python/framework/BUILD
@@ -277,7 +277,7 @@ py_library(
         ":tensor_shape",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:execute",
-        "//tensorflow/python/profiler:traceme",
+        "//tensorflow/python/profiler:trace",
     ],
 )
 
@@ -905,7 +905,7 @@ py_library(
         "//tensorflow/python/eager:core",
         "//tensorflow/python/eager:monitoring",
         "//tensorflow/python/eager:tape",
-        "//tensorflow/python/profiler:traceme",
+        "//tensorflow/python/profiler:trace",
         "//tensorflow/python/util",
         "@six_archive//:six",
     ],
@@ -1407,7 +1407,7 @@ py_library(
     srcs_version = "PY3",
     deps = [
         ":_python_memory_checker_helper",
-        "//tensorflow/python/profiler:traceme",
+        "//tensorflow/python/profiler:trace",
     ],
 )
 
@@ -1468,6 +1468,7 @@ tf_py_test(
     srcs = ["error_interpolation_test.py"],
     main = "error_interpolation_test.py",
     python_version = "PY3",
+    tags = ["no_oss_py310"],  # b/209088097
     deps = [
         ":constant_op",
         ":error_interpolation",
@@ -1911,6 +1912,7 @@ tf_py_test(
         ":test_lib",
         ":test_ops",
         "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:lookup_ops",
         "//tensorflow/python:platform_test",
diff --git a/tensorflow/python/framework/memory_checker.py b/tensorflow/python/framework/memory_checker.py
index fcbd6f8c54f065..6ab7630a441042 100644
--- a/tensorflow/python/framework/memory_checker.py
+++ b/tensorflow/python/framework/memory_checker.py
@@ -15,8 +15,7 @@
 """Memory leak detection utility."""
 
 from tensorflow.python.framework.python_memory_checker import _PythonMemoryChecker
-from tensorflow.python.profiler.traceme import TraceMe
-from tensorflow.python.profiler.traceme import traceme_wrapper
+from tensorflow.python.profiler import trace
 from tensorflow.python.util import tf_inspect
 
 try:
@@ -68,22 +67,20 @@ class MemoryChecker(object):
   is a leak, it's happening similarly on every snapshot.
   """
 
-  @traceme_wrapper
+  @trace.trace_wrapper
   def __enter__(self):
-    self._trace_me = TraceMe('with MemoryChecker():')
-    self._trace_me.__enter__()
     self._python_memory_checker = _PythonMemoryChecker()
     if CppMemoryChecker:
       self._cpp_memory_checker = CppMemoryChecker(_get_test_name_best_effort())
     return self
 
-  @traceme_wrapper
+  @trace.trace_wrapper
   def __exit__(self, exc_type, exc_value, traceback):
     if CppMemoryChecker:
       self._cpp_memory_checker.stop()
-    self._trace_me.__exit__(exc_type, exc_value, traceback)
 
-  @traceme_wrapper
+  # We do not enable trace_wrapper on this function to avoid contaminating
+  # the snapshot.
   def record_snapshot(self):
     """Take a memory snapshot for later analysis.
 
@@ -98,7 +95,7 @@ def record_snapshot(self):
     if CppMemoryChecker:
       self._cpp_memory_checker.record_snapshot()
 
-  @traceme_wrapper
+  @trace.trace_wrapper
   def report(self):
     """Generates a html graph file showing allocations over snapshots.
 
@@ -110,7 +107,7 @@ def report(self):
     if CppMemoryChecker:
       self._cpp_memory_checker.report()
 
-  @traceme_wrapper
+  @trace.trace_wrapper
   def assert_no_leak_if_all_possibly_except_one(self):
     """Raises an exception if a leak is detected.
 
@@ -123,7 +120,7 @@ def assert_no_leak_if_all_possibly_except_one(self):
     if CppMemoryChecker:
       self._cpp_memory_checker.assert_no_leak_if_all_possibly_except_one()
 
-  @traceme_wrapper
+  @trace.trace_wrapper
   def assert_no_new_python_objects(self, threshold=None):
     """Raises an exception if there are new Python objects created.
 
diff --git a/tensorflow/python/framework/memory_checker_test.py b/tensorflow/python/framework/memory_checker_test.py
index ba9ec667b696bc..573f8b3c8fd995 100644
--- a/tensorflow/python/framework/memory_checker_test.py
+++ b/tensorflow/python/framework/memory_checker_test.py
@@ -119,39 +119,36 @@ def testLeak4(self):
       memory_checker.assert_no_leak_if_all_possibly_except_one()
 
   def testNoNewPythonObjectsEmpty(self):
-    self.skipTest('TODO(b/150324603): Flaky test.')
     with MemoryChecker() as memory_checker:
       memory_checker.record_snapshot()
       memory_checker.record_snapshot()
 
-    # TODO(kkb): All the builtins below are unexpected, locate and fix it.
-    memory_checker.assert_no_new_python_objects(
-        threshold={'builtins.weakref': 1,
-                   'builtins.function': 1})
+    memory_checker.assert_no_new_python_objects()
 
   def testNewPythonObjects(self):
     with MemoryChecker() as memory_checker:
       memory_checker.record_snapshot()
-      x = constant_op.constant(1)  # pylint: disable=unused-variable
+      x = constant_op.constant(1)
       memory_checker.record_snapshot()
 
     with self.assertRaisesRegex(AssertionError, 'New Python objects'):
       memory_checker.assert_no_new_python_objects()
 
+    # use x to avoid any potential for optimizing it away.
+    self.assertIsNot(x, None)
+
   def testNewPythonObjectBelowThreshold(self):
-    self.skipTest('This test is flaky: b/206443120.')
 
     class Foo(object):
       pass
 
     with MemoryChecker() as memory_checker:
       memory_checker.record_snapshot()
-      foo = Foo()  # pylint: disable=unused-variable
+      foo = Foo()
+      del foo
       memory_checker.record_snapshot()
 
-    memory_checker.assert_no_new_python_objects(threshold={
-        '__main__.Foo': 1,
-    })
+    memory_checker.assert_no_new_python_objects()
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 294d76fa16b1aa..c83aed8e329faa 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -292,8 +292,10 @@ def __init__(self, signature_context, shape, dtype, name):
     self.name = name
     self.shape_rank = shape.rank
 
-    if self.shape_rank is None or signature_context.include_tensor_ranks_only:
+    if self.shape_rank is None:
       self.shape_dims = None
+    elif signature_context.include_tensor_ranks_only:
+      self.shape_dims = (None,) * self.shape_rank
     else:
       self.shape_dims = tuple(shape.as_list())
 
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 6270644603d555..ced09075b8cb93 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -472,10 +472,11 @@ def testBitwiseNotErrors(self):
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class TensorTypeTest(test_util.TensorFlowTestCase):
+class TensorTypeTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
-  def testEqualTypes(self):
-    signature_context = function_trace_type.SignatureContext(False)
+  @parameterized.parameters([True, False])
+  def testEqualTypes(self, shape_relaxation):
+    signature_context = function_trace_type.SignatureContext(shape_relaxation)
     type_1 = ops.TensorType(signature_context,
                             tensor_shape.TensorShape([1, 2, 3]), dtypes.float32,
                             None)
@@ -488,8 +489,9 @@ def testEqualTypes(self):
     self.assertTrue(type_2.is_subtype_of(type_1))
     self.assertTrue(type_1.is_subtype_of(type_2))
 
-  def testDtypeMismatch(self):
-    signature_context = function_trace_type.SignatureContext(False)
+  @parameterized.parameters([True, False])
+  def testDtypeMismatch(self, shape_relaxation):
+    signature_context = function_trace_type.SignatureContext(shape_relaxation)
     type_1 = ops.TensorType(signature_context,
                             tensor_shape.TensorShape([1, 2, 3]), dtypes.float32,
                             None)
@@ -500,8 +502,9 @@ def testDtypeMismatch(self):
     self.assertFalse(type_2.is_subtype_of(type_1))
     self.assertFalse(type_1.is_subtype_of(type_2))
 
-  def testSubtypeOfShapeless(self):
-    signature_context = function_trace_type.SignatureContext(False)
+  @parameterized.parameters([True, False])
+  def testSubtypeOfShapeless(self, shape_relaxation):
+    signature_context = function_trace_type.SignatureContext(shape_relaxation)
     type_1 = ops.TensorType(signature_context, tensor_shape.TensorShape(None),
                             dtypes.float32, None)
     type_2 = ops.TensorType(signature_context,
diff --git a/tensorflow/python/framework/python_memory_checker.py b/tensorflow/python/framework/python_memory_checker.py
index f0acd77fdfa9fd..aa0d2023585bf5 100644
--- a/tensorflow/python/framework/python_memory_checker.py
+++ b/tensorflow/python/framework/python_memory_checker.py
@@ -23,7 +23,7 @@
 
 from tensorflow.python.framework import _python_memory_checker_helper
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.profiler.traceme import traceme_wrapper
+from tensorflow.python.profiler import trace
 
 
 def _get_typename(obj):
@@ -62,8 +62,15 @@ class _PythonMemoryChecker(object):
 
   def __init__(self):
     self._snapshots = []
+    # cache the function used by mark_stack_trace_and_call to avoid
+    # contaminating the leak measurement.
+    def _record_snapshot():
+      self._snapshots.append(_create_python_object_snapshot())
 
-  @traceme_wrapper
+    self._record_snapshot = _record_snapshot
+
+  # We do not enable trace_wrapper on this function to avoid contaminating
+  # the snapshot.
   def record_snapshot(self):
     # Function called using `mark_stack_trace_and_call` will have
     # "_python_memory_checker_helper" string in the C++ stack trace.  This will
@@ -71,14 +78,14 @@ def record_snapshot(self):
     # because we are not interested in detecting memory growth caused by memory
     # checker itself.
     _python_memory_checker_helper.mark_stack_trace_and_call(
-        lambda: self._snapshots.append(_create_python_object_snapshot()))
+        self._record_snapshot)
 
-  @traceme_wrapper
+  @trace.trace_wrapper
   def report(self):
     # TODO(kkb): Implement.
     pass
 
-  @traceme_wrapper
+  @trace.trace_wrapper
   def assert_no_leak_if_all_possibly_except_one(self):
     """Raises an exception if a leak is detected.
 
@@ -108,7 +115,7 @@ def assert_no_leak_if_all_possibly_except_one(self):
           'These Python objects were allocated in every snapshot possibly '
           f'except one.\n\n{object_list_to_print}')
 
-  @traceme_wrapper
+  @trace.trace_wrapper
   def assert_no_new_objects(self, threshold=None):
     """Assert no new Python objects."""
 
@@ -130,13 +137,13 @@ def assert_no_new_objects(self, threshold=None):
                       f'{threshold}\n\nNew Python objects:\n'
                       f'{original_count_diff.most_common()}')
 
-  @traceme_wrapper
+  @trace.trace_wrapper
   def _snapshot_diff(self, old_index, new_index):
     return _snapshot_diff(self._snapshots[old_index],
                           self._snapshots[new_index],
                           self._get_internal_object_ids())
 
-  @traceme_wrapper
+  @trace.trace_wrapper
   def _get_internal_object_ids(self):
     ids = set()
     for snapshot in self._snapshots:
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index fe68a9dba4a0ce..b0755f95c61944 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -3413,23 +3413,33 @@ def assertRaisesIncompatibleShapesError(
         exception_type, r"Incompatible shapes|Dimensions must be equal|"
         r"required broadcastable shapes")
 
-  def assertShapeEqual(self, np_array, tf_tensor, msg=None):
-    """Asserts that a Numpy ndarray and a TensorFlow tensor have the same shape.
+  def assertShapeEqual(self, input_a, input_b, msg=None):
+    """Asserts that two Numpy or TensorFlow objects have the same shape.
+
+    For Tensors, this compares statically known shapes at compile time, not
+    dynamic shapes at runtime.
 
     Args:
-      np_array: A Numpy ndarray or Numpy scalar.
-      tf_tensor: A Tensor.
+      input_a: A Numpy ndarray, Numpy scalar, or a Tensor.
+      input_b: A Numpy ndarray, Numpy scalar, or a Tensor.
       msg: Optional message to report on failure.
 
     Raises:
       TypeError: If the arguments have the wrong type.
     """
-    if not isinstance(np_array, (np.ndarray, np.generic)):
-      raise TypeError("np_array must be a Numpy ndarray or Numpy scalar")
-    if not isinstance(tf_tensor, ops.Tensor):
-      raise TypeError("tf_tensor must be a Tensor")
-    self.assertAllEqual(
-        np_array.shape, tf_tensor.get_shape().as_list(), msg=msg)
+    if not isinstance(input_a, (np.ndarray, np.generic, ops.Tensor)):
+      raise TypeError(
+          "input_a must be a Numpy ndarray, Numpy scalar, or a Tensor."
+          f"Instead received {type(input_a)}")
+    if not isinstance(input_b, (np.ndarray, np.generic, ops.Tensor)):
+      raise TypeError(
+          "input_b must be a Numpy ndarray, Numpy scalar, or a Tensor."
+          f"Instead received {type(input_b)}")
+    shape_a = input_a.get_shape().as_list() if isinstance(
+        input_a, ops.Tensor) else input_a.shape
+    shape_b = input_b.get_shape().as_list() if isinstance(
+        input_b, ops.Tensor) else input_b.shape
+    self.assertAllEqual(shape_a, shape_b, msg=msg)
 
   def assertDeviceEqual(self, device1, device2, msg=None):
     """Asserts that the two given devices are the same.
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index 4c00face496f4f..6f4b65ed6e63ed 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -40,6 +40,7 @@
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_ops  # pylint: disable=unused-import
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
@@ -711,6 +712,65 @@ def testAssertAllInSet(self):
     with self.assertRaises(AssertionError):
       self.assertAllInSet(x, (42,))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testAssertShapeEqualSameInputTypes(self):
+    # Test with arrays
+    array_a = np.random.rand(3, 1)
+    array_b = np.random.rand(3, 1)
+    array_c = np.random.rand(4, 2)
+
+    self.assertShapeEqual(array_a, array_b)
+    with self.assertRaises(AssertionError):
+      self.assertShapeEqual(array_a, array_c)
+
+    # Test with tensors
+    tensor_x = random_ops.random_uniform((5, 2, 1))
+    tensor_y = random_ops.random_uniform((5, 2, 1))
+    tensor_z = random_ops.random_uniform((2, 4))
+
+    self.assertShapeEqual(tensor_x, tensor_y)
+    with self.assertRaises(AssertionError):
+      self.assertShapeEqual(tensor_x, tensor_z)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testAssertShapeEqualMixedInputTypes(self):
+
+    # Test mixed multi-dimensional inputs
+    array_input = np.random.rand(4, 3, 2)
+    tensor_input = random_ops.random_uniform((4, 3, 2))
+    tensor_input_2 = random_ops.random_uniform((10, 5))
+
+    self.assertShapeEqual(array_input, tensor_input)
+    self.assertShapeEqual(tensor_input, array_input)
+    with self.assertRaises(AssertionError):
+      self.assertShapeEqual(array_input, tensor_input_2)
+
+    # Test with scalar inputs
+    array_input = np.random.rand(1)
+    tensor_input = random_ops.random_uniform((1,))
+    tensor_input_2 = random_ops.random_uniform((3, 1))
+
+    self.assertShapeEqual(array_input, tensor_input)
+    self.assertShapeEqual(tensor_input, array_input)
+    with self.assertRaises(AssertionError):
+      self.assertShapeEqual(array_input, tensor_input_2)
+
+  def testAssertShapeEqualDynamicShapes(self):
+
+    array_a = np.random.rand(4)
+    values = [1, 1, 2, 3, 4, 4]
+
+    # Dynamic shape should be resolved in eager execution.
+    with context.eager_mode():
+      tensor_b = array_ops.unique(values)[0]
+      self.assertShapeEqual(array_a, tensor_b)
+
+    # Shape comparison should fail when a graph is traced but not evaluated.
+    with context.graph_mode():
+      tensor_c = array_ops.unique(values)[0]
+      with self.assertRaises(AssertionError):
+        self.assertShapeEqual(array_a, tensor_c)
+
   def testRandomSeed(self):
     # Call setUp again for WithCApi case (since it makes a new default graph
     # after setup).
diff --git a/tensorflow/python/keras/mixed_precision/BUILD b/tensorflow/python/keras/mixed_precision/BUILD
index 9e9a030711b1ee..69f1b1c867496a 100644
--- a/tensorflow/python/keras/mixed_precision/BUILD
+++ b/tensorflow/python/keras/mixed_precision/BUILD
@@ -216,7 +216,6 @@ cuda_py_test(
         ":test_util",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_v2_toggles",
-        "//tensorflow/python:loss_scaling_gradient_tape",
         "//tensorflow/python/distribute:mirrored_strategy",
         "//tensorflow/python/distribute:one_device_strategy",
         "//tensorflow/python/keras",
diff --git a/tensorflow/python/kernel_tests/array_ops/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
index 9634184039e1dd..e50dabc8120abd 100644
--- a/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops/array_ops_test.py
@@ -1580,6 +1580,20 @@ def testUnravelIndexZeroDim(self):
           dims = constant_op.constant([3, 0], dtype=dtype)
           self.evaluate(array_ops.unravel_index(indices=indices, dims=dims))
 
+  def testUnravelIndexIntegerOverflow(self):
+    with self.cached_session():
+      for dtype in [dtypes.int32, dtypes.int64]:
+        with self.assertRaisesRegex(
+            errors.InvalidArgumentError,
+            r"Input dims product is causing integer overflow"):
+          indices = constant_op.constant(-0x100000, dtype=dtype)
+          if dtype == dtypes.int32:
+            value = 0x10000000
+          else:
+            value = 0x7FFFFFFFFFFFFFFF
+          dims = constant_op.constant([value, value], dtype=dtype)
+          self.evaluate(array_ops.unravel_index(indices=indices, dims=dims))
+
 
 class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/kernel_tests/array_ops/broadcast_to_ops_test.py b/tensorflow/python/kernel_tests/array_ops/broadcast_to_ops_test.py
index 1d1a3b461728d0..44986875a0fb7d 100644
--- a/tensorflow/python/kernel_tests/array_ops/broadcast_to_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops/broadcast_to_ops_test.py
@@ -97,8 +97,8 @@ def testBroadcastToScalar(self):
   def testBroadcastScalarToNonScalar(self):
     with self.session():
       x = np.array(1.0, dtype=np.float64)
-      v_tf = array_ops.broadcast_to(constant_op.constant(1.0), [2, 3, 4,
-                                                                1, 1, 1])
+      v_tf = array_ops.broadcast_to(
+          constant_op.constant(1.0), [2, 3, 4, 1, 1, 1])
       v_np = np.broadcast_to(x, [2, 3, 4, 1, 1, 1])
       self.assertAllEqual(v_tf, v_np)
 
@@ -107,8 +107,7 @@ def testBroadcastToShapeTypeAndInference(self):
       with self.cached_session():
         x = np.array([1, 2, 3])
         v_tf = array_ops.broadcast_to(
-            constant_op.constant(x),
-            constant_op.constant([3, 3], dtype=dtype))
+            constant_op.constant(x), constant_op.constant([3, 3], dtype=dtype))
         shape = v_tf.get_shape().as_list()
         v_np = np.broadcast_to(x, [3, 3])
         self.assertAllEqual(v_tf, v_np)
@@ -137,8 +136,9 @@ def func(x):
     self.assertLess(err, 1e-4)
 
   def testGradientWithSameRank(self):
-    x = constant_op.constant(np.reshape(np.arange(6), (2, 1, 3)),
-                             dtype=dtypes.float32)
+    x = constant_op.constant(
+        np.reshape(np.arange(6), (2, 1, 3)), dtype=dtypes.float32)
+
     def func(x):
       v = array_ops.broadcast_to(x, [2, 5, 3])
       return 2 * v
@@ -150,8 +150,8 @@ def func(x):
     self.assertLess(err, 1e-4)
 
   def testGradientWithIncreasingRank(self):
-    x = constant_op.constant([[1], [2]],
-                             dtype=dtypes.float32)
+    x = constant_op.constant([[1], [2]], dtype=dtypes.float32)
+
     def func(x):
       v = array_ops.broadcast_to(x, [5, 2, 3])
       return 2 * v
@@ -164,6 +164,7 @@ def func(x):
 
   def testGradientWithBroadcastAllDimensions(self):
     x = constant_op.constant([1], dtype=dtypes.float32)
+
     def func(x):
       v = array_ops.broadcast_to(x, [5, 2, 3])
       return 2 * v
@@ -177,8 +178,9 @@ def func(x):
   def testGradientWithLargeDim(self):
     input_shape = [2, 1, 3, 2, 2, 2, 1, 1, 1]
     output_shape = [1, 1, 1, 2, 5, 3, 2, 2, 2, 3, 3, 3]
-    x = constant_op.constant(np.array(np.random.randn(*input_shape),
-                                      dtype=np.float32))
+    x = constant_op.constant(
+        np.array(np.random.randn(*input_shape), dtype=np.float32))
+
     def func(x):
       v = array_ops.broadcast_to(x, output_shape)
       return 2 * v
@@ -197,6 +199,13 @@ def testBroadcastToInvalidShape(self):
       v = array_ops.broadcast_to(constant_op.constant(x), output_shape)
       self.evaluate(v)
 
+  def testBroadcastToInvalidShapeForEmpty(self):
+    with self.assertRaisesIncompatibleShapesError(
+        (ValueError, errors.InvalidArgumentError)):
+      output_shape = [3, 0, 3]
+      x = constant_op.constant(value=[], shape=(3, 0, 5), dtype=np.int32)
+      v = array_ops.broadcast_to(x, output_shape)
+      self.evaluate(v)
 
 if __name__ == "__main__":
   test_lib.main()
diff --git a/tensorflow/python/kernel_tests/array_ops/denormal_test.py b/tensorflow/python/kernel_tests/array_ops/denormal_test.py
index 120df9bc238fa2..a7b2b07e213d9e 100644
--- a/tensorflow/python/kernel_tests/array_ops/denormal_test.py
+++ b/tensorflow/python/kernel_tests/array_ops/denormal_test.py
@@ -24,6 +24,7 @@
 from tensorflow.python.platform import test
 
 
+@test_util.with_eager_op_as_function
 class DenormalTest(test.TestCase):
 
   def testPythonHasDenormals(self):
@@ -36,7 +37,8 @@ def _flushDenormalsTest(self, dtypes):
     if (platform.machine() == "ppc64le" or platform.machine() == "s390x" or
         platform.machine() == "aarch64"):
       # Disabled denormal_test on power/s390x/aarch64 platform
-      # Check relevant discussion - https://github.com/tensorflow/tensorflow/issues/11902
+      # Check relevant discussion -
+      # https://github.com/tensorflow/tensorflow/issues/11902
       return
     for dtype in dtypes:
       tiny = np.finfo(dtype).tiny
diff --git a/tensorflow/python/kernel_tests/data_structures/map_stage_op_test.py b/tensorflow/python/kernel_tests/data_structures/map_stage_op_test.py
index 313b5244ee15c3..8600ad1f8d726b 100644
--- a/tensorflow/python/kernel_tests/data_structures/map_stage_op_test.py
+++ b/tensorflow/python/kernel_tests/data_structures/map_stage_op_test.py
@@ -12,8 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-from tensorflow.python.framework import errors
+import numpy as np
+
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -28,7 +31,7 @@ class MapStageTest(test.TestCase):
 
   @test_util.run_deprecated_v1
   def testSimple(self):
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32)
         pi = array_ops.placeholder(dtypes.int64)
@@ -40,9 +43,9 @@ def testSimple(self):
         k, y = stager.get(gi)
         y = math_ops.reduce_max(math_ops.matmul(y, y))
 
-    G.finalize()
+    g.finalize()
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 0})
       for i in range(10):
         _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i})
@@ -50,7 +53,7 @@ def testSimple(self):
 
   @test_util.run_deprecated_v1
   def testMultiple(self):
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32)
         pi = array_ops.placeholder(dtypes.int64)
@@ -62,9 +65,9 @@ def testMultiple(self):
         k, (z, y) = stager.get(gi)
         y = math_ops.reduce_max(z * math_ops.matmul(y, y))
 
-    G.finalize()
+    g.finalize()
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 0})
       for i in range(10):
         _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i})
@@ -73,26 +76,25 @@ def testMultiple(self):
 
   @test_util.run_deprecated_v1
   def testDictionary(self):
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32)
         pi = array_ops.placeholder(dtypes.int64)
         gi = array_ops.placeholder(dtypes.int64)
         v = 2. * (array_ops.zeros([128, 128]) + x)
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea(
-            [dtypes.float32, dtypes.float32],
-            shapes=[[], [128, 128]],
-            names=['x', 'v'])
+        stager = data_flow_ops.MapStagingArea([dtypes.float32, dtypes.float32],
+                                              shapes=[[], [128, 128]],
+                                              names=['x', 'v'])
         stage = stager.put(pi, {'x': x, 'v': v})
         key, ret = stager.get(gi)
         z = ret['x']
         y = ret['v']
         y = math_ops.reduce_max(z * math_ops.matmul(y, y))
 
-    G.finalize()
+    g.finalize()
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 0})
       for i in range(10):
         _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i})
@@ -102,7 +104,7 @@ def testDictionary(self):
   def testColocation(self):
     gpu_dev = test.gpu_device_name()
 
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32)
         v = 2. * (array_ops.zeros([128, 128]) + x)
@@ -119,58 +121,56 @@ def testColocation(self):
         self.assertEqual(y.device, '/device:CPU:0')
         self.assertEqual(z[0].device, '/device:CPU:0')
 
-    G.finalize()
+    g.finalize()
 
   @test_util.run_deprecated_v1
   def testPeek(self):
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.int32, name='x')
         pi = array_ops.placeholder(dtypes.int64)
         gi = array_ops.placeholder(dtypes.int64)
         p = array_ops.placeholder(dtypes.int32, name='p')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea(
-            [
-                dtypes.int32,
-            ], shapes=[[]])
+        stager = data_flow_ops.MapStagingArea([
+            dtypes.int32,
+        ], shapes=[[]])
         stage = stager.put(pi, [x], [0])
         peek = stager.peek(gi)
         size = stager.size()
 
-    G.finalize()
+    g.finalize()
 
     n = 10
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       for i in range(n):
         sess.run(stage, feed_dict={x: i, pi: i})
 
       for i in range(n):
-        self.assertTrue(sess.run(peek, feed_dict={gi: i})[0] == i)
+        self.assertEqual(sess.run(peek, feed_dict={gi: i})[0], i)
 
-      self.assertTrue(sess.run(size) == 10)
+      self.assertEqual(sess.run(size), 10)
 
   @test_util.run_deprecated_v1
   def testSizeAndClear(self):
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32, name='x')
         pi = array_ops.placeholder(dtypes.int64)
         gi = array_ops.placeholder(dtypes.int64)
         v = 2. * (array_ops.zeros([128, 128]) + x)
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea(
-            [dtypes.float32, dtypes.float32],
-            shapes=[[], [128, 128]],
-            names=['x', 'v'])
+        stager = data_flow_ops.MapStagingArea([dtypes.float32, dtypes.float32],
+                                              shapes=[[], [128, 128]],
+                                              names=['x', 'v'])
         stage = stager.put(pi, {'x': x, 'v': v})
         size = stager.size()
         clear = stager.clear()
 
-    G.finalize()
+    g.finalize()
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       sess.run(stage, feed_dict={x: -1, pi: 3})
       self.assertEqual(sess.run(size), 1)
       sess.run(stage, feed_dict={x: -1, pi: 1})
@@ -182,22 +182,23 @@ def testSizeAndClear(self):
   def testCapacity(self):
     capacity = 3
 
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.int32, name='x')
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea(
-            [
-                dtypes.int32,
-            ], capacity=capacity, shapes=[[]])
+        stager = data_flow_ops.MapStagingArea([
+            dtypes.int32,
+        ],
+                                              capacity=capacity,
+                                              shapes=[[]])
 
       stage = stager.put(pi, [x], [0])
       get = stager.get()
       size = stager.size()
 
-    G.finalize()
+    g.finalize()
 
     from six.moves import queue as Queue
     import threading
@@ -205,7 +206,7 @@ def testCapacity(self):
     queue = Queue.Queue()
     n = 8
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       # Stage data in a separate thread which will block
       # when it hits the staging area's capacity and thus
       # not fill the queue with n tokens
@@ -234,13 +235,13 @@ def thread_run():
                                              capacity))
 
       # Should have capacity elements in the staging area
-      self.assertTrue(sess.run(size) == capacity)
+      self.assertEqual(sess.run(size), capacity)
 
       # Clear the staging area completely
       for i in range(n):
         sess.run(get)
 
-      self.assertTrue(sess.run(size) == 0)
+      self.assertEqual(sess.run(size), 0)
 
   @test_util.run_deprecated_v1
   def testMemoryLimit(self):
@@ -248,28 +249,28 @@ def testMemoryLimit(self):
     chunk = 200 * 1024  # 256K
     capacity = memory_limit // chunk
 
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.uint8, name='x')
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea(
-            [dtypes.uint8], memory_limit=memory_limit, shapes=[[]])
+        stager = data_flow_ops.MapStagingArea([dtypes.uint8],
+                                              memory_limit=memory_limit,
+                                              shapes=[[]])
         stage = stager.put(pi, [x], [0])
         get = stager.get()
         size = stager.size()
 
-    G.finalize()
+    g.finalize()
 
     from six.moves import queue as Queue
     import threading
-    import numpy as np
 
     queue = Queue.Queue()
     n = 8
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       # Stage data in a separate thread which will block
       # when it hits the staging area's capacity and thus
       # not fill the queue with n tokens
@@ -299,56 +300,57 @@ def thread_run():
                                              capacity))
 
       # Should have capacity elements in the staging area
-      self.assertTrue(sess.run(size) == capacity)
+      self.assertEqual(sess.run(size), capacity)
 
       # Clear the staging area completely
       for i in range(n):
         sess.run(get)
 
-      self.assertTrue(sess.run(size) == 0)
+      self.assertEqual(sess.run(size), 0)
 
   @test_util.run_deprecated_v1
   def testOrdering(self):
     import six
     import random
 
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.int32, name='x')
         pi = array_ops.placeholder(dtypes.int64, name='pi')
         gi = array_ops.placeholder(dtypes.int64, name='gi')
       with ops.device(test.gpu_device_name()):
-        stager = data_flow_ops.MapStagingArea(
-            [
-                dtypes.int32,
-            ], shapes=[[]], ordered=True)
+        stager = data_flow_ops.MapStagingArea([
+            dtypes.int32,
+        ],
+                                              shapes=[[]],
+                                              ordered=True)
         stage = stager.put(pi, [x], [0])
         get = stager.get()
         size = stager.size()
 
-    G.finalize()
+    g.finalize()
 
     n = 10
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       # Keys n-1..0
       keys = list(reversed(six.moves.range(n)))
 
       for i in keys:
         sess.run(stage, feed_dict={pi: i, x: i})
 
-      self.assertTrue(sess.run(size) == n)
+      self.assertEqual(sess.run(size), n)
 
       # Check that key, values come out in ascending order
       for i, k in enumerate(reversed(keys)):
         get_key, values = sess.run(get)
         self.assertTrue(i == k == get_key == values)
 
-      self.assertTrue(sess.run(size) == 0)
+      self.assertEqual(sess.run(size), 0)
 
   @test_util.run_deprecated_v1
   def testPartialDictInsert(self):
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32)
         f = array_ops.placeholder(dtypes.float32)
@@ -366,41 +368,39 @@ def testPartialDictInsert(self):
         size = stager.size()
         isize = stager.incomplete_size()
 
-    G.finalize()
+    g.finalize()
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       # 0 complete and incomplete entries
-      self.assertTrue(sess.run([size, isize]) == [0, 0])
+      self.assertEqual(sess.run([size, isize]), [0, 0])
       # Stage key 0, x and f tuple entries
       sess.run(stage_xf, feed_dict={pi: 0, x: 1, f: 2})
-      self.assertTrue(sess.run([size, isize]) == [0, 1])
+      self.assertEqual(sess.run([size, isize]), [0, 1])
       # Stage key 1, x and f tuple entries
       sess.run(stage_xf, feed_dict={pi: 1, x: 1, f: 2})
-      self.assertTrue(sess.run([size, isize]) == [0, 2])
+      self.assertEqual(sess.run([size, isize]), [0, 2])
 
       # Now complete key 0 with tuple entry v
       sess.run(stage_v, feed_dict={pi: 0, v: 1})
       # 1 complete and 1 incomplete entry
-      self.assertTrue(sess.run([size, isize]) == [1, 1])
+      self.assertEqual(sess.run([size, isize]), [1, 1])
       # We can now obtain tuple associated with key 0
-      self.assertTrue(
-          sess.run([key, ret], feed_dict={
-              gi: 0
-          }) == [0, {
+      self.assertEqual(
+          sess.run([key, ret], feed_dict={gi: 0}),
+          [0, {
               'x': 1,
               'f': 2,
               'v': 1
           }])
 
       # 0 complete and 1 incomplete entry
-      self.assertTrue(sess.run([size, isize]) == [0, 1])
+      self.assertEqual(sess.run([size, isize]), [0, 1])
       # Now complete key 1 with tuple entry v
       sess.run(stage_v, feed_dict={pi: 1, v: 3})
       # We can now obtain tuple associated with key 1
-      self.assertTrue(
-          sess.run([key, ret], feed_dict={
-              gi: 1
-          }) == [1, {
+      self.assertEqual(
+          sess.run([key, ret], feed_dict={gi: 1}),
+          [1, {
               'x': 1,
               'f': 2,
               'v': 3
@@ -408,7 +408,7 @@ def testPartialDictInsert(self):
 
   @test_util.run_deprecated_v1
   def testPartialIndexInsert(self):
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32)
         f = array_ops.placeholder(dtypes.float32)
@@ -424,35 +424,35 @@ def testPartialIndexInsert(self):
         size = stager.size()
         isize = stager.incomplete_size()
 
-    G.finalize()
+    g.finalize()
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       # 0 complete and incomplete entries
-      self.assertTrue(sess.run([size, isize]) == [0, 0])
+      self.assertEqual(sess.run([size, isize]), [0, 0])
       # Stage key 0, x and f tuple entries
       sess.run(stage_xf, feed_dict={pi: 0, x: 1, f: 2})
-      self.assertTrue(sess.run([size, isize]) == [0, 1])
+      self.assertEqual(sess.run([size, isize]), [0, 1])
       # Stage key 1, x and f tuple entries
       sess.run(stage_xf, feed_dict={pi: 1, x: 1, f: 2})
-      self.assertTrue(sess.run([size, isize]) == [0, 2])
+      self.assertEqual(sess.run([size, isize]), [0, 2])
 
       # Now complete key 0 with tuple entry v
       sess.run(stage_v, feed_dict={pi: 0, v: 1})
       # 1 complete and 1 incomplete entry
-      self.assertTrue(sess.run([size, isize]) == [1, 1])
+      self.assertEqual(sess.run([size, isize]), [1, 1])
       # We can now obtain tuple associated with key 0
-      self.assertTrue(sess.run([key, ret], feed_dict={gi: 0}) == [0, [1, 1, 2]])
+      self.assertEqual(sess.run([key, ret], feed_dict={gi: 0}), [0, [1, 1, 2]])
 
       # 0 complete and 1 incomplete entry
-      self.assertTrue(sess.run([size, isize]) == [0, 1])
+      self.assertEqual(sess.run([size, isize]), [0, 1])
       # Now complete key 1 with tuple entry v
       sess.run(stage_v, feed_dict={pi: 1, v: 3})
       # We can now obtain tuple associated with key 1
-      self.assertTrue(sess.run([key, ret], feed_dict={gi: 1}) == [1, [1, 3, 2]])
+      self.assertEqual(sess.run([key, ret], feed_dict={gi: 1}), [1, [1, 3, 2]])
 
   @test_util.run_deprecated_v1
   def testPartialDictGetsAndPeeks(self):
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32)
         f = array_ops.placeholder(dtypes.float32)
@@ -476,40 +476,38 @@ def testPartialDictGetsAndPeeks(self):
         size = stager.size()
         isize = stager.incomplete_size()
 
-    G.finalize()
+    g.finalize()
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       # 0 complete and incomplete entries
-      self.assertTrue(sess.run([size, isize]) == [0, 0])
+      self.assertEqual(sess.run([size, isize]), [0, 0])
       # Stage key 0, x and f tuple entries
       sess.run(stage_xf, feed_dict={pi: 0, x: 1, f: 2})
-      self.assertTrue(sess.run([size, isize]) == [0, 1])
+      self.assertEqual(sess.run([size, isize]), [0, 1])
       # Stage key 1, x and f tuple entries
       sess.run(stage_xf, feed_dict={pi: 1, x: 1, f: 2})
-      self.assertTrue(sess.run([size, isize]) == [0, 2])
+      self.assertEqual(sess.run([size, isize]), [0, 2])
 
       # Now complete key 0 with tuple entry v
       sess.run(stage_v, feed_dict={pi: 0, v: 1})
       # 1 complete and 1 incomplete entry
-      self.assertTrue(sess.run([size, isize]) == [1, 1])
+      self.assertEqual(sess.run([size, isize]), [1, 1])
 
       # We can now peek at 'x' and 'f' values associated with key 0
-      self.assertTrue(sess.run(peek_xf, feed_dict={pei: 0}) == {'x': 1, 'f': 2})
+      self.assertEqual(sess.run(peek_xf, feed_dict={pei: 0}), {'x': 1, 'f': 2})
       # Peek at 'v' value associated with key 0
-      self.assertTrue(sess.run(peek_v, feed_dict={pei: 0}) == {'v': 1})
+      self.assertEqual(sess.run(peek_v, feed_dict={pei: 0}), {'v': 1})
       # 1 complete and 1 incomplete entry
-      self.assertTrue(sess.run([size, isize]) == [1, 1])
+      self.assertEqual(sess.run([size, isize]), [1, 1])
 
       # We can now obtain 'x' and 'f' values associated with key 0
-      self.assertTrue(
-          sess.run([key_xf, get_xf], feed_dict={
-              gi: 0
-          }) == [0, {
+      self.assertEqual(
+          sess.run([key_xf, get_xf], feed_dict={gi: 0}), [0, {
               'x': 1,
               'f': 2
           }])
       # Still have 1 complete and 1 incomplete entry
-      self.assertTrue(sess.run([size, isize]) == [1, 1])
+      self.assertEqual(sess.run([size, isize]), [1, 1])
 
       # We can no longer get 'x' and 'f' from key 0
       with self.assertRaises(errors.InvalidArgumentError) as cm:
@@ -517,40 +515,36 @@ def testPartialDictGetsAndPeeks(self):
 
       exc_str = ("Tensor at index '0' for key '0' " 'has already been removed.')
 
-      self.assertTrue(exc_str in cm.exception.message)
+      self.assertIn(exc_str, cm.exception.message)
 
       # Obtain 'v' value associated with key 0
-      self.assertTrue(
-          sess.run([key_v, get_v], feed_dict={
-              gi: 0
-          }) == [0, {
+      self.assertEqual(
+          sess.run([key_v, get_v], feed_dict={gi: 0}), [0, {
               'v': 1
           }])
       # 0 complete and 1 incomplete entry
-      self.assertTrue(sess.run([size, isize]) == [0, 1])
+      self.assertEqual(sess.run([size, isize]), [0, 1])
 
       # Now complete key 1 with tuple entry v
       sess.run(stage_v, feed_dict={pi: 1, v: 1})
       # 1 complete and 1 incomplete entry
-      self.assertTrue(sess.run([size, isize]) == [1, 0])
+      self.assertEqual(sess.run([size, isize]), [1, 0])
 
       # Pop without key to obtain 'x' and 'f' values associated with key 1
-      self.assertTrue(sess.run([pop_key_xf, pop_xf]) == [1, {'x': 1, 'f': 2}])
+      self.assertEqual(sess.run([pop_key_xf, pop_xf]), [1, {'x': 1, 'f': 2}])
       # still 1 complete and 1 incomplete entry
-      self.assertTrue(sess.run([size, isize]) == [1, 0])
+      self.assertEqual(sess.run([size, isize]), [1, 0])
       # We can now obtain 'x' and 'f' values associated with key 1
-      self.assertTrue(
-          sess.run([pop_key_v, pop_v], feed_dict={
-              pi: 1
-          }) == [1, {
+      self.assertEqual(
+          sess.run([pop_key_v, pop_v], feed_dict={pi: 1}), [1, {
               'v': 1
           }])
       # Nothing is left
-      self.assertTrue(sess.run([size, isize]) == [0, 0])
+      self.assertEqual(sess.run([size, isize]), [0, 0])
 
   @test_util.run_deprecated_v1
   def testPartialIndexGets(self):
-    with ops.Graph().as_default() as G:
+    with ops.Graph().as_default() as g:
       with ops.device('/cpu:0'):
         x = array_ops.placeholder(dtypes.float32)
         f = array_ops.placeholder(dtypes.float32)
@@ -568,28 +562,72 @@ def testPartialIndexGets(self):
         size = stager.size()
         isize = stager.incomplete_size()
 
-    G.finalize()
+    g.finalize()
 
-    with self.session(graph=G) as sess:
+    with self.session(graph=g) as sess:
       # Stage complete tuple
       sess.run(stage_xvf, feed_dict={pi: 0, x: 1, f: 2, v: 3})
 
-      self.assertTrue(sess.run([size, isize]) == [1, 0])
+      self.assertEqual(sess.run([size, isize]), [1, 0])
 
       # Partial get using indices
-      self.assertTrue(
-          sess.run([key_xf, get_xf], feed_dict={
-              gi: 0
-          }) == [0, [1, 2]])
+      self.assertEqual(
+          sess.run([key_xf, get_xf], feed_dict={gi: 0}), [0, [1, 2]])
 
       # Still some of key 0 left
-      self.assertTrue(sess.run([size, isize]) == [1, 0])
+      self.assertEqual(sess.run([size, isize]), [1, 0])
 
       # Partial get of remaining index
-      self.assertTrue(sess.run([key_v, get_v], feed_dict={gi: 0}) == [0, [3]])
+      self.assertEqual(sess.run([key_v, get_v], feed_dict={gi: 0}), [0, [3]])
 
       # All gone
-      self.assertTrue(sess.run([size, isize]) == [0, 0])
+      self.assertEqual(sess.run([size, isize]), [0, 0])
+
+  @test_util.run_deprecated_v1
+  def testNonScalarKeyOrderedMap(self):
+    with ops.Graph().as_default() as g:
+      x = array_ops.placeholder(dtypes.float32)
+      v = 2. * (array_ops.zeros([128, 128]) + x)
+      t = data_flow_ops.gen_data_flow_ops.ordered_map_stage(
+          key=constant_op.constant(value=[1], shape=(1, 3), dtype=dtypes.int64),
+          indices=np.array([[6]]),
+          values=[x, v],
+          dtypes=[dtypes.int64],
+          capacity=0,
+          memory_limit=0,
+          container='container1',
+          shared_name='',
+          name=None)
+
+    g.finalize()
+
+    with self.session(graph=g) as sess:
+      with self.assertRaisesRegex(errors.InvalidArgumentError,
+                                  'key must be an int64 scalar'):
+        sess.run(t, feed_dict={x: 1})
+
+  @test_util.run_deprecated_v1
+  def testNonScalarKeyUnorderedMap(self):
+    with ops.Graph().as_default() as g:
+      x = array_ops.placeholder(dtypes.float32)
+      v = 2. * (array_ops.zeros([128, 128]) + x)
+      t = data_flow_ops.gen_data_flow_ops.map_stage(
+          key=constant_op.constant(value=[1], shape=(1, 3), dtype=dtypes.int64),
+          indices=np.array([[6]]),
+          values=[x, v],
+          dtypes=[dtypes.int64],
+          capacity=0,
+          memory_limit=0,
+          container='container1',
+          shared_name='',
+          name=None)
+
+    g.finalize()
+
+    with self.session(graph=g) as sess:
+      with self.assertRaisesRegex(errors.InvalidArgumentError,
+                                  'key must be an int64 scalar'):
+        sess.run(t, feed_dict={x: 1})
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/kernel_tests/nn_ops/cudnn_deterministic_base.py b/tensorflow/python/kernel_tests/nn_ops/cudnn_deterministic_base.py
index 37fdec3379819a..561f73941bf7aa 100644
--- a/tensorflow/python/kernel_tests/nn_ops/cudnn_deterministic_base.py
+++ b/tensorflow/python/kernel_tests/nn_ops/cudnn_deterministic_base.py
@@ -26,10 +26,13 @@
 
 # Notes:
 #
-# Deterministic cuDNN operation is selected by setting either of the two
-# environment variables TF_CUDNN_DETERMINISTIC or TF_DETERMINISTIC_OPS to 'true'
-# or '1' while also not setting the environment variable TF_CUDNN_USE_AUTOTUNE
-# to 'false' or '0'.
+# TensorFlow makes cuDNN run deterministically when op determinism is enabled
+# via tf.config.experimental.enable_op_determinism(). Additionally, setting the
+# environmental variable TF_CUDNN_DETERMINISTIC to 'true' or '1' makes cuDNN run
+# deterministically, although this environemtnal variable is deprecated and will
+# be removed in a future TensorFlow version. Unlike the enable_op_determinism()
+# function, the environmental variable only makes ops using cuDNN deterministic,
+# not all TensorFlow ops.
 #
 # Where both deterministic and non-deterministic cuDNN algorithms are available,
 # selecting determinitic operation will lead to only the deterministic
@@ -60,6 +63,7 @@
 
 
 class ConvolutionTest(test.TestCase):
+  """Tests for deterministic cuDNN functionality."""
 
   def _random_data_op(self, shape):
     # np.random.random_sample can properly interpret either tf.TensorShape or
@@ -90,7 +94,6 @@ def _assert_reproducible(self, operation):
   # algorithms are determnistic.
   @test_util.run_cuda_only
   def testForward(self):
-    np.random.seed(3)
     in_shape = LayerShapeNCDHW(batch=2, channels=3, depth=5, height=7, width=6)
     filter_shape = FilterShape3D(
         depth=3, height=3, width=3, in_channels=3, out_channels=2)
@@ -110,7 +113,6 @@ def testForward(self):
 
   @test_util.run_cuda_only
   def testBackwardFilterGradient(self):
-    np.random.seed(1)
     in_shape = LayerShapeNHWC(batch=8, height=128, width=128, channels=8)
     filter_shape = FilterShape2D(
         height=3, width=3, in_channels=8, out_channels=8)
@@ -122,9 +124,23 @@ def testBackwardFilterGradient(self):
         in_op, filter_shape, out_op, strides=strides, padding=padding)
     self._assert_reproducible(filter_gradient_op)
 
+  @test_util.run_cuda_only
+  def testBackwardFilterGradientWithDilations(self):
+    in_shape = LayerShapeNHWC(batch=8, height=128, width=128, channels=8)
+    filter_shape = FilterShape2D(
+        height=3, width=3, in_channels=8, out_channels=8)
+    in_op = self._random_data_op(in_shape)
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    dilations = [1, 2, 2, 1]
+    out_op = self._random_out_op(in_shape, filter_shape, strides, padding)
+    filter_gradient_op = nn_ops.conv2d_backprop_filter(
+        in_op, filter_shape, out_op, strides=strides, padding=padding,
+        dilations=dilations)
+    self._assert_reproducible(filter_gradient_op)
+
   @test_util.run_cuda_only
   def testBackwardInputGradient(self):
-    np.random.seed(2)
     in_shape = LayerShapeNHWC(batch=8, height=32, width=32, channels=8)
     filter_shape = FilterShape2D(
         height=7, width=7, in_channels=8, out_channels=128)
@@ -135,3 +151,18 @@ def testBackwardInputGradient(self):
     input_gradient_op = nn_ops.conv2d_backprop_input(
         in_shape, filter_op, out_op, strides=strides, padding=padding)
     self._assert_reproducible(input_gradient_op)
+
+  @test_util.run_cuda_only
+  def testBackwardInputGradientWithDilations(self):
+    in_shape = LayerShapeNHWC(batch=8, height=32, width=32, channels=8)
+    filter_shape = FilterShape2D(
+        height=7, width=7, in_channels=8, out_channels=128)
+    filter_op = self._random_data_op(filter_shape)
+    strides = [1, 1, 1, 1]
+    padding = 'SAME'
+    dilations = [1, 2, 2, 1]
+    out_op = self._random_out_op(in_shape, filter_shape, strides, padding)
+    input_gradient_op = nn_ops.conv2d_backprop_input(
+        in_shape, filter_op, out_op, strides=strides, padding=padding,
+        dilations=dilations)
+    self._assert_reproducible(input_gradient_op)
diff --git a/tensorflow/python/kernel_tests/nn_ops/fractional_avg_pool_op_test.py b/tensorflow/python/kernel_tests/nn_ops/fractional_avg_pool_op_test.py
index ae75c424264622..7b153ae1ed7084 100644
--- a/tensorflow/python/kernel_tests/nn_ops/fractional_avg_pool_op_test.py
+++ b/tensorflow/python/kernel_tests/nn_ops/fractional_avg_pool_op_test.py
@@ -20,6 +20,7 @@
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
@@ -306,6 +307,32 @@ def testDifferentInputTensorShape(self):
           input_b, row_seq, col_seq, overlapping)
       self.assertSequenceEqual(expected.shape, actual.shape)
 
+  def testNegativeSeqValuesForGradOp(self):
+    with self.assertRaisesRegex(
+        errors.InvalidArgumentError,
+        r"Row sequence tensor values must not be negative.*"):
+      y = nn_ops.gen_nn_ops.fractional_avg_pool_grad(
+          orig_input_tensor_shape=[2, 2, 2, 2],
+          out_backprop=[[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11,
+                                                                      12]]]],
+          row_pooling_sequence=[-10, 1, 2, 3],
+          col_pooling_sequence=[1, 2, 3, 4],
+          overlapping=True)
+
+      self.evaluate(y)
+      with self.assertRaisesRegex(
+          errors.InvalidArgumentError,
+          r"Column sequence tensor values must not be negative.*"):
+        z = nn_ops.gen_nn_ops.fractional_avg_pool_grad(
+            orig_input_tensor_shape=[2, 2, 2, 2],
+            out_backprop=[[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11,
+                                                                        12]]]],
+            row_pooling_sequence=[10, 1, 2, 3],
+            col_pooling_sequence=[1, 2, -3, 4],
+            overlapping=True)
+
+        self.evaluate(z)
+
 
 class FractionalAvgPoolGradTest(test.TestCase):
   """Tests for FractionalAvgPoolGrad.
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index 43f42adb66ee95..84ecf46d748e48 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -155,6 +155,7 @@ def testSingleSessionGraphSeedNotConstant(self):
             graph_seed=965)
 
 
+@test_util.with_eager_op_as_function
 class TruncatedNormalTest(test.TestCase):
 
   def _Sampler(self, num, mu, sigma, dtype, use_gpu, seed=None):
@@ -224,7 +225,19 @@ def testStdDev(self):
       sampler = self._Sampler(100000, 0.0, stddev, dt, use_gpu=True)
       x = sampler()
       print("std(x)", np.std(x), abs(np.std(x) / stddev - 0.85))
-      self.assertTrue(abs(np.std(x) / stddev - 0.85) < 0.04)
+      self.assertLess(abs(np.std(x) / stddev - 0.85), 0.04)
+
+  def testSuccessAfterError(self):
+    # Force an error on the TruncatedNormal kernel.
+    config.enable_op_determinism()
+    with self.assertRaisesRegex(
+        errors.InvalidArgumentError,
+        "When determinism is enabled, random ops must have a seed specified"):
+      self.evaluate(gen_random_ops.truncated_normal((1,), dtypes.float32))
+    config.disable_op_determinism()
+
+    # Ensure the StdDev of the TruncatedNormal works as intended.
+    self.testStdDev()
 
   @test_util.run_deprecated_v1
   def testLargeShape(self):
@@ -255,6 +268,7 @@ def testEagerSeed(self):
       self.assertAllEqual(rnd1, rnd2)
 
 
+@test_util.with_eager_op_as_function
 @test_util.for_all_test_methods(test_util.disable_xla,
                                 "This never passed on XLA")
 class RandomUniformTest(RandomOpTestCommon):
diff --git a/tensorflow/python/kernel_tests/variables/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/variables/resource_variable_ops_test.py
index 2258beebb8b5ec..7e921130c5d5c1 100644
--- a/tensorflow/python/kernel_tests/variables/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/variables/resource_variable_ops_test.py
@@ -58,6 +58,17 @@
 from tensorflow.python.util import compat
 
 
+def _eager_safe_var_handle_op(*args, **kwargs):
+  # When running in eager mode the `shared_name` should be set to the
+  # `anonymous_name` to avoid spurious sharing issues. The runtime generates a
+  # unique name on our behalf when the reserved `anonymous_name` is used as the
+  # `shared_name`.
+  if context.executing_eagerly() and "shared_name" not in kwargs:
+    kwargs["shared_name"] = context.anonymous_name()
+  return resource_variable_ops.var_handle_op(*args, **kwargs)
+
+
+@test_util.with_eager_op_as_function
 @test_util.with_control_flow_v2
 class ResourceVariableOpsTest(test_util.TensorFlowTestCase,
                               parameterized.TestCase):
@@ -72,7 +83,7 @@ def tearDown(self):
   @test_util.run_deprecated_v1
   def testHandleDtypeShapeMatch(self):
     with self.cached_session():
-      handle = resource_variable_ops.var_handle_op(dtype=dtypes.int32, shape=[])
+      handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[])
       with self.assertRaises(ValueError):
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant(0.0, dtype=dtypes.float32)).run()
@@ -114,13 +125,17 @@ def testEagerNameNotNeeded(self):
 
   def testReadVariableDtypeMismatchEager(self):
     with context.eager_mode():
-      handle = resource_variable_ops.var_handle_op(
+      handle = _eager_safe_var_handle_op(
           dtype=dtypes.int32, shape=[1], name="foo")
       resource_variable_ops.assign_variable_op(handle, 1)
+      # The error message varies depending on whether it is being raised
+      # by the kernel or shape inference. The shape inference code path can
+      # be reached when running in eager op as function mode where each op
+      # is wrapped in a tf.function.
       with self.assertRaisesRegex(
           errors.InvalidArgumentError,
-          "Trying to read variable with wrong dtype. "
-          "Expected float got int32"):
+          r"Trying to read variable with wrong dtype. "
+          r"Expected (float|int32) got (int32|float)"):
         _ = resource_variable_ops.read_variable_op(handle, dtype=dtypes.float32)
 
   def testEagerInitializedValue(self):
@@ -201,7 +216,7 @@ def testDifferentAssignGraph(self):
   @test_util.run_deprecated_v1
   def testFetchHandle(self):
     with self.cached_session():
-      handle = resource_variable_ops.var_handle_op(
+      handle = _eager_safe_var_handle_op(
           dtype=dtypes.int32, shape=[1], name="foo")
       self.assertNotEmpty(self.evaluate(handle))
 
@@ -215,13 +230,17 @@ def testCachedValueReadBeforeWrite(self):
 
   def testAssignVariableDtypeMismatchEager(self):
     with context.eager_mode():
-      handle = resource_variable_ops.var_handle_op(
+      handle = _eager_safe_var_handle_op(
           dtype=dtypes.int32, shape=[1], name="foo")
       resource_variable_ops.assign_variable_op(
           handle, constant_op.constant([1]))
+      # The error message varies depending on whether it is being raised
+      # by the kernel or shape inference. The shape inference code path can
+      # be reached when running in eager op as function mode where each op
+      # is wrapped in a tf.function.
       with self.assertRaisesRegex(
-          errors.InvalidArgumentError, "Trying to assign variable with wrong "
-          "dtype. Expected int32 got float"):
+          errors.InvalidArgumentError, r"Trying to .* variable with wrong "
+          r"dtype. Expected int32 got float"):
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([1.], dtype=dtypes.float32))
 
@@ -247,14 +266,14 @@ def broken_read():
 
   def testFormatResourceHandle(self):
     with context.eager_mode():
-      handle = resource_variable_ops.var_handle_op(
+      handle = _eager_safe_var_handle_op(
           dtype=dtypes.int32, shape=[1], name="foo")
       self.assertIn("<Resource Tensor>", str(handle))
       self.assertIn("<Resource Tensor>", repr(handle))
 
   @test_util.run_in_graph_and_eager_modes
   def testDtypeSurvivesIdentity(self):
-    handle = resource_variable_ops.var_handle_op(dtype=dtypes.int32, shape=[])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[])
     id_handle = array_ops.identity(handle)
     self.evaluate(resource_variable_ops.assign_variable_op(
         id_handle, constant_op.constant(0, dtype=dtypes.int32)))
@@ -265,7 +284,7 @@ def testUnreadOpName(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testCreateRead(self):
-    handle = resource_variable_ops.var_handle_op(dtype=dtypes.int32, shape=[])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[])
     self.evaluate(resource_variable_ops.assign_variable_op(
         handle, constant_op.constant(1, dtype=dtypes.int32)))
     value = self.evaluate(
@@ -274,7 +293,7 @@ def testCreateRead(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testManyAssigns(self):
-    handle = resource_variable_ops.var_handle_op(dtype=dtypes.int32, shape=[])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[])
     create = resource_variable_ops.assign_variable_op(
         handle, constant_op.constant(1, dtype=dtypes.int32))
     with ops.control_dependencies([create]):
@@ -292,7 +311,7 @@ def testManyAssigns(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testAssignAdd(self):
-    handle = resource_variable_ops.var_handle_op(dtype=dtypes.int32, shape=[])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[])
     self.evaluate(resource_variable_ops.assign_variable_op(
         handle, constant_op.constant(1, dtype=dtypes.int32)))
     self.evaluate(resource_variable_ops.assign_add_variable_op(
@@ -303,8 +322,7 @@ def testAssignAdd(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterAdd(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([[1]], dtype=dtypes.int32)))
@@ -384,8 +402,7 @@ def testGradientGatherNdIndexedSlices(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterSub(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([[1]], dtype=dtypes.int32)))
@@ -397,8 +414,7 @@ def testScatterSub(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterMul(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([[1]], dtype=dtypes.int32)))
@@ -429,8 +445,7 @@ def testEagerPickle(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterDiv(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([[6]], dtype=dtypes.int32)))
@@ -452,8 +467,7 @@ def testEagerNoUseResource(self):
   @test_util.run_in_graph_and_eager_modes
   def testScatterMin(self):
     with ops.device("cpu:0"):
-      handle = resource_variable_ops.var_handle_op(
-          dtype=dtypes.int32, shape=[1, 1])
+      handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
       self.evaluate(
           resource_variable_ops.assign_variable_op(handle,
                                                    constant_op.constant(
@@ -488,8 +502,7 @@ def testMetagraph(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterMax(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([[6]], dtype=dtypes.int32)))
@@ -501,8 +514,7 @@ def testScatterMax(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterAddScalar(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([[1]], dtype=dtypes.int32)))
@@ -514,8 +526,7 @@ def testScatterAddScalar(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterSubScalar(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([[1]], dtype=dtypes.int32)))
@@ -527,8 +538,7 @@ def testScatterSubScalar(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterMulScalar(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([[1]], dtype=dtypes.int32)))
@@ -540,8 +550,7 @@ def testScatterMulScalar(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterDivScalar(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([[6]], dtype=dtypes.int32)))
@@ -553,8 +562,7 @@ def testScatterDivScalar(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterMinScalar(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([[6]], dtype=dtypes.int32)))
@@ -566,8 +574,7 @@ def testScatterMinScalar(self):
 
   @test_util.run_in_graph_and_eager_modes
   def testScatterMaxScalar(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.int32, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(
             handle, constant_op.constant([[6]], dtype=dtypes.int32)))
@@ -690,8 +697,7 @@ def testScatterUpdateVariableMethod(self, dtype):
 
   @test_util.run_deprecated_v1
   def testScatterUpdateString(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.string, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.string, shape=[1, 1])
     self.evaluate(resource_variable_ops.assign_variable_op(
         handle, constant_op.constant([["a"]], dtype=dtypes.string)))
     self.evaluate(resource_variable_ops.resource_scatter_update(
@@ -702,8 +708,7 @@ def testScatterUpdateString(self):
 
   @test_util.run_deprecated_v1
   def testScatterUpdateStringScalar(self):
-    handle = resource_variable_ops.var_handle_op(
-        dtype=dtypes.string, shape=[1, 1])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.string, shape=[1, 1])
     self.evaluate(
         resource_variable_ops.assign_variable_op(handle,
                                                  constant_op.constant(
@@ -1016,7 +1021,7 @@ def testDestroyResource(self):
       with self.assertRaises(errors.FailedPreconditionError):
         self.evaluate(v.value())
     # Handle to a resource not actually created.
-    handle = resource_variable_ops.var_handle_op(dtype=dtypes.int32, shape=[])
+    handle = _eager_safe_var_handle_op(dtype=dtypes.int32, shape=[])
     # Should raise no exception
     self.evaluate(resource_variable_ops.destroy_resource_op(
         handle, ignore_lookup_error=True))
@@ -1136,15 +1141,19 @@ def testSharedName(self):
       v = resource_variable_ops.ResourceVariable(300.0, name="var4")
       self.evaluate(variables.global_variables_initializer())
 
-      w = resource_variable_ops.var_handle_op(
-          dtype=v.dtype.base_dtype, shape=v.get_shape(), shared_name="var4",
+      w = _eager_safe_var_handle_op(
+          dtype=v.dtype.base_dtype,
+          shape=v.get_shape(),
+          shared_name="var4",
           # Needed in Eager since we get a unique container name by default.
           container=ops.get_default_graph()._container)
       w_read = resource_variable_ops.read_variable_op(w, v.dtype.base_dtype)
       self.assertEqual(300.0, self.evaluate(w_read))
 
-      x = resource_variable_ops.var_handle_op(
-          dtype=v.dtype.base_dtype, shape=v.get_shape(), shared_name="var5",
+      x = _eager_safe_var_handle_op(
+          dtype=v.dtype.base_dtype,
+          shape=v.get_shape(),
+          shared_name="var5",
           container=ops.get_default_graph()._container)
       with self.assertRaisesOpError(
           "(Resource .*/var5/.* does not exist|uninitialized)"):
@@ -1159,8 +1168,10 @@ def testSharedNameWithNamescope(self):
         self.assertEqual("foo/var6:0", v.name)
         self.evaluate(variables.global_variables_initializer())
 
-      w = resource_variable_ops.var_handle_op(
-          dtype=v.dtype.base_dtype, shape=v.get_shape(), shared_name="foo/var6",
+      w = _eager_safe_var_handle_op(
+          dtype=v.dtype.base_dtype,
+          shape=v.get_shape(),
+          shared_name="foo/var6",
           # Needed in Eager since we get a unique container name by default.
           container=ops.get_default_graph()._container)
       w_read = resource_variable_ops.read_variable_op(w, v.dtype.base_dtype)
@@ -1366,11 +1377,14 @@ def testScatterUpdateCast(self):
   def testScatterUpdateInvalidArgs(self):
     v = resource_variable_ops.ResourceVariable([0, 1, 2, 3], name="update")
     # The exact error and message differ between graph construction (where the
-    # error is realized during shape inference at graph construction time) and
-    # eager execution (where the error is realized during kernel execution).
-    with self.assertRaisesRegex(Exception, r"shape.*2.*3"):
+    # error is realized during shape inference at graph construction time),
+    # eager execution (where the error is realized during kernel execution),
+    # and XLA auto-clustering execution (where the error is realized in the xla
+    # op kernel) which is triggered when running in eager op as function mode.
+    with self.assertRaisesRegex(Exception, r"shape.*2.*3|RET_CHECK failure"):
       state_ops.scatter_update(v, [0, 1], [0, 1, 2])
 
+  @test_util.disable_xla("b/208334252")  # XLA doesn't have a deterministic impl
   def testScatterAddDeterministic(self):
     with context.eager_mode(), test_util.deterministic_ops():
       # Normally a nondeterministic codepath occurs when the variable has at
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index cb4333eb713e91..111b54b3f96ef6 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -2085,7 +2085,7 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
 
   See also `tf.unstack`.
 
-  If `num_or_size_splits` is an integer,  then `value` is split along the
+  If `num_or_size_splits` is an `int`,  then it splits `value` along the
   dimension `axis` into `num_or_size_splits` smaller tensors. This requires that
   `value.shape[axis]` is divisible by `num_or_size_splits`.
 
@@ -2114,25 +2114,27 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
 
   Args:
     value: The `Tensor` to split.
-    num_or_size_splits: Either an integer indicating the number of splits along
-      `axis` or a 1-D integer `Tensor` or Python list containing the sizes of
-      each output tensor along `axis`. If a scalar, then it must evenly divide
-      `value.shape[axis]`; otherwise the sum of sizes along the split axis
-      must match that of the `value`.
-    axis: An integer or scalar `int32` `Tensor`. The dimension along which to
-      split. Must be in the range `[-rank(value), rank(value))`. Defaults to 0.
-    num: Optional, used to specify the number of outputs when it cannot be
-      inferred from the shape of `size_splits`.
+    num_or_size_splits: Either an `int` indicating the number of splits
+      along `axis` or a 1-D integer `Tensor` or Python list containing the sizes
+      of each output tensor along `axis`. If an `int`, then it must evenly
+      divide `value.shape[axis]`; otherwise the sum of sizes along the split
+      axis must match that of the `value`.
+    axis: An `int` or scalar `int32` `Tensor`. The dimension along which
+      to split. Must be in the range `[-rank(value), rank(value))`. Defaults to
+      0.
+    num: Optional, an `int`, used to specify the number of outputs when it
+      cannot be inferred from the shape of `size_splits`.
     name: A name for the operation (optional).
 
   Returns:
-    if `num_or_size_splits` is a scalar returns a list of `num_or_size_splits`
-    `Tensor` objects; if `num_or_size_splits` is a 1-D Tensor returns
-    `num_or_size_splits.get_shape[0]` `Tensor` objects resulting from splitting
-    `value`.
+    if `num_or_size_splits` is an `int` returns a list of
+    `num_or_size_splits` `Tensor` objects; if `num_or_size_splits` is a 1-D
+    list or 1-D `Tensor` returns `num_or_size_splits.get_shape[0]`
+    `Tensor` objects resulting from splitting `value`.
 
   Raises:
     ValueError: If `num` is unspecified and cannot be inferred.
+    ValueError: If `num_or_size_splits` is a scalar `Tensor`.
   """
   if isinstance(num_or_size_splits,
                 (numbers.Integral, tensor_shape.Dimension)):
@@ -2152,7 +2154,7 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
       num = size_splits_shape[0]
     if num is None:
       raise ValueError(
-          "Cannot infer argument `num` from shape {num_or_size_splits}")
+          f"Cannot infer argument `num` from shape {num_or_size_splits}")
 
   return gen_array_ops.split_v(
       value=value, size_splits=size_splits, axis=axis, num_split=num, name=name)
@@ -4676,79 +4678,152 @@ def where(condition, x=None, y=None, name=None):
 @tf_export("where", v1=["where_v2"])
 @dispatch.add_dispatch_support
 def where_v2(condition, x=None, y=None, name=None):
-  """Return the elements where `condition` is `True` (multiplexing `x` and `y`).
+  """Returns the indices of non-zero elements, or multiplexes `x` and `y`.
 
-  This operator has two modes: in one mode both `x` and `y` are provided, in
-  another mode neither are provided. `condition` is always expected to be a
-  `tf.Tensor` of type `bool`.
+  This operation has two modes:
 
-  #### Retrieving indices of `True` elements
+  1. **Return the indices of non-zero elements** - When only
+     `condition` is provided the result is an `int64` tensor where each row is
+     the index of a non-zero element of `condition`. The result's shape
+     is `[tf.math.count_nonzero(condition), tf.rank(condition)]`.
+  2. **Multiplex `x` and `y`** - When both `x` and `y` are provided the
+     result has the shape of `x`, `y`, and `condition` broadcast together. The
+     result is taken from `x` where `condition` is non-zero
+     or `y` where `condition` is zero.
+
+  #### 1. Return the indices of non-zero elements
+
+  Note: In this mode `condition` can have a dtype of `bool` or any numeric
+  dtype.
 
   If `x` and `y` are not provided (both are None):
 
-  `tf.where` will return the indices of `condition` that are `True`, in
-  the form of a 2-D tensor with shape (n, d).
-  (Where n is the number of matching indices in `condition`,
-  and d is the number of dimensions in `condition`).
+  `tf.where` will return the indices of `condition` that are non-zero,
+  in the form of a 2-D tensor with shape `[n, d]`, where `n` is the number of
+  non-zero elements in `condition` (`tf.count_nonzero(condition)`), and `d` is
+  the number of axes of `condition` (`tf.rank(condition)`).
+
+  Indices are output in row-major order. The `condition` can have a `dtype` of
+  `tf.bool`, or any numeric `dtype`.
 
-  Indices are output in row-major order.
+  Here `condition` is a 1-axis `bool` tensor with 2 `True` values. The result
+  has a shape of `[2,1]`
 
-  >>> tf.where([True, False, False, True])
-  <tf.Tensor: shape=(2, 1), dtype=int64, numpy=
+  >>> tf.where([True, False, False, True]).numpy()
   array([[0],
-         [3]])>
+         [3]])
 
-  >>> tf.where([[True, False], [False, True]])
-  <tf.Tensor: shape=(2, 2), dtype=int64, numpy=
+  Here `condition` is a 2-axis integer tensor, with 3 non-zero values. The
+  result has a shape of `[3, 2]`.
+
+  >>> tf.where([[1, 0, 0], [1, 0, 1]]).numpy()
   array([[0, 0],
-         [1, 1]])>
+         [1, 0],
+         [1, 2]])
+
+  Here `condition` is a 3-axis float tensor, with 5 non-zero values. The output
+  shape is `[5, 3]`.
 
-  >>> tf.where([[[True, False], [False, True], [True, True]]])
-  <tf.Tensor: shape=(4, 3), dtype=int64, numpy=
+  >>> float_tensor = [[[0.1, 0], [0, 2.2], [3.5, 1e6]],
+  ...                 [[0,   0], [0,   0], [99,    0]]]
+  >>> tf.where(float_tensor).numpy()
   array([[0, 0, 0],
          [0, 1, 1],
          [0, 2, 0],
-         [0, 2, 1]])>
+         [0, 2, 1],
+         [1, 2, 0]])
 
-  #### Multiplexing between `x` and `y`
+  These indices are the same that `tf.sparse.SparseTensor` would use to
+  represent the condition tensor:
 
-  If `x` and `y` are provided (both have non-None values):
+  >>> sparse = tf.sparse.from_dense(float_tensor)
+  >>> sparse.indices.numpy()
+  array([[0, 0, 0],
+         [0, 1, 1],
+         [0, 2, 0],
+         [0, 2, 1],
+         [1, 2, 0]])
 
-  `tf.where` will choose an output shape from the shapes of `condition`, `x`,
-  and `y` that all three shapes are
-  [broadcastable](https://docs.scipy.org/doc/numpy/reference/ufuncs.html) to.
+  A complex number is considered non-zero if either the real or imaginary
+  component is non-zero:
 
-  The `condition` tensor acts as a mask that chooses whether the corresponding
-  element / row in the output should be taken from `x`
-  (if the element in `condition` is True) or `y` (if it is false).
+  >>> tf.where([complex(0.), complex(1.), 0+1j, 1+1j]).numpy()
+  array([[1],
+         [2],
+         [3]])
 
-  >>> tf.where([True, False, False, True], [1,2,3,4], [100,200,300,400])
-  <tf.Tensor: shape=(4,), dtype=int32, numpy=array([  1, 200, 300,   4],
-  dtype=int32)>
-  >>> tf.where([True, False, False, True], [1,2,3,4], [100])
-  <tf.Tensor: shape=(4,), dtype=int32, numpy=array([  1, 100, 100,   4],
-  dtype=int32)>
-  >>> tf.where([True, False, False, True], [1,2,3,4], 100)
-  <tf.Tensor: shape=(4,), dtype=int32, numpy=array([  1, 100, 100,   4],
-  dtype=int32)>
-  >>> tf.where([True, False, False, True], 1, 100)
-  <tf.Tensor: shape=(4,), dtype=int32, numpy=array([  1, 100, 100,   1],
-  dtype=int32)>
+  #### 2. Multiplex `x` and `y`
 
-  >>> tf.where(True, [1,2,3,4], 100)
-  <tf.Tensor: shape=(4,), dtype=int32, numpy=array([1, 2, 3, 4],
-  dtype=int32)>
-  >>> tf.where(False, [1,2,3,4], 100)
-  <tf.Tensor: shape=(4,), dtype=int32, numpy=array([100, 100, 100, 100],
-  dtype=int32)>
+  Note: In this mode `condition` must have a dtype of `bool`.
 
-  Note that if the gradient of either branch of the tf.where generates
-  a NaN, then the gradient of the entire tf.where will be NaN. This is because
-  the gradient calculation for tf.where combines the two branches, for
+  If `x` and `y` are also provided (both have non-None values) the `condition`
+  tensor acts as a mask that chooses whether the corresponding
+  element / row in the output should be taken from `x` (if the element in
+  `condition` is `True`) or `y` (if it is `False`).
+
+  The shape of the result is formed by
+  [broadcasting](https://docs.scipy.org/doc/numpy/reference/ufuncs.html)
+  together the shapes of `condition`, `x`, and `y`.
+
+  When all three inputs have the same size, each is handled element-wise.
+
+  >>> tf.where([True, False, False, True],
+  ...          [1, 2, 3, 4],
+  ...          [100, 200, 300, 400]).numpy()
+  array([  1, 200, 300,   4], dtype=int32)
+
+  There are two main rules for broadcasting:
+
+  1. If a tensor has fewer axes than the others, length-1 axes are added to the
+     left of the shape.
+  2. Axes with length-1 are streched to match the coresponding axes of the other
+     tensors.
+
+  A length-1 vector is streched to match the other vectors:
+
+  >>> tf.where([True, False, False, True], [1, 2, 3, 4], [100]).numpy()
+  array([  1, 100, 100,   4], dtype=int32)
+
+  A scalar is expanded to match the other arguments:
+
+  >>> tf.where([[True, False], [False, True]], [[1, 2], [3, 4]], 100).numpy()
+  array([[  1, 100], [100,   4]], dtype=int32)
+  >>> tf.where([[True, False], [False, True]], 1, 100).numpy()
+  array([[  1, 100], [100,   1]], dtype=int32)
+
+  A scalar `condition` returns the complete `x` or `y` tensor, with
+  broadcasting applied.
+
+  >>> tf.where(True, [1, 2, 3, 4], 100).numpy()
+  array([1, 2, 3, 4], dtype=int32)
+  >>> tf.where(False, [1, 2, 3, 4], 100).numpy()
+  array([100, 100, 100, 100], dtype=int32)
+
+  For a non-trivial example of broadcasting, here `condition` has a shape of
+  `[3]`, `x` has a shape of `[3,3]`, and `y` has a shape of `[3,1]`.
+  Broadcasting first expands the shape of `condition` to `[1,3]`. The final
+  broadcast shape is `[3,3]`. `condition` will select columns from `x` and `y`.
+  Since `y` only has one column, all columns from `y` will be identical.
+
+  >>> tf.where([True, False, True],
+  ...          x=[[1, 2, 3],
+  ...             [4, 5, 6],
+  ...             [7, 8, 9]],
+  ...          y=[[100],
+  ...             [200],
+  ...             [300]]
+  ... ).numpy()
+  array([[ 1, 100, 3],
+         [ 4, 200, 6],
+         [ 7, 300, 9]], dtype=int32)
+
+  Note that if the gradient of either branch of the `tf.where` generates
+  a `NaN`, then the gradient of the entire `tf.where` will be `NaN`. This is
+  because the gradient calculation for `tf.where` combines the two branches, for
   performance reasons.
 
-  A workaround is to use an inner tf.where to ensure the function has
-  no asymptote, and to avoid computing a value whose gradient is NaN by
+  A workaround is to use an inner `tf.where` to ensure the function has
+  no asymptote, and to avoid computing a value whose gradient is `NaN` by
   replacing dangerous inputs with safe inputs.
 
   Instead of this,
@@ -4760,8 +4835,8 @@ def where_v2(condition, x=None, y=None, name=None):
   >>> print(tape.gradient(y, x))
   tf.Tensor(nan, shape=(), dtype=float32)
 
-  Although, the `1. / x` values are never used, its gradient is a NaN when x =
-  0. Instead, we should guard that with another `tf.where`
+  Although, the `1. / x` values are never used, its gradient is a `NaN` when
+  `x = 0`. Instead, we should guard that with another `tf.where`
 
   >>> x = tf.constant(0., dtype=tf.float32)
   >>> with tf.GradientTape() as tape:
@@ -4771,8 +4846,19 @@ def where_v2(condition, x=None, y=None, name=None):
   >>> print(tape.gradient(y, x))
   tf.Tensor(0.0, shape=(), dtype=float32)
 
+  See also:
+
+  * `tf.sparse` - The indices returned by the first form of `tf.where` can be
+     useful in `tf.sparse.SparseTensor` objects.
+  * `tf.gather_nd`, `tf.scatter_nd`, and related ops - Given the
+    list of indices returned from `tf.where` the `scatter` and `gather` family
+    of ops can be used fetch values or insert values at those indices.
+  * `tf.strings.length` - `tf.string` is not an allowed dtype for the
+    `condition`. Use the string length instead.
+
   Args:
-    condition: A `tf.Tensor` of type `bool`
+    condition: A `tf.Tensor` of dtype bool, or any numeric dtype. `condition`
+      must have dtype `bool` when `x` and `y` are provided.
     x: If provided, a Tensor which is of the same type as `y`, and has a shape
       broadcastable with `condition` and `y`.
     y: If provided, a Tensor which is of the same type as `x`, and has a shape
@@ -4783,7 +4869,8 @@ def where_v2(condition, x=None, y=None, name=None):
     If `x` and `y` are provided:
       A `Tensor` with the same type as `x` and `y`, and shape that
       is broadcast from `condition`, `x`, and `y`.
-    Otherwise, a `Tensor` with shape `(num_true, dim_size(condition))`.
+    Otherwise, a `Tensor` with shape `[tf.math.count_nonzero(condition),
+    tf.rank(condition)]`.
 
   Raises:
     ValueError: When exactly one of `x` or `y` is non-None, or the shapes
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index a287088b8522c9..05bf530452ec3b 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -40,6 +40,7 @@
 from tensorflow.python.training.saver import BaseSaverBuilder
 # pylint: enable=wildcard-import
 from tensorflow.python.training.tracking import base as trackable_base
+from tensorflow.python.training.tracking import resource
 from tensorflow.python.training.tracking import tracking as trackable
 from tensorflow.python.util import compat as compat_util
 from tensorflow.python.util.deprecation import deprecated
@@ -125,7 +126,7 @@ def check_table_dtypes(table, key_dtype, value_dtype):
                     f"{table.value_dtype} but got {value_dtype}.")
 
 
-class LookupInterface(trackable.TrackableResource):
+class LookupInterface(resource.TrackableResource):
   """Represent a lookup table that persists across different steps."""
 
   def __init__(self, key_dtype, value_dtype):
@@ -396,22 +397,15 @@ def _add_trackable_child(self, name, value):
       self._track_trackable(value, name)  # pylint:disable=protected-access
 
   @classmethod
-  def _deserialize_from_proto(cls, proto, **unused_kwargs):
+  def _deserialize_from_proto(cls, **kwargs):
 
-    from tensorflow.python.saved_model import load  # pylint: disable=g-import-not-at-top
-
-    class _RestoredStaticHashTable(load._RestoredResource):  # pylint: disable=protected-access
+    class _RestoredStaticHashTable(resource.RestoredResource):  # pylint: disable=protected-access
 
       @classmethod
       def _resource_type(cls):
         return "RestoredStaticHashTable"
 
-      def _add_trackable_child(self, name, value):
-        setattr(self, name, value)
-        if isinstance(value, trackable_base.Trackable):
-          self._track_trackable(value, name)  # pylint:disable=protected-access
-
-    return _RestoredStaticHashTable()
+    return _RestoredStaticHashTable._deserialize_from_proto(**kwargs)  # pylint: disable=protected-access
 
 
 @tf_export(v1=["lookup.StaticHashTable"])
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 4d9fe3102223ca..d9a0c791d79220 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -351,6 +351,91 @@ def mean(values,
       or if either `metrics_collections` or `updates_collections` are not a list
       or tuple.
     RuntimeError: If eager execution is enabled.
+
+  @compatibility(TF2)
+  `tf.compat.v1.metrics.mean` is not compatible with eager
+  execution or `tf.function`.
+  Please use `tf.keras.metrics.Mean` instead for TF2 migration. After
+  instantiating a `tf.keras.metrics.Mean` object, you can first call the
+  `update_state()` method to record the new values, and then call the
+  `result()` method to get the mean eagerly. You can also attach it to a
+  Keras model with the `add_metric` method.  Please refer to the [migration
+  guide](https://www.tensorflow.org/guide/migrate#new-style_metrics_and_losses)
+  for more details.
+
+  #### Structural Mapping to TF2
+
+  Before:
+
+  ```python
+  mean, update_op = tf.compat.v1.metrics.mean(
+    values=values,
+    weights=weights,
+    metrics_collections=metrics_collections,
+    update_collections=update_collections,
+    name=name)
+  ```
+
+  After:
+
+  ```python
+   m = tf.keras.metrics.Mean(
+     name=name)
+
+   m.update_state(
+     values=values,
+     sample_weight=weights)
+
+   mean = m.result()
+  ```
+
+  #### How to Map Arguments
+
+  | TF1 Arg Name          | TF2 Arg Name    | Note                       |
+  | :-------------------- | :-------------- | :------------------------- |
+  | `values`              | `values`        | In `update_state()` method |
+  | `weights`             | `sample_weight` | In `update_state()` method |
+  | `metrics_collections` | Not supported   | Metrics should be tracked  |
+  :                       :                 : explicitly or with Keras   :
+  :                       :                 : APIs, for example,         :
+  :                       :                 : [add_metric][add_metric],  :
+  :                       :                 : instead of via collections :
+  | `updates_collections` | Not supported   | -                          |
+  | `name`                | `name`          | In constructor             |
+
+  [add_metric]:https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer#add_metric
+
+
+  #### Before & After Usage Example
+
+  Before:
+
+  >>> g = tf.Graph()
+  >>> with g.as_default():
+  ...   values = [1, 2, 3]
+  ...   mean, update_op = tf.compat.v1.metrics.mean(values)
+  ...   global_init = tf.compat.v1.global_variables_initializer()
+  ...   local_init = tf.compat.v1.local_variables_initializer()
+  >>> sess = tf.compat.v1.Session(graph=g)
+  >>> sess.run([global_init, local_init])
+  >>> sess.run(update_op)
+  >>> sess.run(mean)
+  2.0
+
+
+  After:
+
+  >>> m = tf.keras.metrics.Mean()
+  >>> m.update_state([1, 2, 3])
+  >>> m.result().numpy()
+  2.0
+
+  ```python
+  # Used within Keras model
+  model.add_metric(tf.keras.metrics.Mean()(values))
+  ```
+
+  @end_compatibility
   """
   if context.executing_eagerly():
     raise RuntimeError('tf.metrics.mean is not supported when eager execution '
@@ -495,7 +580,7 @@ def accuracy(labels,
   | `updates_collections` | Not supported   | -                          |
   | `name`                | `name`          | In constructor             |
 
-  [add_metric]:https//www.tensorflow.org/api_docs/python/tf/keras/layers/Layer#add_metric
+  [add_metric]:https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer#add_metric
 
 
   #### Before & After Usage Example
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_deterministic_test.py b/tensorflow/python/ops/nn_fused_batchnorm_deterministic_test.py
index 9e063b07edf658..4b0305b0003b6b 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_deterministic_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_deterministic_test.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Functional tests for fused batch-norm related to determinism."""
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.eager import backprop
@@ -29,7 +30,8 @@
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class FusedBatchNormalizationDeterministicTest(test.TestCase):
+class FusedBatchNormalizationDeterministicTest(test.TestCase,
+                                               parameterized.TestCase):
   """Test determinsitic functionality and exceptions for FusedBatchNorm.
 
   Test that tf.errors.UnimplementedError is thrown, as
@@ -71,15 +73,30 @@ def _genParams(self, data_format, x_dtype, large_batch):
         np.random.normal(size=y_shape), dtype=y_dtype)
     return x, scale, offset, mean, variance, upstream_gradients
 
-  def testForward(self):
+  @parameterized.parameters('NHWC', 'NCHW')
+  def testForward(self, data_format):
     with self.cached_session():
-      for data_format in ['NHWC', 'NCHW']:
-        for large_batch in [False, True]:
-          for x_dtype in [dtypes.float16, dtypes.float32]:  # skipping bfloat16
-            x, scale, offset, mean, variance, _ = self._genParams(
-                data_format, x_dtype, large_batch)
-            for is_training in [False, True]:
-              op_output = nn_impl.fused_batch_norm(
+      for large_batch in [False, True]:
+        for x_dtype in [dtypes.float16, dtypes.float32]:  # skipping bfloat16
+          x, scale, offset, mean, variance, _ = self._genParams(
+              data_format, x_dtype, large_batch)
+          for is_training in [False, True]:
+            op_output = nn_impl.fused_batch_norm(
+                x,
+                scale,
+                offset,
+                mean,
+                variance,
+                data_format=data_format,
+                is_training=is_training,
+                exponential_avg_factor=1.01)
+            y_a, running_mean_a, running_var_a = op_output
+            y_a = self.evaluate(y_a)
+            if is_training:
+              running_mean_a = self.evaluate(running_mean_a)
+              running_var_a = self.evaluate(running_var_a)
+            for _ in range(5):
+              op_output_b = nn_impl.fused_batch_norm(
                   x,
                   scale,
                   offset,
@@ -88,72 +105,57 @@ def testForward(self):
                   data_format=data_format,
                   is_training=is_training,
                   exponential_avg_factor=1.01)
-              y_a, running_mean_a, running_var_a = op_output
-              y_a = self.evaluate(y_a)
+              y_b, running_mean_b, running_var_b = op_output_b
+              y_b = self.evaluate(y_b)
+              self.assertAllEqual(y_a, y_b)
               if is_training:
-                running_mean_a = self.evaluate(running_mean_a)
-                running_var_a = self.evaluate(running_var_a)
-              for _ in range(5):
-                op_output_b = nn_impl.fused_batch_norm(
-                    x,
-                    scale,
-                    offset,
-                    mean,
-                    variance,
-                    data_format=data_format,
-                    is_training=is_training,
-                    exponential_avg_factor=1.01)
-                y_b, running_mean_b, running_var_b = op_output_b
-                y_b = self.evaluate(y_b)
-                self.assertAllEqual(y_a, y_b)
-                if is_training:
-                  running_mean_b = self.evaluate(running_mean_b)
-                  running_var_b = self.evaluate(running_var_b)
-                  self.assertAllEqual(running_mean_a, running_mean_b)
-                  self.assertAllEqual(running_var_a, running_var_b)
+                running_mean_b = self.evaluate(running_mean_b)
+                running_var_b = self.evaluate(running_var_b)
+                self.assertAllEqual(running_mean_a, running_mean_b)
+                self.assertAllEqual(running_var_a, running_var_b)
 
+  @parameterized.parameters('NHWC', 'NCHW')
   @test_util.disable_xla('XLA is deterministic')
-  def testBackward(self):
+  def testBackward(self, data_format):
     with self.cached_session():
-      for data_format in ['NHWC', 'NCHW']:
-        for large_batch in [False, True]:
-          for x_dtype in [dtypes.float16, dtypes.float32]:  # skipping bfloat16
-            params = self._genParams(data_format, x_dtype, large_batch)
-            x, scale, offset, mean, variance, upstream_gradients = params
-            for is_training in [False, True]:
-              for backprop_to in [x, scale, offset]:
-                with backprop.GradientTape(persistent=True) as tape:
-                  tape.watch(backprop_to)
-                  op_output = nn_impl.fused_batch_norm(
-                      x,
-                      scale,
-                      offset,
-                      mean,
-                      variance,
-                      data_format=data_format,
-                      is_training=is_training,
-                      exponential_avg_factor=0.99)
-                  gradient_injector_output = op_output[0] * upstream_gradients
-                if (len(config.list_physical_devices('GPU')) and
-                    not is_training):
-                  # Only backprop to offset is nondeterministic (on GPU, when
-                  # is_training=False), but backprop to the other parameters is
-                  # calculated using the same kernel.
-                  with self.assertRaisesRegex(
-                      errors_impl.UnimplementedError,
-                      'A deterministic GPU implementation of fused batch-norm' +
-                      ' backprop, when training is disabled, is not currently' +
-                      ' available.'):
-                    grad = tape.gradient(gradient_injector_output, backprop_to)
-                    self.evaluate(grad)
-                else:
-                  grad_a = tape.gradient(gradient_injector_output, backprop_to)
-                  grad_a = self.evaluate(grad_a)
-                  for _ in range(5):
-                    grad_b = tape.gradient(gradient_injector_output,
-                                           backprop_to)
-                    grad_b = self.evaluate(grad_b)
-                    self.assertAllEqual(grad_a, grad_b)
+      for large_batch in [False, True]:
+        # Only run with float32, as float16 is very slow on CPUs
+        params = self._genParams(data_format, dtypes.float32, large_batch)
+        x, scale, offset, mean, variance, upstream_gradients = params
+        for is_training in [False, True]:
+          for backprop_to in [x, scale, offset]:
+            with backprop.GradientTape(persistent=True) as tape:
+              tape.watch(backprop_to)
+              op_output = nn_impl.fused_batch_norm(
+                  x,
+                  scale,
+                  offset,
+                  mean,
+                  variance,
+                  data_format=data_format,
+                  is_training=is_training,
+                  exponential_avg_factor=0.99)
+              gradient_injector_output = op_output[0] * upstream_gradients
+            if (len(config.list_physical_devices('GPU')) and
+                not is_training):
+              # Only backprop to offset is nondeterministic (on GPU, when
+              # is_training=False), but backprop to the other parameters is
+              # calculated using the same kernel.
+              with self.assertRaisesRegex(
+                  errors_impl.UnimplementedError,
+                  'A deterministic GPU implementation of fused batch-norm' +
+                  ' backprop, when training is disabled, is not currently' +
+                  ' available.'):
+                grad = tape.gradient(gradient_injector_output, backprop_to)
+                self.evaluate(grad)
+            else:
+              grad_a = tape.gradient(gradient_injector_output, backprop_to)
+              grad_a = self.evaluate(grad_a)
+              for _ in range(3):
+                grad_b = tape.gradient(gradient_injector_output,
+                                       backprop_to)
+                grad_b = self.evaluate(grad_b)
+                self.assertAllEqual(grad_a, grad_b)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD
index d81778cc2743c8..89c7e1abf64351 100644
--- a/tensorflow/python/ops/ragged/BUILD
+++ b/tensorflow/python/ops/ragged/BUILD
@@ -1196,9 +1196,10 @@ py_test(
 
 py_test(
     name = "ragged_shape_test",
-    size = "large",
+    size = "medium",
     srcs = ["ragged_shape_test.py"],
     python_version = "PY3",
+    shard_count = 8,
     srcs_version = "PY3",
     deps = [
         ":ragged",  # fixdeps: keep
@@ -1433,3 +1434,23 @@ py_test(
         "@absl_py//absl/testing:parameterized",
     ],
 )
+
+py_test(
+    name = "ragged_split_op_test",
+    srcs = ["ragged_split_op_test.py"],
+    python_version = "PY3",
+    srcs_version = "PY3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_tensor",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/framework:dtypes",
+        "//tensorflow/python/framework:errors",
+        "//tensorflow/python/framework:ops",
+        "//tensorflow/python/framework:tensor_spec",
+        "//tensorflow/python/platform:test",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py
index b706161dc5bb7f..469da9e8ff6a8b 100644
--- a/tensorflow/python/ops/ragged/ragged_array_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_array_ops.py
@@ -17,15 +17,18 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gen_ragged_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sort_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_shape
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.ops.ragged import segment_id_ops
@@ -875,3 +878,163 @@ def dynamic_partition(data: ragged_tensor.RaggedOrDense,
     raise TypeError('num_partitions must be a non-negative integer')
   result = stack_dynamic_partitions(data, partitions, num_partitions, name)
   return [result[i] for i in range(num_partitions)]
+
+
+#===============================================================================
+# split
+#===============================================================================
+@dispatch.dispatch_for_api(array_ops.split)
+def split(value: ragged_tensor.Ragged,
+          num_or_size_splits,
+          axis=0,
+          num=None,
+          name=None):
+  """Splits a RaggedTensor `value` into a list of sub RaggedTensors.
+
+  If `num_or_size_splits` is an `int`,  then it splits `value` along the
+  dimension `axis` into `num_or_size_splits` smaller RaggedTensors. This
+  requires that `value.shape[axis]` is divisible by `num_or_size_splits`.
+
+  If `num_or_size_splits` is a 1-D Tensor (or list), then `value` is split into
+  `len(num_or_size_splits)` elements. The shape of the `i`-th element has the
+  same size as the `value` except along dimension `axis` where the size is
+  `num_or_size_splits[i]`.
+
+  Splits along a ragged dimension is not allowed.
+
+  For example:
+
+  >>> rt = tf.RaggedTensor.from_row_lengths(
+  ...      np.arange(6 * 3).reshape(6, 3), row_lengths=[1, 2, 2, 1])
+  >>> rt.shape
+  TensorShape([4, None, 3])
+  >>>
+  >>> rt1, rt2 = tf.split(rt, 2)  # uniform splits
+  >>> rt1.shape
+  TensorShape([2, None, 3])
+  >>> rt2.shape
+  TensorShape([2, None, 3])
+  >>>
+  >>> rt3, rt4, rt5 = tf.split(rt, [1, 2, 1])  # ragged splits
+  >>> rt3.shape
+  TensorShape([1, None, 3])
+  >>> rt4.shape
+  TensorShape([2, None, 3])
+  >>> rt5.shape
+  TensorShape([1, None, 3])
+  >>>
+  >>> rt6, rt7 = tf.split(rt, [1, 2], axis=2)  # splits along axis 2
+  >>> rt6.shape
+  TensorShape([4, None, 1])
+  >>> rt7.shape
+  TensorShape([4, None, 2])
+
+  Args:
+    value: The `RaggedTensor` to split.
+    num_or_size_splits: Either an `int` indicating the number of splits
+      along `axis` or a 1-D integer `Tensor` or Python list containing the sizes
+      of each output tensor along `axis`. If a Python int, then it must evenly
+      divide `value.shape[axis]`; otherwise the sum of sizes along the split
+      axis must match that of the `value`.
+    axis: An `int` or scalar `int32` `Tensor`. The dimension along which
+      to split. Must be in the range `[-rank(value), rank(value))`. Defaults to
+      0.
+    num: An `int` used to specify the number of outputs when
+      `num_or_size_splits` is a 1-D list or `Tensor` and its length is
+      statically unknown, e.g., specifying `tf.TensorSepc(None)` with
+      the `input_signature` argument of `tf.function` (optional).
+    name: A name for the operation (optional).
+
+  Returns:
+    if `num_or_size_splits` is an `int` returns a list of `num_or_size_splits`
+    `RaggedTensor` objects; if `num_or_size_splits` is a 1-D Tensor returns
+    `num_or_size_splits.get_shape[0]` `RaggedTensor` objects resulting from
+    splitting `value`.
+
+  Raises:
+    ValueError: If the dimension `axis` of `value` is a ragged dimension.
+    ValueError: If `num` is unspecified and cannot be inferred.
+    ValueError: If `num` is specified but doesn't match the length of
+      `num_or_size_splits`.
+    ValueError: If `num_or_size_splits` is an `int` and less than 1.
+    TypeError: If `num_or_size_splits` is not an `int` or 1-D
+      list or 1-D `Tensor`.
+    InvalidArgumentError: If the `axis` of `value` cannot be exactly splitted
+      by `num_or_size_splits`.
+    InvalidArgumentError: If `num_or_size_splits` is contains negative integers.
+    InvalidArgumentError: If `num_or_size_splits`'s static shape is unknown and
+      its dynamic shape is inconsistent `num`.
+    InvalidArgumentError: If `num_or_size_splits`'s static rank is unknown and
+      `axis` is a negative integer.
+  """
+  with ops.name_scope(name, 'RaggedSplit'):
+    if isinstance(num_or_size_splits, int) and num_or_size_splits == 1:
+      return [value]
+
+    # static assert
+    check_ops.assert_integer_v2(
+        num_or_size_splits,
+        message=('`num_or_size_splits` must be an `int` or 1-D list or '
+                 '`Tensor` of integers.'))
+    value_shape = ragged_shape.RaggedShape.from_tensor(value)
+    axis = array_ops.get_positive_axis(axis, value_shape.rank)
+    try:
+      dim_size = value_shape[axis]
+    except ValueError:
+      raise ValueError('Cannot split a ragged dimension. Got `value` with '
+                       f'shape {value_shape} and `axis` {axis}.')
+    if isinstance(num_or_size_splits, int):
+      # Uniform split
+      num_splits = num_or_size_splits
+      if num_splits < 1:
+        raise ValueError('`num_or_size_splits` must be >=1 if it is an `int`.'
+                         f'Received {num_or_size_splits}.')
+      split_length = math_ops.floordiv(dim_size, num_splits)
+      split_lengths = array_ops.repeat(split_length, num_splits)
+    else:
+      # Ragged split
+      num_splits = None
+      split_lengths = ops.convert_to_tensor(num_or_size_splits)
+      if split_lengths.shape.ndims is not None:
+        if split_lengths.shape.ndims != 1:
+          raise TypeError('`num_or_size_splits` must be an `int` or 1-D list '
+                          f'or `Tensor`. Received {num_or_size_splits}.')
+        num_splits = tensor_shape.dimension_value(split_lengths.shape[0])
+
+      if num_splits is None:
+        if num is None:
+          raise ValueError('`num` must be specified as an `int` when the '
+                           'size of `num_or_size_split` is statically '
+                           f'unknown. Received `num`: {num} and '
+                           f'`num_or_size_split`: {num_or_size_splits}.')
+        num_splits = num
+      else:
+        if num is not None and num != num_splits:
+          raise ValueError('`num` does not match the size of '
+                           f'`num_or_size_split`. Received `num`: {num} and '
+                           f'size of `num_or_size_split`: {num_splits}.')
+
+    splits = array_ops.concat([[0], math_ops.cumsum(split_lengths)], axis=0)
+    checks = []
+    checks.append(
+        check_ops.assert_non_negative_v2(
+            num_or_size_splits,
+            message='`num_or_size_splits` must be non-negative.'))
+    checks.append(
+        check_ops.assert_equal_v2(
+            num_splits,
+            array_ops.shape(split_lengths)[0],
+            message='`num` is inconsistent with `num_or_size_split.shape[0]`.'))
+    checks.append(
+        check_ops.assert_equal_v2(
+            math_ops.cast(dim_size, splits.dtype),
+            splits[-1],
+            message=('Cannot exactly split the `axis` dimension of `value` '
+                     'with the given `num_or_size_split`.')))
+    splits = control_flow_ops.with_dependencies(checks, splits)
+    splited_rts = []
+    slices = [slice(None)] * (axis + 1)
+    for i in range(num_splits):
+      slices[-1] = slice(splits[i], splits[i + 1])
+      splited_rts.append(value[tuple(slices)])
+    return splited_rts
diff --git a/tensorflow/python/ops/ragged/ragged_split_op_test.py b/tensorflow/python/ops/ragged/ragged_split_op_test.py
new file mode 100644
index 00000000000000..88fd16d46e1a29
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_split_op_test.py
@@ -0,0 +1,480 @@
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ragged_array_ops.split."""
+
+import itertools
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSplitOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Uniform splits.
+      #=========================================================================
+      dict(
+          descr='Uniform splits, rank-2 inputs, axis=0',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=2,
+          expected=[
+              [[1]],
+              [[2, 3, 4]]]),
+      dict(
+          descr='Uniform 3 splits, rank-2 inputs, axis=0',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 2, 1],  # shape=(3, None)
+          num_or_size_splits=3,
+          expected=[
+              [[1]],
+              [[2, 3]],
+              [[4]]]),
+      dict(
+          descr='Uniform 5 splits, rank-2 inputs, axis=0',
+          pylist=[1, 2, 3, 4, 5],
+          row_lengths=[1, 1, 1, 1, 1],  # shape=(5, None)
+          num_or_size_splits=5,
+          expected=[
+              [[1]],
+              [[2]],
+              [[3]],
+              [[4]],
+              [[5]]]),
+      dict(
+          descr='Uniform 2 splits, rank-2 inputs(empty), axis=0',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[4, 0],  # shape=(2, None)
+          num_or_size_splits=2,
+          expected=[
+              [[1, 2, 3, 4]],
+              [[]]]),
+      dict(
+          descr='Uniform 2 splits, rank-2 inputs(all empty), axis=0',
+          pylist=[],
+          row_lengths=[0, 0],  # shape=(2, None)
+          num_or_size_splits=2,
+          expected=[
+              [[]],
+              [[]]]),
+      dict(
+          descr='Uniform 1 split, rank-2 inputs, axis=0',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=1,
+          expected=[
+              [[1], [2, 3, 4]]]),
+      dict(
+          descr='Uniform 1 split, rank-2 inputs, axis=1',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=1,
+          axis=1,
+          expected=[
+              [[1], [2, 3, 4]]]),
+      dict(
+          descr='Uniform 2 split, rank-3 inputs, axis=0',
+          pylist=np.arange(4 * 2).reshape(4, 2),
+          row_lengths=[1, 3],  # shape=(2, None, 2)
+          num_or_size_splits=2,
+          expected=[
+              [[[0, 1]]],
+              [[[2, 3], [4, 5], [6, 7]]]]),
+      dict(
+          descr='Uniform 2 splits, rank-3 inputs, axis=2',
+          pylist=np.arange(4 * 2).reshape(4, 2),
+          row_lengths=[1, 3],  # shape=(2, None, 2)
+          num_or_size_splits=2,
+          axis=2,
+          expected=[
+              [[[0]], [[2], [4], [6]]],
+              [[[1]], [[3], [5], [7]]]]),
+      dict(
+          descr='Uniform 2 splits, rank-2 float inputs, axis=0',
+          pylist=[1.0, 2.0, 3.0, 4.0],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=2,
+          expected=[
+              [[1.0]],
+              [[2.0, 3.0, 4.0]]]),
+      dict(
+          descr='Uniform 2 splits, rank-2 string inputs, axis=0',
+          pylist=[b'a', b'bc', b'', b'd'],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=2,
+          expected=[
+              [[b'a']],
+              [[b'bc', b'', b'd']]]),
+      #=========================================================================
+      # Ragged splits.
+      #=========================================================================
+      dict(
+          descr='Ragged 2 splits, rank-2 inputs, axis=0',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=[1, 1],
+          expected=[
+              [[1]],
+              [[2, 3, 4]]]),
+      dict(
+          descr='Ragged 3 splits, rank-2 inputs, axis=0',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 2, 1],  # shape=(3, None)
+          num_or_size_splits=[1, 2],
+          expected=[
+              [[1]],
+              [[2, 3], [4]]]),
+      dict(
+          descr='Ragged 5 splits, rank-2 inputs(empty), axis=0',
+          pylist=[1, 2, 3, 4, 5],
+          row_lengths=[1, 1, 1, 1, 1],  # shape=(5, None)
+          num_or_size_splits=[1, 2, 2, 0],
+          expected=[
+              [[1]],
+              [[2], [3]],
+              [[4], [5]],
+              []]),
+      dict(
+          descr='Ragged 2 splits, rank-2 inputs(empty), axis=0',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[4, 0, 0, 0],  # shape=(2, None)
+          num_or_size_splits=[3, 1],
+          expected=[
+              [[1, 2, 3, 4], [], []],
+              [[]]]),
+      dict(
+          descr='Ragged 2 splits, rank-2 inputs(all empty), axis=0',
+          pylist=[],
+          row_lengths=[0, 0],  # shape=(2, None)
+          num_or_size_splits=[2, 0],
+          expected=[
+              [[], []],
+              []]),
+      dict(
+          descr='Ragged 1 split, rank-2 inputs, axis=0',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=[2],
+          expected=[
+              [[1], [2, 3, 4]]]),
+      dict(
+          descr='Ragged 2 split, rank-3 inputs, axis=0',
+          pylist=np.arange(4 * 2).reshape(4, 2),
+          row_lengths=[1, 3],  # shape=(2, None, 2)
+          num_or_size_splits=[1, 1],
+          expected=[
+              [[[0, 1]]],
+              [[[2, 3], [4, 5], [6, 7]]]]),
+      dict(
+          descr='Ragged 2 split, rank-3 inputs, axis=-3',
+          pylist=np.arange(4 * 2).reshape(4, 2),
+          row_lengths=[1, 3],  # shape=(2, None, 2)
+          num_or_size_splits=[1, 1],
+          expected=[
+              [[[0, 1]]],
+              [[[2, 3], [4, 5], [6, 7]]]]),
+      dict(
+          descr='Ragged 2 splits, rank-3 inputs, axis=2',
+          pylist=np.arange(4 * 3).reshape(4, 3),
+          row_lengths=[1, 3],  # shape=(2, None, 3)
+          num_or_size_splits=[2, 1],
+          axis=2,
+          expected=[
+              [[[0, 1]], [[3, 4], [6, 7], [9, 10]]],
+              [[[2]], [[5], [8], [11]]]]),
+      dict(
+          descr='Ragged 2 splits, rank-3 inputs, axis=-1',
+          pylist=np.arange(4 * 3).reshape(4, 3),
+          row_lengths=[1, 3],  # shape=(2, None, 3)
+          num_or_size_splits=[2, 1],
+          axis=2,
+          expected=[
+              [[[0, 1]], [[3, 4], [6, 7], [9, 10]]],
+              [[[2]], [[5], [8], [11]]]]),
+      dict(
+          descr='Ragged 3 splits, rank-2 float inputs, axis=0',
+          pylist=[1.0, 2.0, 3.0, 4.0],
+          row_lengths=[1, 2, 1],  # shape=(2, None)
+          num_or_size_splits=[2, 1],
+          expected=[
+              [[1.0], [2.0, 3.0]],
+              [[4.0]]]),
+      dict(
+          descr='Ragged 3 splits with name, rank-2 float inputs, axis=0',
+          pylist=[1.0, 2.0, 3.0, 4.0],
+          row_lengths=[1, 2, 1],  # shape=(2, None)
+          num_or_size_splits=[2, 1],
+          name='ragged_split',
+          expected=[
+              [[1.0], [2.0, 3.0]],
+              [[4.0]]]),
+      dict(
+          descr='Ragged 3 splits with num, rank-2 float inputs, axis=0',
+          pylist=[1.0, 2.0, 3.0, 4.0],
+          row_lengths=[1, 2, 1],  # shape=(2, None)
+          num_or_size_splits=[2, 1],
+          num=2,
+          expected=[
+              [[1.0], [2.0, 3.0]],
+              [[4.0]]]),
+      dict(
+          descr='Ragged 2 splits, rank-2 string inputs, axis=0',
+          pylist=[b'a', b'bc', b'', b'd'],
+          row_lengths=[1, 3, 0],  # shape=(2, None)
+          num_or_size_splits=[2, 1],
+          expected=[
+              [[b'a'], [b'bc', b'', b'd']],
+              [[]]]),
+  ])  # pyformat: disable
+  def testSplit(self,
+                descr,
+                pylist,
+                row_lengths,
+                num_or_size_splits,
+                expected,
+                axis=0,
+                num=None,
+                name=None):
+    rt = ragged_tensor.RaggedTensor.from_row_lengths(pylist, row_lengths)
+    result = ragged_array_ops.split(rt, num_or_size_splits, axis, num, name)
+    self.assertLen(result, len(expected))
+    for res, exp in zip(result, expected):
+      self.assertAllEqual(res, exp)
+
+  @parameterized.parameters([
+      #=========================================================================
+      # Uniform splits errors.
+      #=========================================================================
+      dict(
+          descr='Uniform split, can not split',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=7,
+          exception=errors.InvalidArgumentError,
+          message='Cannot exactly split'),
+      dict(
+          descr='Uniform split, ragged dimension',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=2,
+          axis=1,
+          exception=ValueError,
+          message='ragged dimension'),
+      dict(
+          descr='Uniform split, zero split',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=0,
+          exception=ValueError,
+          message='must be >=1'),
+      #=========================================================================
+      # Ragged splits errors.
+      #=========================================================================
+      dict(
+          descr='Ragged split, 2 dimensional size_splits',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=[[1, 1]],
+          exception=TypeError,
+          message='Python list'),
+      dict(
+          descr='Ragged split, ragged dimension',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=[1, 1],
+          axis=1,
+          exception=ValueError,
+          message='ragged dimension'),
+      dict(
+          descr='Ragged split, cannot split',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=[1, 2],
+          exception=errors.InvalidArgumentError,
+          message='Cannot exactly split'),
+      dict(
+          descr='Ragged split, num does not match',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=[1, 1],
+          num=3,
+          exception=ValueError,
+          message='`num` does not match'),
+      dict(
+          descr='Ragged split, negative split',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=[1, -1, 2],
+          num=3,
+          exception=errors.InvalidArgumentError,
+          message='must be non-negative'),
+      dict(
+          descr='Ragged split, float splits',
+          pylist=[1, 2, 3, 4],
+          row_lengths=[1, 3],  # shape=(2, None)
+          num_or_size_splits=[1.0, 2.0],
+          num=2,
+          exception=TypeError,
+          message='integer'),
+  ])  # pyformat: disable
+  def testSplitError(self,
+                     descr,
+                     pylist,
+                     row_lengths,
+                     num_or_size_splits,
+                     exception,
+                     message,
+                     axis=0,
+                     num=None):
+    rt = ragged_tensor.RaggedTensor.from_row_lengths(pylist, row_lengths)
+    with self.assertRaises(exception):
+      result = ragged_array_ops.split(rt, num_or_size_splits, axis, num)
+      self.evaluate(result)
+
+  @parameterized.named_parameters([
+      ('int32', dtypes.int32),
+      ('int64', dtypes.int64)])
+  def testSplitTensorDtype(self, dtype):
+    rt = ragged_tensor.RaggedTensor.from_row_lengths([1.0, 2.0, 3.0, 4.0],
+                                                     [3, 1])
+    # split_lengths is a 1-D tensor
+    split_lengths = ops.convert_to_tensor([1, 1], dtype=dtype)
+    result = ragged_array_ops.split(rt, split_lengths)
+    expected = [
+        ragged_tensor.RaggedTensor.from_row_lengths([1.0, 2.0, 3.0], [3]),
+        ragged_tensor.RaggedTensor.from_row_lengths([4.0], [1])]
+    self.assertLen(result, len(expected))
+    for res, exp in zip(result, expected):
+      self.assertAllEqual(res, exp)
+
+  @parameterized.parameters([
+      dict(rt_shape=(2, None)),
+      dict(rt_shape=None),
+  ])
+  def testUniformSplitDynamicShape(self, rt_shape):
+    rt = ragged_tensor.RaggedTensor.from_row_lengths([1.0, 2.0, 3.0, 4.0],
+                                                     [3, 1])
+    rt_spec = ragged_tensor.RaggedTensorSpec(rt_shape, ragged_rank=1)
+    @def_function.function(input_signature=[rt_spec])
+    def split_tensors(rt):
+      return ragged_array_ops.split(rt, 2)
+
+    splited_rts = split_tensors(rt)
+    expected_rts = [
+        ragged_tensor.RaggedTensor.from_row_lengths([1.0, 2.0, 3.0], [3]),
+        ragged_tensor.RaggedTensor.from_row_lengths([4.0], [1])]
+    for splited_rt, expected_rt in zip(splited_rts, expected_rts):
+      self.assertAllEqual(splited_rt, expected_rt)
+
+  @parameterized.parameters([
+      dict(rt_shape=x, lengths_shape=y) for x, y in itertools.product(
+          [(2, None), None],
+          [(2,), (None,), None])
+  ])
+  def testRaggedSplitDynamicShape(self, rt_shape, lengths_shape):
+    rt_spec = ragged_tensor.RaggedTensorSpec(rt_shape, ragged_rank=1)
+    lengths_spec = tensor_spec.TensorSpec(lengths_shape, dtype=dtypes.int32)
+    @def_function.function(input_signature=[rt_spec, lengths_spec])
+    def split_tensors(rt, split_lengths):
+      return ragged_array_ops.split(rt, split_lengths, num=2)
+
+    rt = ragged_tensor.RaggedTensor.from_row_lengths([1.0, 2.0, 3.0, 4.0],
+                                                     [3, 1])
+    split_lengths = [1, 1]
+    # split_lengths matches num at runtime
+    splited_rts = split_tensors(rt, split_lengths)
+    expected_rts = [
+        ragged_tensor.RaggedTensor.from_row_lengths([1.0, 2.0, 3.0], [3]),
+        ragged_tensor.RaggedTensor.from_row_lengths([4.0], [1])]
+    for splited_rt, expected_rt in zip(splited_rts, expected_rts):
+      self.assertAllEqual(splited_rt, expected_rt)
+
+  @parameterized.parameters([
+      dict(
+          descr='lengths known rank, num and lengths mismatch',
+          rt_shape=(None, 1),
+          lengths_shape=(None,),
+          lengths=[1, 1, 0],
+          num=2,
+          exception=errors.InvalidArgumentError,
+          message='inconsistent'),
+      dict(
+          descr='lengths unknown rank, num and lengths mismatch',
+          rt_shape=None,
+          lengths_shape=None,
+          lengths=[1, 1, 0],
+          num=2,
+          exception=errors.InvalidArgumentError,
+          message='inconsistent'),
+      dict(
+          descr='rt unknown rank, negative axis',
+          rt_shape=None,
+          lengths_shape=None,
+          lengths=[1, 1],
+          axis=-2,
+          num=2,
+          exception=ValueError,
+          message='negative'),
+      dict(
+          descr='lengths unknown rank, num is None',
+          rt_shape=None,
+          lengths_shape=None,
+          lengths=[1, 1],
+          exception=ValueError,
+          message='`num` must be specified'),
+      dict(
+          descr='lengths unknown rank, dynamic rank!=1',
+          rt_shape=None,
+          lengths_shape=None,
+          lengths=[[1, 1]],
+          num=2,
+          exception=(ValueError, errors.InvalidArgumentError)),
+  ])
+  def testRaggedSplitDynamicShapeError(self,
+                                       descr,
+                                       rt_shape,
+                                       lengths_shape,
+                                       lengths,
+                                       exception,
+                                       message='',
+                                       axis=0,
+                                       num=None):
+    rt_spec = ragged_tensor.RaggedTensorSpec(rt_shape, ragged_rank=1)
+    split_lengths_spec = tensor_spec.TensorSpec(lengths_shape,
+                                                dtype=dtypes.int32)
+    @def_function.function(input_signature=[rt_spec, split_lengths_spec])
+    def split_tensors(rt, split_lengths):
+      return ragged_array_ops.split(rt, split_lengths, axis=axis, num=num)
+
+    rt = ragged_tensor.RaggedTensor.from_row_lengths([1.0, 2.0, 3.0, 4.0],
+                                                     [3, 1])
+    with self.assertRaisesRegex(exception, message):
+      self.evaluate(split_tensors(rt=rt, split_lengths=lengths))
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/raw_ops_test.py b/tensorflow/python/ops/raw_ops_test.py
index 953ab570f7d101..5097800d9ea13e 100644
--- a/tensorflow/python/ops/raw_ops_test.py
+++ b/tensorflow/python/ops/raw_ops_test.py
@@ -28,7 +28,6 @@
 
 
 @test_util.run_all_in_graph_and_eager_modes
-@test_util.disable_tfrt
 class RawOpsTest(test.TestCase, parameterized.TestCase):
 
   def testSimple(self):
@@ -63,8 +62,9 @@ def testDefaults(self):
   @parameterized.parameters([[0, 8]], [[-1, 6]])
   def testStringNGramsBadDataSplits(self, splits):
     data = ["aa", "bb", "cc", "dd", "ee", "ff"]
-    with self.assertRaisesRegex(errors.InvalidArgumentError,
-                                "Invalid split value"):
+    with self.assertRaisesRegex(
+        errors.InvalidArgumentError,
+        r"Invalid split value|First split value must be 0"):
       self.evaluate(
           gen_string_ops.string_n_grams(
               data=data,
@@ -76,6 +76,25 @@ def testStringNGramsBadDataSplits(self, splits):
               pad_width=0,
               preserve_short_sequences=False))
 
+  def testStringSplit(self):
+    data = ["123456"]
+    data_splits = [0, 1]
+    separator = "a" * 15
+    ngram_widths = []
+    pad_width = -5
+    left_pad = right_pad = ""
+    with self.assertRaisesRegex(errors.InvalidArgumentError,
+                                "Pad width should be >= 0"):
+      self.evaluate(gen_string_ops.string_n_grams(
+          data=data,
+          data_splits=data_splits,
+          separator=separator,
+          ngram_widths=ngram_widths,
+          left_pad=left_pad,
+          right_pad=right_pad,
+          pad_width=pad_width,
+          preserve_short_sequences=True))
+
   def testGetSessionHandle(self):
     if context.executing_eagerly():
       with self.assertRaisesRegex(
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index 7fd62554eefc57..75ce3290d409e2 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -20,7 +20,6 @@
 import sys as _sys
 
 from tensorflow.python import autograph
-from tensorflow.python.training.experimental import loss_scaling_gradient_tape
 
 # pylint: disable=g-bad-import-order
 # Imports the following modules so that @RegisterGradient get executed.
diff --git a/tensorflow/python/ops/stateful_random_ops.py b/tensorflow/python/ops/stateful_random_ops.py
index 96afbfcab3a316..dafaab3d44301c 100644
--- a/tensorflow/python/ops/stateful_random_ops.py
+++ b/tensorflow/python/ops/stateful_random_ops.py
@@ -470,6 +470,8 @@ def _create_variable(self, *args, **kwargs):
       The created variable.
     """
     with ops.name_scope("random_generator"):
+      # Make sure we don't change this name since Keras was using this name
+      # to filter out the state variable.
       kwargs["name"] = "StateVar"
       v = variables.Variable(*args, **kwargs)
     if isinstance(v, sharded_variable.ShardedVariable):
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index bf636fbd723714..c889b614afe728 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -64,8 +64,7 @@
 # side-effecting ops, this mode produces unspecified results.
 # Setting it to "stateless_cond" automatically sets this mode to True when
 # the loop condition is free of side-effecting ops.
-# TODO(b/152548567): Change this to "stateless_cond".
-glob_stateful_parallelism = False
+glob_stateful_parallelism = "stateless_cond"
 
 
 def while_loop(cond,
diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD
index dc1fda3ed42a13..4e4a361fb5d159 100644
--- a/tensorflow/python/profiler/BUILD
+++ b/tensorflow/python/profiler/BUILD
@@ -232,12 +232,3 @@ py_library(
         "//tensorflow/python/util:tf_export",
     ],
 )
-
-py_library(
-    name = "traceme",
-    srcs = ["traceme.py"],
-    srcs_version = "PY3",
-    deps = [
-        ":trace",
-    ],
-)
diff --git a/tensorflow/python/profiler/trace.py b/tensorflow/python/profiler/trace.py
index 5ab1616ecc3476..6b6bc7ac243a75 100644
--- a/tensorflow/python/profiler/trace.py
+++ b/tensorflow/python/profiler/trace.py
@@ -127,13 +127,15 @@ def trace_wrapper(trace_name, **trace_kwargs):
   """Decorator alternative to `with Trace(): ...`.  It's faster.
 
   Args:
-    trace_name: The name of the trace event.
+    trace_name: The name of the trace event, or a callable to be traced, in
+      which case the name is inferred from qualname or name of the callable.
     **trace_kwargs: Keyword arguments added to the trace event. Both the key and
       value are of types that can be converted to strings, which will be
       interpreted by the profiler according to the traceme name.
 
   Returns:
-    A decorator that can wrap a function and apply `Trace` scope if needed.
+    A decorator that can wrap a function and apply `Trace` scope if needed,
+    or a decorated function if used as a decorator directly.
 
   Example usage:
     ```python
@@ -147,8 +149,30 @@ def func(x, y, z):
     #   func(1, 2, 3)
     func(1, 2, 3)
     ```
+
+  or
+    ```python
+
+    @trace_wrapper
+    def func(x, y, z):
+      pass  # code to execute and apply `Trace` if needed.
+
+    # Equivalent to
+    # with Trace(func.__qualname__):
+    #   func(1, 2, 3)
+    func(1, 2, 3)
+    ```
+
   """
 
+  if callable(trace_name):
+    func = trace_name
+    name = getattr(func, '__qualname__', None)
+    if not name:
+      name = getattr(func, '__name__', 'unknown function')
+
+    return trace_wrapper(name)(func)
+
   def inner_wrapper(func):
 
     @functools.wraps(func)
diff --git a/tensorflow/python/profiler/traceme.py b/tensorflow/python/profiler/traceme.py
deleted file mode 100644
index e21711886d910e..00000000000000
--- a/tensorflow/python/profiler/traceme.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""TraceMe allows the profiler to trace Python events."""
-
-from tensorflow.python.profiler.trace import Trace as TraceMe
-
-
-def traceme_wrapper(func):
-  name = getattr(func, '__qualname__', None)
-  if not name:
-    name = func.__name__
-
-  def wrapper(*args, **kwargs):
-    with TraceMe(name):
-      return func(*args, **kwargs)
-  return wrapper
diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py
index c520c23880b255..a9f3d2e822196b 100644
--- a/tensorflow/python/saved_model/load.py
+++ b/tensorflow/python/saved_model/load.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Import a trackable object from a SavedModel."""
 
+import collections
 import functools
 import sys
 
@@ -22,7 +23,6 @@
 from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import values_util
 from tensorflow.python.eager import context
-from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -48,6 +48,7 @@
 from tensorflow.python.training.tracking import base
 from tensorflow.python.training.tracking import data_structures
 from tensorflow.python.training.tracking import graph_view
+from tensorflow.python.training.tracking import resource
 from tensorflow.python.training.tracking import trackable_utils
 from tensorflow.python.training.tracking import tracking
 from tensorflow.python.training.tracking import util
@@ -56,12 +57,12 @@
 
 # API label for SavedModel metrics.
 _LOAD_V2_LABEL = "load_v2"
-
 # Built-in registrations use the "oneof kind" field in the SavedObject proto,
 # instead of "registered_name" field. The "kind" field has almost the same
 # functionality as the registered_name, but only contains built-in TensorFlow
 # types (like variable, functions, assets).
-_BUILT_IN_REGISTRATIONS = {"asset": tracking.Asset}
+_BUILT_IN_REGISTRATIONS = {"asset": tracking.Asset,
+                           "resource": resource.RestoredResource}
 
 
 def _unused_handle():
@@ -149,6 +150,9 @@ def __init__(self, object_graph_proto, saved_model_proto, export_dir,
             library=meta_graph.graph_def.library,
             saved_object_graph=self._proto,
             wrapper_function=_WrapperFunction))
+    # Store a set of all concrete functions that have been set up with
+    # captures.
+    self._restored_concrete_functions = set()
     self._checkpoint_options = ckpt_options
     self._save_options = save_options
 
@@ -276,12 +280,10 @@ def _load_all(self):
     """Loads all nodes and functions from the SavedModel and their edges."""
     self._load_nodes()
     self._load_edges()
-    # TODO(b/124045874): There are limitations with functions whose captures
-    # trigger other functions to be executed. For now it is only guaranteed to
-    # work if the captures of a function only trigger functions without
-    # captures.
-    self._setup_functions_captures()
 
+    # Set up concrete functions that aren't part of the object graph
+    # (e.g. gradient functions)
+    self._setup_remaining_functions()
     self._create_saveable_object_factories()
 
   def _create_saveable_object_factories(self):
@@ -327,88 +329,87 @@ def _add_object_graph_edges(self, proto, node_id):
       if reference.local_name == "__call__" and not callable(obj):
         setattr(type(obj), "__call__", _call_attribute)
 
-  def _setup_functions_captures(self):
-    """Setup captures and variables in restored functions."""
-    concrete_functions = sorted(self._proto.concrete_functions.items())
-    for name, proto in concrete_functions:
-      concrete_function = self._concrete_functions[name]
-      bound_inputs = [
-          self._get_tensor_from_node(node_id, name)
-          for node_id in proto.bound_inputs]
-      bound_variables = [
-          self._nodes[node_id]
-          for node_id in proto.bound_inputs
-          if self._proto.nodes[node_id].WhichOneof("kind") == "variable"
-      ]
-      # TODO(b/205010575): This is only injecting the captured inputs into the
-      # concrete function, note that we did not modify the FuncGraph
-      # itself.
-      captured_inputs_list = []
-      concrete_function._func_graph.variables = bound_variables  # pylint: disable=protected-access
-      if bound_inputs:
-        for bound_input, internal_capture in zip(
-            bound_inputs, concrete_function.inputs[-len(bound_inputs):]):
-          if distribute_utils.is_distributed_variable(bound_input):
-            concrete_function.graph.capture_distributed_variable(
-                bound_input, internal_capture)
-            captured_inputs_list.append(bound_input)
-          elif distribute_utils.is_distributed_table(bound_input):
-            closure, spec = bound_input.resource_handle_call_time_value()
-            concrete_function.graph.replace_capture_with_deferred_capture(
-                bound_input._coordinator_instance.resource_handle,  # pylint: disable=protected-access
-                closure,
-                spec,
-                default_value=bound_input._coordinator_instance.resource_handle,  # pylint: disable=protected-access
-                placeholder=internal_capture)
-            captured_inputs_list.append(
-                concrete_function.graph.deferred_external_captures[-1])
+  def _setup_remaining_functions(self):
+    concrete_function_names = sorted(self._proto.concrete_functions.keys())
+    for name in concrete_function_names:
+      if name in self._restored_concrete_functions:
+        continue
+      self._setup_function_captures(name, self._nodes)
+
+  def _setup_function_captures(self, concrete_function_name, nodes):
+    """Setup captures and variables in a restored function."""
+    self._restored_concrete_functions.add(concrete_function_name)
+    concrete_function = self._concrete_functions[concrete_function_name]
+    proto = self._proto.concrete_functions[concrete_function_name]
+    bound_inputs = [
+        self._get_tensor_from_node(nodes[node_id])
+        for node_id in proto.bound_inputs]
+    bound_variables = [
+        nodes[node_id] for node_id in proto.bound_inputs
+        if self._proto.nodes[node_id].WhichOneof("kind") == "variable"
+    ]
+    # TODO(b/205010575): This is only injecting the captured inputs into the
+    # concrete function, note that we did not modify the FuncGraph
+    # itself.
+    captured_inputs_list = []
+    concrete_function._func_graph.variables = bound_variables  # pylint: disable=protected-access
+    if bound_inputs:
+      for bound_input, internal_capture in zip(
+          bound_inputs, concrete_function.inputs[-len(bound_inputs):]):
+        if distribute_utils.is_distributed_variable(bound_input):
+          concrete_function.graph.capture_distributed_variable(
+              bound_input, internal_capture)
+          captured_inputs_list.append(bound_input)
+        elif distribute_utils.is_distributed_table(bound_input):
+          closure, spec = bound_input.resource_handle_call_time_value()
+          concrete_function.graph.replace_capture_with_deferred_capture(
+              bound_input._coordinator_instance.resource_handle,  # pylint: disable=protected-access
+              closure,
+              spec,
+              default_value=bound_input._coordinator_instance.resource_handle,  # pylint: disable=protected-access
+              placeholder=internal_capture)
+          captured_inputs_list.append(
+              concrete_function.graph.deferred_external_captures[-1])
 
-          else:
-            captured_inputs_list.append(bound_input)
-            concrete_function.graph.replace_capture(bound_input,
-                                                    internal_capture)
-            if internal_capture.dtype == dtypes.resource:
-              if resource_variable_ops.is_resource_variable(bound_input):
-                try:
-                  handle = bound_input.handle
-                except ValueError:
-                  # For mirrored variables we'll copy handle data for components
-                  # as they get captured.
-                  pass
-                else:
-                  handle_data_util.copy_handle_data(handle, internal_capture)
+        else:
+          captured_inputs_list.append(bound_input)
+          concrete_function.graph.replace_capture(bound_input,
+                                                  internal_capture)
+          if internal_capture.dtype == dtypes.resource:
+            if resource_variable_ops.is_resource_variable(bound_input):
+              try:
+                handle = bound_input.handle
+              except ValueError:
+                # For mirrored variables we'll copy handle data for components
+                # as they get captured.
+                pass
               else:
-                handle_data_util.copy_handle_data(bound_input, internal_capture)
-            # Setting "captures" first means "capture" won't create a new
-            # placeholder for this input.
-            concrete_function.graph.capture(bound_input)
+                handle_data_util.copy_handle_data(handle, internal_capture)
+            else:
+              handle_data_util.copy_handle_data(bound_input, internal_capture)
+          # Setting "captures" first means "capture" won't create a new
+          # placeholder for this input.
+          concrete_function.graph.capture(bound_input)
 
-      concrete_function.set_external_captures(captured_inputs_list)
+    concrete_function.set_external_captures(captured_inputs_list)
 
-  def _get_tensor_from_node(self, node_id, fn_name):
+  def _get_tensor_from_node(self, node):
     """Resolves a node id into a tensor to be captured for a function."""
-    if self._node_filters is not None and self._nodes[node_id] is None:
-      raise ValueError(
-          f"Error when processing nodes_to_load. Function '{fn_name}' requires "
-          "inputs/variables that are not loaded when nodes_to_load="
-          f"{self._node_filters}.")
-
     with ops.init_scope():
-      obj = self._nodes[node_id]
-      if distribute_utils.is_distributed_variable(obj):
-        return obj
-      elif distribute_utils.is_distributed_table(obj):
-        return obj
-      elif resource_variable_ops.is_resource_variable(obj):
-        return obj.handle
-      elif isinstance(obj, tracking.Asset):
-        return obj.asset_path
-      elif tensor_util.is_tf_type(obj):
-        return obj
-      elif isinstance(obj, tracking.CapturableResource):
+      if distribute_utils.is_distributed_variable(node):
+        return node
+      elif distribute_utils.is_distributed_table(node):
+        return node
+      elif resource_variable_ops.is_resource_variable(node):
+        return node.handle
+      elif isinstance(node, tracking.Asset):
+        return node.asset_path
+      elif tensor_util.is_tf_type(node):
+        return node
+      elif isinstance(node, tracking.CapturableResource):
         # Note: this executes restored functions in the CapturableResource.
-        return obj.resource_handle
-      raise ValueError(f"Cannot convert node {obj} to tensor.")
+        return node.resource_handle
+      raise ValueError(f"Cannot convert node {node} to tensor.")
 
   def _initialize_loaded_nodes(self):
     nodes = {}
@@ -418,6 +419,34 @@ def _initialize_loaded_nodes(self):
       node_setters[node_id] = setter
     return nodes, node_setters
 
+  def _get_node_dependencies(self, proto):
+    """Returns a dictionary of all dependencies of an object.
+
+    Args:
+      proto: A SavedObject proto.
+
+    Returns:
+      Dict mapping string dependency name *or* int node id to the node id.
+      The int node id key is used for mapping function captures.
+    """
+    dependencies = {ref.local_name: ref.node_id for ref in proto.dependencies}
+    kind = proto.WhichOneof("kind")
+    if kind == "function":
+      concrete_functions = proto.function.concrete_functions
+      for fn_name in concrete_functions:
+        for bound_input in self._proto.concrete_functions[fn_name].bound_inputs:
+          dependencies[bound_input] = bound_input
+    elif kind == "bare_concrete_function":
+      fn_name = proto.bare_concrete_function.concrete_function_name
+      for bound_input in self._proto.concrete_functions[fn_name].bound_inputs:
+        dependencies[bound_input] = bound_input
+    elif kind == "resource":
+      # Make sure that the resource creator is listed as a dependency.
+      for child in proto.children:
+        if child.local_name == "_create_resource":
+          dependencies["_create_resource"] = child.node_id
+    return dependencies
+
   def _generate_ordered_node_ids(self):
     """Orders the node ids so that dependencies appear first."""
     if self._filtered_nodes is None:
@@ -425,22 +454,45 @@ def _generate_ordered_node_ids(self):
     else:
       unordered_ids = list(self._filtered_nodes)
 
-    dependency_map = {}
+    # Maps node ids -> list of dependencies (ids of other nodes that must be
+    # loaded before it).
+    dependency_map = collections.defaultdict(list)
     for node_id in unordered_ids:
-      deps = dependency_map[node_id] = []
+      deps = dependency_map[node_id]
       if self._loaded_nodes.get(node_id) is not None:
         # Deps are only used if the node has not been created.
         continue
-      for reference in self._proto.nodes[node_id].dependencies:
-        dep = reference.node_id
+      proto = self._proto.nodes[node_id]
+      for dep in set(self._get_node_dependencies(proto).values()):
         deps.append(dep)
         if self._filtered_nodes is not None and dep not in self._filtered_nodes:
           raise ValueError(
               "Unable to partially load SavedModel since the specified filter "
-              "does not include all deserialization dependencies. Please "
-              "include this path in the filter: "
+              "does not include all required objects for loading (e.g. "
+              "variables used in functions or deserialization dependencies). "
+              "Please include this path in the filter: "
               f"{self._pretty_printer.node_names[dep]}")
 
+      # Add optimizer slot variable to dependency map.
+      prev_slot = None
+      for slot_variable_proto in proto.slot_variables:
+        slot_variable_node_id = slot_variable_proto.slot_variable_node_id
+        # The optimizer and original variable must be created before the slot
+        # variable, since the slot variable is generated using the Optimizer's
+        # add_slot API.
+        slot_deps = dependency_map[slot_variable_node_id]
+        slot_deps.append(node_id)
+        slot_deps.append(slot_variable_proto.original_variable_node_id)
+
+        if prev_slot is not None:
+          # Add previous slot to deps so that the optimizer slot variables are
+          # added in order. The ordering is needed because the slot name and
+          # variable are both added to ordered lists, which are exposed to the
+          # user via `Optimizer.get_slot_names()` and `Optimizer.weights`.
+          # TODO(kathywu): Maybe enforce some sort of deterministic ordering in
+          # `order_by_dependency` to avoid doing this?
+          slot_deps.append(prev_slot)
+        prev_slot = slot_variable_node_id
     try:
       return list(trackable_utils.order_by_dependency(dependency_map))
     except trackable_utils.CyclicDependencyError:
@@ -464,30 +516,23 @@ def _load_nodes(self):
 
     # Figure out which objects are slot variables. These objects are created
     # with Optimizer.add_slot rather than _recreate_variable.
-    slot_variable_node_ids = set()
+    # Maps slot node id -> optimizer node id, SlotVariableReference proto
+    slot_variable_node_ids = {}
 
-    for _, proto in self._iter_all_nodes():
-      for slot_variable_proto in proto.slot_variables:
-        slot_variable_node_ids.add(slot_variable_proto.slot_variable_node_id)
-
-    # Re-create everything except slot variables.
     for node_id, proto in self._iter_all_nodes():
-      if node_id in slot_variable_node_ids or nodes.get(node_id) is not None:
-        # Defer recreating slot variables so we can use the public Optimizer
-        # interface.
-        continue
-      node, setter = self._recreate(proto, node_id, nodes)
-      nodes[node_id] = node
-      node_setters[node_id] = setter
+      for slot_variable_proto in proto.slot_variables:
+        slot_variable_node_id = slot_variable_proto.slot_variable_node_id
+        slot_variable_node_ids[slot_variable_node_id] = (node_id,
+                                                         slot_variable_proto)
 
-    # Now that we have created the variables being optimized, we have enough
-    # information to re-create slot variables for them.
+    # Re-create everything.
     for node_id, proto in self._iter_all_nodes():
-      if node_id not in nodes:
-        # This is a slot variable that has not been created yet.
+      if nodes.get(node_id) is not None:
         continue
-      optimizer_object = nodes[node_id]
-      for slot_variable_proto in proto.slot_variables:
+      elif node_id in slot_variable_node_ids:
+        # Use the public Optimizer interface when creating slot variables.
+        optimizer_node_id, slot_variable_proto = slot_variable_node_ids[node_id]
+        optimizer_object = nodes[optimizer_node_id]
         optimized_variable = nodes[
             slot_variable_proto.original_variable_node_id]
         slot_variable = optimizer_object.add_slot(
@@ -495,6 +540,10 @@ def _load_nodes(self):
             slot_name=slot_variable_proto.slot_name)
         nodes[slot_variable_proto.slot_variable_node_id] = slot_variable
         node_setters[slot_variable_proto.slot_variable_node_id] = setattr
+      else:
+        node, setter = self._recreate(proto, node_id, nodes)
+        nodes[node_id] = node
+        node_setters[node_id] = setter
 
     # If root object is not loaded, add a dummy root object for checkpoint
     # compatibility.
@@ -586,36 +635,35 @@ def _recreate(self, proto, node_id, nodes):
       the trackable children.
     """
     registered_class = registration.get_registered_class(proto.registered_name)
-    if registered_class is not None:
-      user_proto = proto.serialized_user_proto
-    else:
+    if registered_class is None:
       registered_class = _BUILT_IN_REGISTRATIONS.get(proto.WhichOneof("kind"))
-      user_proto = proto
+
+    dependencies = {}
+    for key, dep_node_id in self._get_node_dependencies(proto).items():
+      dependencies[key] = nodes[dep_node_id]
+
     if registered_class:
-      dependencies = {}
-      for reference in proto.dependencies:
-        dependencies[reference.local_name] = nodes[reference.node_id]
       obj = registered_class._deserialize_from_proto(  # pylint: disable=protected-access
-          proto=user_proto,
+          proto=proto.serialized_user_proto,
+          object_proto=proto,
           dependencies=dependencies,
           export_dir=self._export_dir,
           asset_file_def=self._asset_file_def)
       return obj, type(obj)._add_trackable_child  # pylint: disable=protected-access
     else:
-      return self._recreate_default(proto, node_id)
+      return self._recreate_default(proto, node_id, dependencies)
 
-  def _recreate_default(self, proto, node_id):
+  def _recreate_default(self, proto, node_id, deps):
     """Creates a Python object from a SavedObject protocol buffer."""
     factory = {
         "user_object": (
             lambda: self._recreate_user_object(proto.user_object, node_id)),
-        "function": lambda: self._recreate_function(proto.function),
+        "function": lambda: self._recreate_function(proto.function, deps),
         "bare_concrete_function": functools.partial(
             self._recreate_bare_concrete_function,
-            proto.bare_concrete_function),
+            proto=proto.bare_concrete_function, dependencies=deps),
         "variable": lambda: self._recreate_variable(proto.variable),
         "constant": lambda: self._recreate_constant(proto.constant),
-        "resource": lambda: self._recreate_resource(proto.resource),
         "captured_tensor": functools.partial(
             self._get_tensor_from_fn, proto.captured_tensor),
     }
@@ -643,13 +691,18 @@ class _UserObject(tracking.AutoTrackable):
 
     return _UserObject(), setattr
 
-  def _recreate_function(self, proto):
-    return function_deserialization.recreate_function(
-        proto, self._concrete_functions), setattr
+  def _recreate_function(self, proto, dependencies):
+    fn = function_deserialization.recreate_function(
+        proto, self._concrete_functions)
+    for name in proto.concrete_functions:
+      self._setup_function_captures(name, dependencies)
+    return fn, setattr
 
-  def _recreate_bare_concrete_function(self, proto):
-    return function_deserialization.setup_bare_concrete_function(
-        proto, self._concrete_functions), setattr
+  def _recreate_bare_concrete_function(self, proto, dependencies):
+    fn = function_deserialization.setup_bare_concrete_function(
+        proto, self._concrete_functions)
+    self._setup_function_captures(proto.concrete_function_name, dependencies)
+    return fn, setattr
 
   def _recreate_variable(self, proto):
     name = proto.name if proto.name else None
@@ -696,54 +749,11 @@ def _get_tensor_from_fn(self, proto):
     captured_tensor = outer_graph.get_tensor_by_name(proto.name)
     return captured_tensor, setattr
 
-  def _recreate_resource(self, proto):
-    return _RestoredResource(device=proto.device), _setattr_and_track
-
-
-# TODO(b/124205571,b/124092991): Solve destruction of resources.
-class _RestoredResource(tracking.TrackableResource):
-  """Restored SavedResource."""
-
-  def __init__(self, device=""):
-    super(_RestoredResource, self).__init__(device=device)
-
-  def _create_resource(self):
-    raise RuntimeError()
-
-  def _initialize(self):
-    raise RuntimeError()
-
-  # _list_functions_for_serialization expects Function objects, but unlike
-  # _create_resource and _initialize, _destroy_function didn't always exist in
-  # older TrackableResource implementations, so this default stub must be a
-  # Function.
-  @def_function.function
-  def _destroy_resource(self):
-    raise RuntimeError()
-
-  def _list_functions_for_serialization(self, unused_serialization_cache):
-    # Overwrite this method to avoid the implementation of
-    # base class to re-wrap the polymorphic functions into
-    # another layer of `tf.function`.
-    functions = {
-        "_create_resource": self._create_resource,
-        "_initialize": self._initialize,
-        "_destroy_resource": self._destroy_resource,
-    }
-    return functions
-
 
 def _call_attribute(instance, *args, **kwargs):
   return instance.__call__(*args, **kwargs)
 
 
-def _setattr_and_track(obj, name, value):
-  """Sets new attribute and marks it as a dependency if Trackable."""
-  setattr(obj, name, value)
-  if isinstance(value, base.Trackable):
-    obj._track_trackable(value, name)  # pylint:disable=protected-access
-
-
 @tf_export("__internal__.saved_model.load_partial", v1=[])
 def load_partial(export_dir, filters, tags=None, options=None):
   """Partially load a SavedModel (saved from V2).
diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py
index 8533e9aa1f2761..d4cc3dc3337f3f 100644
--- a/tensorflow/python/saved_model/load_test.py
+++ b/tensorflow/python/saved_model/load_test.py
@@ -29,7 +29,6 @@
 
 
 from tensorflow.python.client import session as session_lib
-from tensorflow.python.compat import compat
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.eager import backprop
@@ -1791,49 +1790,6 @@ def test_variable_attributes_preserved(self, cycles):
     self.assertEqual(variables.VariableAggregation.ONLY_FIRST_REPLICA,
                      root.v.aggregation)
 
-  def test_captured_dataset_with_asset(self, cycles):
-
-    class HasDataset(module.Module):
-
-      def __init__(self, temp_dir, file_name):
-        super(HasDataset, self).__init__()
-        file = os.path.join(temp_dir, file_name)
-        with tf_record.TFRecordWriter(file, "GZIP") as f:
-          for v in ["a", "aa", "aaa"]:
-            f.write(str(v))
-        self.dataset = readers.TFRecordDataset([file], compression_type="GZIP")
-
-      @def_function.function
-      def __call__(self, x):
-        current_sum = array_ops.zeros([], dtype=dtypes.int32)
-        for element in self.dataset:
-          current_sum += x * string_ops.string_length(element)
-        return current_sum
-
-    temp_dir = self.get_temp_dir()
-    file_name = "tf_record_asset.tfrecord.gz"
-    root = HasDataset(temp_dir, file_name)
-    self.assertEqual(
-        18,  # 3 * (1 + 2 + 3)
-        root(constant_op.constant(3, dtype=dtypes.int32)).numpy())
-
-    save_dir = os.path.join(self.get_temp_dir(), "save_dir")
-    save.save(root, save_dir)
-
-    file_io.delete_file(os.path.join(temp_dir, file_name))
-    asset_path = os.path.join(save_dir, "assets/{}".format(file_name))
-    if compat.forward_compatible(2021, 9, 20):
-      self.assertTrue(file_io.file_exists(asset_path))
-      load_dir = os.path.join(self.get_temp_dir(), "load_dir")
-      file_io.rename(save_dir, load_dir)
-
-      # TODO(b/188455028): Remove assertRaises block and check that invoking
-      # loaded SavedModel behaves as expected.
-      with self.assertRaises(ValueError) as error:
-        _ = load.load(load_dir)
-      self.assertEqual("Signature specifies 1 arguments, got: 0.",
-                       str(error.exception))
-
   def test_captured_dataset(self, cycles):
 
     class HasDataset(module.Module):
@@ -2281,7 +2237,8 @@ def __call__(self, y):
     adder(5)
     self.assertEqual(self.evaluate(v), 6)
 
-    with self.assertRaisesRegex(ValueError, "requires inputs/variables"):
+    with self.assertRaisesRegex(
+        ValueError, "does not include all required objects for loading"):
       imported = load.load_partial(save_dir, ["root.adder"])
 
   def test_load_partial_checkpoint(self):
@@ -2391,6 +2348,46 @@ def test_garbage_collection_capturable_resource_doesnt_raise_exception(self):
     if "Exception ignored in" in stderr.getvalue():
       raise Exception(stderr.getvalue())
 
+  def test_captured_dataset_with_asset(self):
+
+    class HasDataset(module.Module):
+
+      def __init__(self, temp_dir, file_name):
+        super(HasDataset, self).__init__()
+        file = os.path.join(temp_dir, file_name)
+        with tf_record.TFRecordWriter(file, "GZIP") as f:
+          for v in ["a", "aa", "aaa"]:
+            f.write(str(v))
+        self.dataset = readers.TFRecordDataset([file], compression_type="GZIP")
+
+      @def_function.function
+      def __call__(self, x):
+        current_sum = array_ops.zeros([], dtype=dtypes.int32)
+        for element in self.dataset:
+          current_sum += x * string_ops.string_length(element)
+        return current_sum
+
+    temp_dir = self.get_temp_dir()
+    file_name = "tf_record_asset.tfrecord.gz"
+    root = HasDataset(temp_dir, file_name)
+    self.assertEqual(
+        18,  # 3 * (1 + 2 + 3)
+        root(constant_op.constant(3, dtype=dtypes.int32)).numpy())
+
+    save_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    save.save(root, save_dir)
+
+    file_io.delete_file(os.path.join(temp_dir, file_name))
+    asset_path = os.path.join(save_dir, "assets/{}".format(file_name))
+    self.assertTrue(file_io.file_exists(asset_path))
+    load_dir = os.path.join(self.get_temp_dir(), "load_dir")
+    file_io.rename(save_dir, load_dir)
+
+    loaded = load.load(load_dir)
+    self.assertEqual(
+        18,  # 3 * (1 + 2 + 3)
+        loaded(constant_op.constant(3, dtype=dtypes.int32)).numpy())
+
 
 class DeferredInitModuleVariablesTest(test.TestCase):
 
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index 82c061dd7b20e2..4d97a926aa0391 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -54,7 +54,7 @@ def parse_saved_model_with_debug_info(export_dir):
     IOError: If the saved model file does not exist, or cannot be successfully
     parsed. Missing graph debug info file is fine.
   """
-  saved_model = _parse_saved_model(export_dir)
+  saved_model = parse_saved_model(export_dir)
 
   debug_info_path = file_io.join(
       saved_model_utils.get_debug_dir(export_dir),
@@ -118,11 +118,6 @@ def parse_saved_model(export_dir):
         f"{constants.SAVED_MODEL_FILENAME_PB}}}")
 
 
-# TODO(b/120594573): Make this symbol also available as private, so that
-# tensorflow_transform and tensorflow_estimator do not break.
-_parse_saved_model = parse_saved_model
-
-
 def get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   """Gets the asset tensors, if defined in the meta graph def to load.
 
diff --git a/tensorflow/python/tools/saved_model_aot_compile.py b/tensorflow/python/tools/saved_model_aot_compile.py
index 311c37119f7b21..4ca907b091e993 100644
--- a/tensorflow/python/tools/saved_model_aot_compile.py
+++ b/tensorflow/python/tools/saved_model_aot_compile.py
@@ -19,6 +19,7 @@
 import os
 import re
 import shlex
+from typing import List, Tuple
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
@@ -200,46 +201,26 @@ def _prune_removed_feed_nodes(signature_def, graph_def):
   return new_signature_def
 
 
-def aot_compile_cpu_meta_graph_def(checkpoint_path,
-                                   meta_graph_def,
-                                   output_prefix,
-                                   signature_def_key,
-                                   cpp_class,
-                                   target_triple,
-                                   target_cpu,
-                                   variables_to_feed=(),
-                                   multithreading=False):
-  """Compile a `MetaGraphDef` to header+object files in `output_prefix`.
-
-  Use XLA AOT (`tfcompile`) to convert the given meta graph and
-  signature into a header + object files.  Also create an include makefile
-  that helps identify the appropriate necessary include and library paths
-  to incorporate these files into your C++ program.
+def freeze_model(checkpoint_path: str,
+                 meta_graph_def: meta_graph_pb2.MetaGraphDef,
+                 output_prefix: str, signature_def_key: str,
+                 variables_to_feed: List[str]) -> Tuple[str, str]:
+  """Freeze a `MetaGraphDef` in preparation for tfcompile`.
 
   The graph is always optimized with grappler, and optionally (by default)
   variables are frozen as constants, before compilation happens.
 
-  If the `freeze_graph` is `True`, all variables are embedded as constants
-  into the graph and binary objects.  If it is `False`, then the variable
-  values become inputs and outputs of the compiled class and the C++
-  caller must set these values manually.
-
   Args:
     checkpoint_path: Python string.  Path to checkpoints/variables.
     meta_graph_def: Instance of `MetaGraphDef`.
     output_prefix: Python string.  Path prefix for outputs.
     signature_def_key: String, the signature_def to use in the SavedModel.
-    cpp_class: String, Name of output C++ class.
-    target_triple: String, LLVM target triple.
-    target_cpu: String, LLVM target cpu name.
     variables_to_feed: A list of strings, the variables that will be fed by the
       user; these won't be frozen.  If `None`, then we will extract all the
       variables in the graph and mark them as to-feed.  The default behavior is
       an empty tuple: all variables must be frozen.
-    multithreading: Whether to enable multithreading in the compiled
-      computation.  Note that if using this option, the resulting object files
-      may have external dependencies on multithreading libraries like nsync.
-
+  Returns:
+    a pair containing the path to the frozen model and the path to the config.
   Raises:
     RuntimeError: If tensorflow was not built with XLA.
     ImportError: If tensorflow was built with XLA but there was another
@@ -250,18 +231,6 @@ def aot_compile_cpu_meta_graph_def(checkpoint_path,
   if _pywrap_tfcompile_import_error:
     raise _pywrap_tfcompile_import_error  # pylint: disable=raising-bad-type
 
-  else:
-    # TODO(ebrevdo): Pipe DebugOptions through tfcompile::Main and pywrap
-    # so that we can set these directly instead of relying on env vars.
-    xla_flags = os.environ.get('XLA_FLAGS')
-    if not xla_flags:
-      xla_flags = '--xla_cpu_multi_thread_eigen={}'.format(
-          'true' if multithreading else 'false')
-    else:
-      xla_flags += ' --xla_cpu_multi_thread_eigen={}'.format(
-          'true' if multithreading else 'false')
-    os.environ['XLA_FLAGS'] = xla_flags
-
   signature_def_map = meta_graph_def.signature_def
   if signature_def_key not in signature_def_map:
     raise ValueError(
@@ -274,10 +243,10 @@ def aot_compile_cpu_meta_graph_def(checkpoint_path,
         f'Signature key {signature_def_key} must have outputs, but saw none:\n'
         f'{str(signature_def)}')
 
-  temp_dir = test.get_temp_dir()
-  file_io.recursive_create_dir(temp_dir)
+  file_io.recursive_create_dir(output_prefix)
   if logging.get_verbosity() >= logging.INFO:
-    original_graph_def_location = os.path.join(temp_dir, 'original_graph.pb')
+    original_graph_def_location = os.path.join(output_prefix,
+                                               'original_graph.pb')
     with file_io.FileIO(original_graph_def_location, 'wb') as graph_writer:
       graph_writer.write(meta_graph_def.graph_def.SerializeToString())
 
@@ -301,7 +270,8 @@ def aot_compile_cpu_meta_graph_def(checkpoint_path,
     ]
 
   if logging.get_verbosity() >= logging.INFO:
-    prefrozen_graph_def_location = os.path.join(temp_dir, 'prefrozen_graph.pb')
+    prefrozen_graph_def_location = os.path.join(output_prefix,
+                                                'prefrozen_graph.pb')
     with file_io.FileIO(prefrozen_graph_def_location, 'wb') as graph_writer:
       graph_writer.write(graph_def.SerializeToString())
 
@@ -325,8 +295,8 @@ def aot_compile_cpu_meta_graph_def(checkpoint_path,
 
   signature_def = _prune_removed_feed_nodes(signature_def, graph_def)
 
-  frozen_graph_def_location = os.path.join(temp_dir, 'frozen_graph.pb')
-  config_pbtxt_location = os.path.join(temp_dir, 'config.pbtxt')
+  frozen_graph_def_location = os.path.join(output_prefix, 'frozen_graph.pb')
+  config_pbtxt_location = os.path.join(output_prefix, 'config.pbtxt')
   logging.info('Writing graph def to: {}'.format(frozen_graph_def_location))
   with file_io.FileIO(frozen_graph_def_location, 'wb') as graph_writer:
     graph_writer.write(graph_def.SerializeToString())
@@ -335,7 +305,81 @@ def aot_compile_cpu_meta_graph_def(checkpoint_path,
   logging.info('Writing config_pbtxt to: {}'.format(config_pbtxt_location))
   with file_io.FileIO(config_pbtxt_location, mode='w') as config_writer:
     config_writer.write(str(config))
+  return frozen_graph_def_location, config_pbtxt_location
 
+
+def aot_compile_cpu_meta_graph_def(checkpoint_path,
+                                   meta_graph_def,
+                                   output_prefix,
+                                   signature_def_key,
+                                   cpp_class,
+                                   target_triple,
+                                   target_cpu,
+                                   variables_to_feed=(),
+                                   multithreading=False):
+  """Compile a `MetaGraphDef` to header+object files in `output_prefix`.
+
+  Use XLA AOT (`tfcompile`) to convert the given meta graph and
+  signature into a header + object files.  Also create an include makefile
+  that helps identify the appropriate necessary include and library paths
+  to incorporate these files into your C++ program.
+
+  Freezing a graph entails restoring the checkpoint and replacing any inputs and
+  variables with constants. If values are feed, those are used, else inputs are
+  replaced with default all-zero constants. Finally, the graph is pruned and
+  then optimized with grappler.
+
+  If the `freeze_graph` is `True`, all variables are embedded as constants
+  into the graph and binary objects.  If it is `False`, then the variable
+  values become inputs and outputs of the compiled class and the C++
+  caller must set these values manually.
+
+  Args:
+    checkpoint_path: Python string.  Path to checkpoints/variables.
+    meta_graph_def: Instance of `MetaGraphDef`.
+    output_prefix: Python string.  Path prefix for outputs.
+    signature_def_key: String, the signature_def to use in the SavedModel.
+    cpp_class: String, Name of output C++ class.
+    target_triple: String, LLVM target triple.
+    target_cpu: String, LLVM target cpu name.
+    variables_to_feed: A list of strings, the variables that will be fed by the
+      user; these won't be frozen.  If `None`, then we will extract all the
+      variables in the graph and mark them as to-feed.  The default behavior is
+      an empty tuple: all variables must be frozen.
+    multithreading: Whether to enable multithreading in the compiled
+      computation.  Note that if using this option, the resulting object files
+      may have external dependencies on multithreading libraries like nsync.
+
+  Raises:
+    RuntimeError: If tensorflow was not built with XLA.
+    ImportError: If tensorflow was built with XLA but there was another
+      issue importing the tfcompile python wrapper.
+    ValueError: If `meta_graph_def.signature_def[signature_def_key]` is
+      missing or has empty outputs.
+  """
+  if _pywrap_tfcompile_import_error:
+    raise _pywrap_tfcompile_import_error  # pylint: disable=raising-bad-type
+
+  else:
+    # TODO(ebrevdo): Pipe DebugOptions through tfcompile::Main and pywrap
+    # so that we can set these directly instead of relying on env vars.
+    xla_flags = os.environ.get('XLA_FLAGS')
+    if not xla_flags:
+      xla_flags = '--xla_cpu_multi_thread_eigen={}'.format(
+          'true' if multithreading else 'false')
+    else:
+      xla_flags += ' --xla_cpu_multi_thread_eigen={}'.format(
+          'true' if multithreading else 'false')
+    os.environ['XLA_FLAGS'] = xla_flags
+
+  temp_dir = test.get_temp_dir()
+  file_io.recursive_create_dir(temp_dir)
+  frozen_graph_def_location, config_pbtxt_location = freeze_model(
+      checkpoint_path=checkpoint_path,
+      meta_graph_def=meta_graph_def,
+      output_prefix=temp_dir,
+      signature_def_key=signature_def_key,
+      variables_to_feed=variables_to_feed)
   output_dir = os.path.dirname(output_prefix)
   file_io.recursive_create_dir(output_dir)
 
diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py
index 90a3656aebd998..81b368a8a961a0 100644
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@@ -821,6 +821,31 @@ def convert_with_tensorrt(args):
         output_saved_model_dir=args.output_dir)
 
 
+def freeze_model(args):
+  """Function triggered by freeze_model command.
+
+  Args:
+    args: A namespace parsed from command line.
+  """
+  checkpoint_path = (
+      args.checkpoint_path
+      or os.path.join(args.dir, 'variables/variables'))
+  if not args.variables_to_feed:
+    variables_to_feed = []
+  elif args.variables_to_feed.lower() == 'all':
+    variables_to_feed = None  # We will identify them after.
+  else:
+    variables_to_feed = args.variables_to_feed.split(',')
+
+  saved_model_aot_compile.freeze_model(
+      checkpoint_path=checkpoint_path,
+      meta_graph_def=saved_model_utils.get_meta_graph_def(
+          args.dir, args.tag_set),
+      signature_def_key=args.signature_def_key,
+      variables_to_feed=variables_to_feed,
+      output_prefix=args.output_prefix)
+
+
 def aot_compile_cpu(args):
   """Function triggered by aot_compile_cpu command.
 
@@ -1059,6 +1084,70 @@ def add_convert_subparser(subparsers):
   parser_convert_with_tensorrt.set_defaults(func=convert_with_tensorrt)
 
 
+def _parse_common_freeze_and_aot(parser_compile):
+  """Parse arguments shared by freeze model and aot_compile."""
+  parser_compile.add_argument(
+      '--dir',
+      type=str,
+      required=True,
+      help='directory containing the SavedModel to convert')
+  parser_compile.add_argument(
+      '--output_prefix',
+      type=str,
+      required=True,
+      help=('output directory + filename prefix for the resulting header(s) '
+            'and object file(s)'))
+  parser_compile.add_argument(
+      '--tag_set',
+      type=str,
+      required=True,
+      help='tag-set of graph in SavedModel to convert, separated by \',\'')
+  parser_compile.add_argument(
+      '--signature_def_key',
+      type=str,
+      default=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
+      help=('signature_def key to use.  '
+            'default: DEFAULT_SERVING_SIGNATURE_DEF_KEY'))
+  parser_compile.add_argument(
+      '--checkpoint_path',
+      type=str,
+      default=None,
+      help='Custom checkpoint to use (default: use the SavedModel variables)')
+  parser_compile.add_argument(
+      '--variables_to_feed',
+      type=str,
+      default='',
+      help=('The names of variables that will be fed into the network.  '
+            'Options are: empty (default; all variables are frozen, none may '
+            'be fed), \'all\' (all variables may be fed), or a '
+            'comma-delimited list of names of variables that may be fed.  In '
+            'the last case, the non-fed variables will be frozen in the graph.'
+            '**NOTE** Any variables passed to `variables_to_feed` *must be set '
+            'by the user*.  These variables will NOT be frozen and their '
+            'values will be uninitialized in the compiled object '
+            '(this applies to all input arguments from the signature as '
+            'well).'))
+
+
+def add_freeze_model_subparser(subparsers):
+  """Add parser for `freeze_model`."""
+  compile_msg = '\n'.join(
+      ['Usage example:',
+       'To freeze a SavedModel in preparation for tfcompile:',
+       '$saved_model_cli freeze_model \\',
+       '   --dir /tmp/saved_model \\',
+       '   --tag_set serve \\',
+       '   --output_prefix /tmp/saved_model_xla_aot',
+      ])
+
+  parser_compile = subparsers.add_parser(
+      'freeze_model',
+      description=compile_msg,
+      formatter_class=argparse.RawTextHelpFormatter)
+  _parse_common_freeze_and_aot(parser_compile)
+  parser_compile.set_defaults(func=freeze_model)
+
+
 def add_aot_compile_cpu_subparser(subparsers):
   """Add parser for `aot_compile_cpu`."""
   compile_msg = '\n'.join(
@@ -1089,28 +1178,7 @@ def add_aot_compile_cpu_subparser(subparsers):
       'aot_compile_cpu',
       description=compile_msg,
       formatter_class=argparse.RawTextHelpFormatter)
-  parser_compile.add_argument(
-      '--dir',
-      type=str,
-      required=True,
-      help='directory containing the SavedModel to convert')
-  parser_compile.add_argument(
-      '--output_prefix',
-      type=str,
-      required=True,
-      help=('output directory + filename prefix for the resulting header(s) '
-            'and object file(s)'))
-  parser_compile.add_argument(
-      '--tag_set',
-      type=str,
-      required=True,
-      help='tag-set of graph in SavedModel to convert, separated by \',\'')
-  parser_compile.add_argument(
-      '--signature_def_key',
-      type=str,
-      default=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY,
-      help=('signature_def key to use.  '
-            'default: DEFAULT_SERVING_SIGNATURE_DEF_KEY'))
+  _parse_common_freeze_and_aot(parser_compile)
   parser_compile.add_argument(
       '--target_triple',
       type=str,
@@ -1127,11 +1195,6 @@ def add_aot_compile_cpu_subparser(subparsers):
             'x86_64, skylake, haswell, westmere, <empty> (unknown).  For '
             'a complete list of options, run (for x86 targets): '
             '`llc -march=x86 -mcpu=help`'))
-  parser_compile.add_argument(
-      '--checkpoint_path',
-      type=str,
-      default=None,
-      help='Custom checkpoint to use (default: use the SavedModel variables)')
   parser_compile.add_argument(
       '--cpp_class',
       type=str,
@@ -1143,20 +1206,6 @@ def add_aot_compile_cpu_subparser(subparsers):
             'may precede the class name, separated by double-colons.  '
             'The class will be generated in the given namespace(s), or if no '
             'namespaces are given, within the global namespace.'))
-  parser_compile.add_argument(
-      '--variables_to_feed',
-      type=str,
-      default='',
-      help=('The names of variables that will be fed into the network.  '
-            'Options are: empty (default; all variables are frozen, none may '
-            'be fed), \'all\' (all variables may be fed), or a '
-            'comma-delimited list of names of variables that may be fed.  In '
-            'the last case, the non-fed variables will be frozen in the graph.'
-            '**NOTE** Any variables passed to `variables_to_feed` *must be set '
-            'by the user*.  These variables will NOT be frozen and their '
-            'values will be uninitialized in the compiled object '
-            '(this applies to all input arguments from the signature as '
-            'well).'))
   parser_compile.add_argument(
       '--multithreading',
       type=str,
@@ -1197,6 +1246,8 @@ def create_parser():
   # aot_compile_cpu command
   add_aot_compile_cpu_subparser(subparsers)
 
+  # freeze_model command
+  add_freeze_model_subparser(subparsers)
   return parser
 
 
diff --git a/tensorflow/python/tools/saved_model_cli_test.py b/tensorflow/python/tools/saved_model_cli_test.py
index 424f156b86b883..c5971555201639 100644
--- a/tensorflow/python/tools/saved_model_cli_test.py
+++ b/tensorflow/python/tools/saved_model_cli_test.py
@@ -870,6 +870,35 @@ def testAOTCompileCPUFreezesAndCompiles(
         '{}_makefile.inc'.format(output_prefix))
     self.assertIn('-D_GLIBCXX_USE_CXX11_ABI=', makefile_contents)
 
+  def testFreezeModel(self):
+    if not test.is_built_with_xla():
+      self.skipTest('Skipping test because XLA is not compiled in.')
+
+    variables_to_feed = 'all'
+    func = 'func2'
+    saved_model_dir = os.path.join(test.get_temp_dir(), 'dummy_model')
+    dummy_model = self.AOTCompileDummyModel()
+    func = getattr(dummy_model, func)
+    with self.cached_session():
+      self.evaluate(dummy_model.var.initializer)
+      self.evaluate(dummy_model.write_var.initializer)
+      save.save(dummy_model, saved_model_dir, signatures={'func': func})
+
+    self.parser = saved_model_cli.create_parser()
+    output_prefix = os.path.join(test.get_temp_dir(), 'aot_compile_cpu_dir/out')
+    args = [  # Use the default seving signature_key.
+        'freeze_model', '--dir', saved_model_dir, '--tag_set', 'serve',
+        '--signature_def_key', 'func', '--output_prefix', output_prefix,
+        '--variables_to_feed', variables_to_feed
+    ]
+    args = self.parser.parse_args(args)
+    with test.mock.patch.object(logging, 'warn'):
+      saved_model_cli.freeze_model(args)
+    self.assertTrue(
+        file_io.file_exists(os.path.join(output_prefix, 'frozen_graph.pb')))
+    self.assertTrue(
+        file_io.file_exists(os.path.join(output_prefix, 'config.pbtxt')))
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/tpu/tpu_embedding_v2.py b/tensorflow/python/tpu/tpu_embedding_v2.py
index 24874e508259d0..1b1aaa6b48e8ea 100644
--- a/tensorflow/python/tpu/tpu_embedding_v2.py
+++ b/tensorflow/python/tpu/tpu_embedding_v2.py
@@ -15,7 +15,7 @@
 """Mid level API for TPU Embeddings."""
 
 import functools
-from typing import Any, Dict, Callable, Iterable, List, Optional, Text, Tuple, Union
+from typing import Any, Callable, Dict, Iterable, List, Optional, Text, Tuple, Union
 
 from absl import logging
 
@@ -33,9 +33,11 @@
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework.tensor_shape import TensorShape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import numpy_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as tf_variables
@@ -155,11 +157,26 @@ class TPUEmbedding(tracking.AutoTrackable):
   dataset_iterator = iter(distributed_dataset)
   ```
 
-  NOTE: All batches passed to the layer must have the same batch size for each
-  input, more over once you have called the layer with one batch size all
-  subsequent calls must use the same batch_size. In the event that the batch
-  size cannot be automatically determined by the enqueue method, you must call
-  the build method with the batch size to initialize the layer.
+  Different feature inputs can have different shapes. For dense and sparse
+  tensor, rank 2 and above is supported. For ragged tensor, although only rank 2
+  is supported, you can specify the output shape to be rank 2 and above. The
+  output shape specified in the FeatureConfig has the first priority. The input
+  shape passed in build method has second priority and the input shapes
+  auto detected from input feature has the lowest priority. The latter two will
+  be converted to output shapes by omitting the last dimension. If the lower
+  priority one has output shapes which don't match the former one. A ValueError
+  will be raised. Only when the former one has undefined output shapes, the
+  latter one can override.
+
+  NOTE: All batches passed to the layer can have different input shapes. But
+  these input shapes need to match with the output shapes set by either
+  `FeatureConfig` or build method except for ragged tensor. Only 2D
+  ragged tensor with output shape set to higher dimensions is allowed as
+  long as the total number of elements matches. All subsequent calls must have
+  the same input shapes. In the event that the input shapes cannot be
+  automatically determined by the enqueue method, you must call
+  the build method with the input shapes or provide output shapes in the
+  `FeatureConfig` to initialize the layer.
 
   To use this API on TPU you should use a custom training loop. Below is an
   example of a training and evaluation step:
@@ -277,6 +294,9 @@ def __init__(
         pipeline_execution_with_tensor_core)
 
     self._feature_config = feature_config
+    self._output_shapes = []
+    for feature in nest.flatten(feature_config):
+      self._output_shapes.append(feature.output_shape)
 
     # The TPU embedding ops are slightly inconsistent with how they refer to
     # tables:
@@ -336,9 +356,9 @@ def __init__(
       self._hosts = get_list_of_hosts(self._strategy)
 
     self._built = False
-    self._verify_batch_size_on_enqueue = True
+    self._verify_output_shapes_on_enqueue = True
 
-  def build(self, per_replica_batch_size: Optional[int] = None):
+  def build(self, per_replica_input_shapes=None, per_replica_batch_size=None):  # pylint:disable=g-bare-generic
     """Create the underlying variables and initializes the TPU for embeddings.
 
     This method creates the underlying variables (including slot variables). If
@@ -346,29 +366,34 @@ def build(self, per_replica_batch_size: Optional[int] = None):
     embeddings.
 
     This function will automatically get called by enqueue, which will try to
-    determine your batch size automatically. If this fails, you must manually
+    determine your output shapes. If this fails, you must manually
     call this method before you call enqueue.
 
     Args:
-      per_replica_batch_size: The per replica batch size that you intend to use.
-        Note that is fixed and the same batch size must be used for both
-        training and evaluation. If you want to calculate this from the global
-        batch size, you can use `num_replicas_in_sync` property of your strategy
-        object. May be set to None if not created under a TPUStrategy.
+      per_replica_input_shapes: A nested structure of The per replica input
+        shapes that matches the structure of the feature config. The input
+        shapes should be the same as the input shape of the feature (except for
+        ragged tensor) Note that it is fixed and the same per replica input
+        shapes must be used for both training and evaluation. If you want to
+        calculate this from the global input shapes, you can use
+        `num_replicas_in_sync` property of your strategy object. May be set to
+        None if not created under a TPUStrategy.
+      per_replica_batch_size: (Deprecated) The per replica batch size that you
+        intend to use. Note that is fixed and the same batch size must be used
+        for both training and evaluation. If you want to calculate this from the
+        global batch size, you can use `num_replicas_in_sync` property of your
+        strategy object. May be set to None if not created under a TPUStrategy.
 
     Raises:
-      ValueError: If per_replica_batch_size is None and object was created in a
-        TPUStrategy scope.
+      ValueError: If per_replica_input_shapes is inconsistent with the output
+      shapes stored in the feature config or the output shapes get from the
+      input shapes are not fully defined.
       RuntimeError: If tpu embedding is already initialized on TPU.
     """
     if self._built:
       return
 
     if self._using_tpu:
-      if per_replica_batch_size is None:
-        raise ValueError(
-            "When calling TpuShardedVariable.build under TpuStrategy you must "
-            "specify a per_replica_batch_size argument.")
       # If the tpu embedding is already initialized on TPU, raise runtime error.
       # Below logic is not added in `initialize_system_for_tpu_embedding`
       # because doing exception control flow in graph mode is difficult.
@@ -377,7 +402,8 @@ def build(self, per_replica_batch_size: Optional[int] = None):
             "TPU is already initialized for embeddings. This may be caused by "
             "using multiple TPUEmbedding instances in a TPU scope which is "
             "unsupported")
-      self._batch_size = per_replica_batch_size
+      self._get_and_update_output_shapes_from_input(per_replica_input_shapes,
+                                                    per_replica_batch_size)
 
       self._config_proto = self._create_config_proto()
 
@@ -402,7 +428,8 @@ def load_config():
     # This is internally conditioned self._built and self._using_tpu
     self._load_variables()
 
-  def _maybe_build(self, batch_size: Optional[int]):
+  def _maybe_build(self,
+                   output_shapes: Optional[Union[List[int], Iterable]] = None):  # pylint:disable=g-bare-generic
     if not self._built:
       # This can be called while tracing a function, so we wrap the
       # initialization code with init_scope so it runs eagerly, this means that
@@ -410,7 +437,73 @@ def _maybe_build(self, batch_size: Optional[int]):
       # we can be sure that we only initialize the TPU for embeddings exactly
       # once.
       with ops.init_scope():
-        self.build(batch_size)
+        self.build(output_shapes)
+
+  def _get_and_update_output_shapes_from_input(
+      self,
+      per_replica_input_shapes: Optional[List[TensorShape]] = None,
+      per_replica_batch_size: Optional[int] = None):
+    """Get and update the per replica output shapes from the input."""
+    per_replica_output_shapes = None
+    if per_replica_batch_size and per_replica_input_shapes is None:
+      logging.warning(
+          "per_replica_batch_size argument will be deprecated, please specify"
+          "all the input shapes using per_replica_input_shapes argument.")
+      per_replica_output_shapes = self._get_output_shapes_from_batch_size(
+          per_replica_batch_size)
+
+    # Update the input shapes if provided.
+    if per_replica_input_shapes is not None:
+      if isinstance(per_replica_input_shapes, int):
+        logging.warning(
+            "Passing batch size to per_replica_input_shapes argument will be"
+            " deprecated, please specify all the input shapes using"
+            " per_replica_input_shapes argument.")
+        per_replica_output_shapes = self._get_output_shapes_from_batch_size(
+            per_replica_input_shapes)
+      else:
+        nest.assert_same_structure(
+            nest.flatten(per_replica_input_shapes),
+            nest.flatten(self._feature_config))
+
+        # Convert the nested structure to list.
+        per_replica_input_shapes = nest.flatten(per_replica_input_shapes)
+
+        per_replica_output_shapes = self._get_output_shapes_from_input_shapes(
+            per_replica_input_shapes)
+
+    if per_replica_output_shapes is not None:
+
+      # Check the output shapes with existing output shapes setting.
+      self._check_output_shapes(per_replica_output_shapes)
+
+      # Update the output shapes with existing output shapes setting.
+      # This is necessary Because the output shapes might be missing from
+      # the feature config, the usr can set it:
+      #  1. calling the build method
+      #  2. output shapes auto detected when calling the dequeue method for
+      #     for the first time. The dequeue method will call build method
+      #     with the output shapes.
+      # Either these two situations will lead to an update to the existing
+      # output shapes.
+      self._update_output_shapes(per_replica_output_shapes)
+
+    # Check if the output shapes are fully defined. This is required in order
+    # to set them in the feature descriptor field of the tpu embedding config
+    # proto.
+    self._check_output_shapes_fully_defined()
+
+  def _get_output_shapes_from_input_shapes(
+      self, input_shapes: List[TensorShape]) -> List[TensorShape]:
+    """Get output shapes from the flattened input shapes list."""
+    output_shapes = []
+    for input_shape in input_shapes:
+      if input_shape.rank is None or input_shape.rank < 2:
+        raise ValueError(
+            "Received input tensor of shape {}. Rank must be 2 and above"
+            .format(input_shape))
+      output_shapes.append(input_shape[:-1])
+    return output_shapes
 
   @property
   def embedding_tables(
@@ -459,6 +552,9 @@ def _create_config_proto(
     """
 
     config_proto = tpu_embedding_configuration_pb2.TPUEmbeddingConfiguration()
+    # The tensor core batch size should be the GCD of all the input batch size.
+    tensor_core_batch_size = self._get_tensor_core_batch_size(
+        self._output_shapes)
 
     # There are several things that need to be computed here:
     # 1. Each table has a num_features, which corresponds to the number of
@@ -470,9 +566,9 @@ def _create_config_proto(
     #    extremely bad performance characteristics. The more separate
     #    optimization configurations we have, the worse the performance will be.
     num_features = {table: 0 for table in self._table_config}
-    for feature in nest.flatten(self._feature_config):
-      num_features[feature.table] += (1 if feature.max_sequence_length == 0
-                                      else feature.max_sequence_length)
+    for i, feature in enumerate(nest.flatten(self._feature_config)):
+      num_features[feature.table] += math_ops.reduce_prod(
+          self._output_shapes[i]) / tensor_core_batch_size
 
     # Map each callable dynamic learning rate to its in index in the list.
     learning_rate_index = {r: i for i, r in enumerate(
@@ -504,11 +600,27 @@ def _create_config_proto(
       # Use optimizer to handle the rest of the parameters.
       table.optimizer._set_optimization_parameters(parameters)  # pylint: disable=protected-access
 
+    table_to_id = {table: i for i, table in enumerate(self._table_config)}
+
+    # Set feature descriptor field in the config proto.
+    for feature, output_shape in zip(
+        nest.flatten(self._feature_config), self._output_shapes):
+      feature_descriptor = config_proto.feature_descriptor.add()
+
+      if feature.name:
+        feature_descriptor.name = feature.name
+
+      feature_descriptor.table_id = table_to_id[feature.table]
+      # The input shape of the feature is the actual shape of the input tensor
+      # except the last dimension because the last dimension will always be
+      # reduced.
+      feature_descriptor.input_shape.extend(output_shape.as_list())
+
     # Always set mode to training, we override the mode during enqueue.
     config_proto.mode = (
         tpu_embedding_configuration_pb2.TPUEmbeddingConfiguration.TRAINING)
 
-    config_proto.batch_size_per_tensor_core = self._batch_size
+    config_proto.batch_size_per_tensor_core = tensor_core_batch_size
     config_proto.num_hosts = self._strategy.extended.num_hosts
     config_proto.num_tensor_cores = self._strategy.num_replicas_in_sync
 
@@ -520,64 +632,6 @@ def _create_config_proto(
 
     return config_proto
 
-  def _compute_per_table_gradients(
-      self,
-      gradients
-  ) -> Dict[Text, List[core.Tensor]]:
-    """Computes a dict of lists of gradients, keyed by table name.
-
-    Args:
-      gradients: A nested structure of Tensors (and Nones) with the same
-        structure as the feature config.
-
-    Returns:
-      A dict of lists of tensors, keyed by the table names, containing the
-    gradients in the correct order with None gradients replaced by zeros.
-    """
-
-    nest.assert_same_structure(self._feature_config, gradients)
-
-    per_table_gradients = {table: [] for table in self._table_config}
-    for (path, gradient), feature in zip(
-        nest.flatten_with_joined_string_paths(gradients),
-        nest.flatten(self._feature_config)):
-      if gradient is not None and not isinstance(gradient, ops.Tensor):
-        raise ValueError(
-            f"When computing per-table gradients, found non-tensor type: "
-            f"{type(gradient)} at path {path}.")
-
-      # Expected tensor shape differs for sequence and non-sequence features.
-      if feature.max_sequence_length > 0:
-        shape = [self._batch_size, feature.max_sequence_length,
-                 feature.table.dim]
-      else:
-        shape = [self._batch_size, feature.table.dim]
-
-      if gradient is not None:
-        if gradient.shape != shape:
-          raise ValueError("Found gradient of shape {} at path {}. Expected "
-                           "shape {}.".format(gradient.shape, path, shape))
-
-        # We expand dims on non-sequence features so that all features are
-        # of rank 3 and we can concat on axis=1.
-        if len(shape) == 2:
-          gradient = array_ops.expand_dims(gradient, axis=1)
-      else:
-        # No gradient for this feature, since we must give a gradient for all
-        # features, pass in a zero tensor here. Note that this is not correct
-        # for all optimizers.
-        logging.warn("No gradient passed for feature %s, sending zero "
-                     "gradient. This may not be correct behavior for certain "
-                     "optimizers like Adam.", path)
-        # Create a shape to mimic the expand_dims above for non-sequence
-        # features.
-        if len(shape) == 2:
-          shape = [shape[0], 1, shape[1]]
-        gradient = array_ops.zeros(shape, dtype=dtypes.float32)
-      per_table_gradients[feature.table].append(gradient)
-
-    return per_table_gradients
-
   def apply_gradients(self, gradients, name: Optional[Text] = None):
     """Applies the gradient update to the embedding tables.
 
@@ -642,31 +696,39 @@ def tpu_step(tpu_features):
                          "object. Please either call enqueue first or manually "
                          "call the build method.")
 
-    # send_tpu_embedding_gradients requires per table gradient, if we only have
-    # one feature per table this isn't an issue. When multiple features share
-    # the same table, the order of the features in per table tensor returned by
-    # recv_tpu_embedding_activations matches the order in which they were passed
-    # to enqueue.
-    # In all three places, we use the fixed order given by nest.flatten to have
-    # a consistent feature order.
-
-    # First construct a dict of tensors one for each table.
-    per_table_gradients = self._compute_per_table_gradients(gradients)
-
-    # Now that we have a list of gradients we can compute a list of gradients
-    # in the fixed order of self._table_config which interleave the gradients of
-    # the individual features. We concat on axis 1 and then reshape into a 2d
-    # tensor. The send gradients op expects a tensor of shape
-    # [num_features*batch_size, dim] for each table.
-    interleaved_gradients = []
-    for table in self._table_config:
-      interleaved_gradients.append(array_ops.reshape(
-          array_ops.concat(per_table_gradients[table], axis=1),
-          [-1, table.dim]))
+    nest.assert_same_structure(self._feature_config, gradients)
+    updated_gradients = []
+    for (path, gradient), feature, output_shape in zip(
+        nest.flatten_with_joined_string_paths(gradients),
+        nest.flatten(self._feature_config), self._output_shapes):
+      full_output_shape = list(output_shape) + [feature.table.dim]
+      if gradient is not None and not isinstance(gradient, ops.Tensor):
+        raise ValueError(
+            f"found non-tensor type: {type(gradient)} at path {path}.")
+      if gradient is not None:
+        if gradient.shape != full_output_shape:
+          raise ValueError("Found gradient of shape {} at path {}. Expected "
+                           "shape {}.".format(gradient.shape, path,
+                                              full_output_shape))
+      else:
+        # No gradient for this feature, since we must give a gradient for all
+        # features, pass in a zero tensor here. Note that this is not correct
+        # for all optimizers.
+        logging.warning(
+            "No gradient passed for feature %s, sending zero "
+            "gradient. This may not be correct behavior for certain "
+            "optimizers like Adam.", path)
+        gradient = array_ops.zeros(full_output_shape, dtype=dtypes.float32)
+      # Some gradients can be passed with op which shape is not correctly set.
+      # This ensures that the shape of the gradient is correctly set.
+      updated_gradients.append(
+          array_ops.reshape(gradient, shape=gradient.shape))
     op = tpu_ops.send_tpu_embedding_gradients(
-        inputs=interleaved_gradients,
-        learning_rates=[math_ops.cast(fn(), dtype=dtypes.float32)
-                        for fn in self._dynamic_learning_rates],
+        inputs=updated_gradients,
+        learning_rates=[
+            math_ops.cast(fn(), dtype=dtypes.float32)
+            for fn in self._dynamic_learning_rates
+        ],
         config=self._config_proto.SerializeToString())
 
     # Apply the name tag to the op.
@@ -678,11 +740,14 @@ def dequeue(self, name: Optional[Text] = None):
 
     Returns a nested structure of `tf.Tensor` objects, matching the structure of
     the `feature_config` argument to the `TPUEmbedding` class. The output shape
-    of the tensors is `(batch_size, dim)`, where `batch_size` is the per core
-    batch size, `dim` is the dimension of the corresponding `TableConfig`. If
-    the feature's corresponding `FeatureConfig` has `max_sequence_length`
-    greater than 0, the output will be a sequence of shape
-    `(batch_size, max_sequence_length, dim)` instead.
+    of the tensors is `(*output_shape, dim)`, `dim` is the dimension of the
+    corresponding `TableConfig`. For output_shape, there are three places where
+    it can be set.
+      1. FeatureConfig provided in the __init__ function.
+      2. Per_replica_output_shapes by directly calling the build method
+           after initializing the tpu embedding class.
+      3. Auto detected from the shapes of the input feature.
+    The priority of these places is the exact same order.
 
     ```python
     strategy = tf.distribute.TPUStrategy(...)
@@ -735,56 +800,17 @@ def tpu_step(tpu_features):
                          "Please either call enqueue first or manually call "
                          "the build method.")
 
-    # The activations returned by this op are per table. So we must separate
-    # them out into per feature activations. The activations are interleaved:
-    # for each table, we expect a [num_features*batch_size, dim] tensor.
-    # E.g. we expect the slice [:num_features, :] to contain the lookups for the
-    # first example of all features using this table.
+    # The activations returned by this op are per feature.
     activations = tpu_ops.recv_tpu_embedding_activations(
-        num_outputs=len(self._table_config),
+        num_outputs=len(self._config_proto.feature_descriptor),
         config=self._config_proto.SerializeToString())
 
     # Apply the name tag to the op.
     if name is not None:
       _add_key_attr(activations[0].op, name)
 
-    # Compute the number of features for this  table.
-    num_features = {table: 0 for table in self._table_config}
-    for feature in nest.flatten(self._feature_config):
-      num_features[feature.table] += (1 if feature.max_sequence_length == 0
-                                      else feature.max_sequence_length)
-
-    # Activations are reshaped so that they are indexed by batch size and then
-    # by the 'feature' index within the batch. The final dimension should equal
-    # the dimension of the table.
-    table_to_activation = {
-        table: array_ops.reshape(activation,
-                                 [self._batch_size, num_features[table], -1])
-        for table, activation in zip(self._table_config, activations)}
-
-    # We process the features in the same order we enqueued them.
-    # For each feature we take the next slice of the activations, so need to
-    # track the activations and the current position we are in.
-    table_to_position = {table: 0 for table in self._table_config}
-
-    per_feature_activations = []
-    for feature in nest.flatten(self._feature_config):
-      activation = table_to_activation[feature.table]
-      feature_index = table_to_position[feature.table]
-      # We treat non-sequence and sequence features differently here as sequence
-      # features have rank 3 while non-sequence features have rank 2.
-      if feature.max_sequence_length == 0:
-        per_feature_activations.append(
-            activation[:, feature_index, :])
-        table_to_position[feature.table] += 1
-      else:
-        per_feature_activations.append(
-            activation[:, feature_index:(
-                feature_index+feature.max_sequence_length), :])
-        table_to_position[feature.table] += feature.max_sequence_length
-
     # Pack the list back into the same nested structure as the features.
-    return nest.pack_sequence_as(self._feature_config, per_feature_activations)
+    return nest.pack_sequence_as(self._feature_config, activations)
 
   def _create_variables_and_slots(
       self
@@ -904,12 +930,19 @@ def _add_data_for_tensor(self, tensor, weight, indices, values, weights,
           "Weight will always be 1 in this case.".format(path))
     # For tensors, there are no indices and no weights.
     indices.append(int_zeros)
-    values.append(math_ops.cast(tensor, dtypes.int32))
+    values.append(math_ops.cast(array_ops.reshape(tensor, [-1]), dtypes.int32))
     weights.append(float_zeros)
 
   def _add_data_for_sparse_tensor(self, tensor, weight, indices, values,
-                                  weights, int_zeros, float_zeros, path):
-    indices.append(math_ops.cast(tensor.indices, dtypes.int32))
+                                  weights, int_zeros, float_zeros, path,
+                                  feature):
+    sample_indices = math_ops.cast(tensor.indices, dtypes.int32)
+    if tensor.shape.rank == 2:
+      if not feature.output_shape and feature.max_sequence_length > 0:
+        # Add one dimension to the last axis.
+        sample_indices = array_ops.pad(
+            sample_indices, paddings=[[0, 0], [0, 1]])
+    indices.append(sample_indices)
     values.append(math_ops.cast(tensor.values, dtypes.int32))
     # If we have weights they must be a SparseTensor.
     if weight is not None:
@@ -921,9 +954,10 @@ def _add_data_for_sparse_tensor(self, tensor, weight, indices, values,
     else:
       weights.append(float_zeros)
 
-  def _add_data_for_ragged_tensor(self, tensor, weight, indices, values,
-                                  weights, int_zeros, float_zeros, path):
-    indices.append(math_ops.cast(tensor.row_splits, dtypes.int32))
+  def _add_data_for_ragged_tensor(self, tensor, weight, row_lengths, values,
+                                  weights, int_zeros, float_zeros, path,
+                                  feature):
+    row_lengths.append(math_ops.cast(tensor.row_lengths(), dtypes.int32))
     values.append(math_ops.cast(tensor.values, dtypes.int32))
     # If we have weights they must be a RaggedTensor.
     if weight is not None:
@@ -956,36 +990,14 @@ def _generate_enqueue_op(
     Returns:
       The enqueue op.
     """
-
-    # First we need to understand which op to use. This depends on if sparse
-    # or ragged tensors are in the flat_inputs.
-    sparse = False
-    ragged = False
-    for inp in flat_inputs:
-      if isinstance(inp, sparse_tensor.SparseTensor):
-        sparse = True
-      elif isinstance(inp, ragged_tensor.RaggedTensor):
-        ragged = True
-    if sparse and ragged:
-      raise ValueError(
-          "Found both SparseTensors and RaggedTensors in the input to the "
-          "enqueue operation. Please ensure that your data does not include "
-          "both SparseTensors and RaggedTensors. It is ok to have Tensors in "
-          "combination with one of the previous types.")
-
     # Combiners are per table, list in the same order as the table order.
     combiners = [table.combiner for table in self._table_config]
 
-    # Reverse mapping of self._table_config, so that we can lookup the table
-    # index.
-    table_to_id = {table: i for i, table in enumerate(self._table_config)}
-
     # These parallel arrays will be the inputs to the enqueue op.
-    indices = []  # sample_indices for sparse, sample_splits for ragged.
+    # sample_indices for sparse, row_lengths for ragged.
+    indices_or_row_lengths = []
     values = []
     weights = []
-    table_ids = []
-    max_sequence_lengths = []
 
     # We have to supply a empty/zero tensor in a list position where we don't
     # have data (e.g. indices for standard Tensor input, weight when no weight
@@ -1001,41 +1013,29 @@ def _generate_enqueue_op(
     # early.
     for inp, weight, (path, feature) in zip(
         flat_inputs, flat_weights, flat_features):
-      table_ids.append(table_to_id[feature.table])
-      max_sequence_lengths.append(feature.max_sequence_length)
       if isinstance(inp, ops.Tensor):
-        self._add_data_for_tensor(inp, weight, indices, values, weights,
-                                  int_zeros, float_zeros, path)
+        self._add_data_for_tensor(inp, weight, indices_or_row_lengths, values,
+                                  weights, int_zeros, float_zeros, path)
       elif isinstance(inp, sparse_tensor.SparseTensor):
-        self._add_data_for_sparse_tensor(inp, weight, indices, values, weights,
-                                         int_zeros, float_zeros, path)
+        self._add_data_for_sparse_tensor(inp, weight, indices_or_row_lengths,
+                                         values, weights, int_zeros,
+                                         float_zeros, path, feature)
       elif isinstance(inp, ragged_tensor.RaggedTensor):
-        self._add_data_for_ragged_tensor(inp, weight, indices, values, weights,
-                                         int_zeros, float_zeros, path)
+        self._add_data_for_ragged_tensor(inp, weight, indices_or_row_lengths,
+                                         values, weights, int_zeros,
+                                         float_zeros, path, feature)
       else:
         raise ValueError("Input {} is of unknown type {}. Please only pass "
                          "Tensor, SparseTensor or RaggedTensor as input to "
                          "enqueue.".format(path, type(inp)))
 
-    if ragged:
-      return tpu_ops.enqueue_tpu_embedding_ragged_tensor_batch(
-          sample_splits=indices,
-          embedding_indices=values,
-          aggregation_weights=weights,
-          mode_override=mode_override,
-          device_ordinal=device_ordinal,
-          combiners=combiners,
-          table_ids=table_ids,
-          max_sequence_lengths=max_sequence_lengths)
-    return tpu_ops.enqueue_tpu_embedding_sparse_tensor_batch(
-        sample_indices=indices,
+    return tpu_ops.enqueue_tpu_embedding_arbitrary_tensor_batch(
+        sample_indices_or_row_lengths=indices_or_row_lengths,
         embedding_indices=values,
         aggregation_weights=weights,
         mode_override=mode_override,
         device_ordinal=device_ordinal,
-        combiners=combiners,
-        table_ids=table_ids,
-        max_sequence_lengths=max_sequence_lengths)
+        combiners=combiners)
 
   def _raise_error_for_incorrect_control_flow_context(self):
     """Raises an error if we are not in the TPUReplicateContext."""
@@ -1135,13 +1135,36 @@ def enqueue(
     """Enqueues id tensors for embedding lookup.
 
     This function enqueues a structure of features to be looked up in the
-    embedding tables. We expect that the batch size of each of the tensors in
-    features matches the per core batch size. This will automatically happen if
-    your input dataset is batched to the global batch size and you use
+    embedding tables. We expect that the input shapes of each of the tensors in
+    features matches the output shapes set via FeatureConfig or build method
+    (if any). the output shapes will be auto detected based on the input shapes
+    with the max_sequence_length or output shape setting in the FeatureConfig.
+    Note that the output shapes is based on per replica batch size.
+    If your input dataset is batched to the global batch size and you use
     `tf.distribute.TPUStrategy`'s `experimental_distribute_dataset`
     or if you use `distribute_datasets_from_function` and batch
     to the per core batch size computed by the context passed to your input
-    function.
+    function, the output shapes should match automatically.
+
+    The auto detected the output shapes:
+      1. For dense tensor, make sure the tensor has last dimension as 1. The
+         output shape will be the input shape excluding the last dimension.
+      2. For sparse tensor, make sure the tensor has rank 2 and above.
+           a. If feature config has max_sequence_length equals 0 or output shape
+              set (the max_sequence_length setting will be ignored), the
+              output shape will be the input shape excluding the last dimension.
+           b. Otherwize if the tensor is rank 2, the output shape will be input
+              shape  with last dimension set as max_sequence_length. If the
+              tensor is above rank 2, the output shape will be the input shape
+              excluding the last dimension and the last dimension of the output
+              shape will be set to max_sequence_length.
+      3. For ragged tensor, make sure the tensor has rank 2.
+           a. If feature config has max_sequence_length equals 0 or output shape
+              set (the max_sequence_length setting will be ignored), the
+              output shape will be the input shape excluding the last dimension.
+           b. Otherwise, the output shape will be the input shape excluding the
+              last dimension and the last dimension of the output shape will be
+              set to max_sequence_length.
 
     ```python
     strategy = tf.distribute.TPUStrategy(...)
@@ -1226,7 +1249,8 @@ def per_core_enqueue(ctx):
         directly taken from the args of the `strategy.run` call. Also if
         the size of any sequence in `features` does not match corresponding
         sequence in `feature_config`. Similarly for `weights`, if not `None`.
-        If batch size of features is unequal or different from a previous call.
+        If input shapes of features is unequal or different from a previous
+        call.
       RuntimeError: When called inside a strategy.run call and inside XLA
         control flow. If batch_size is not able to be determined and build was
         not called.
@@ -1240,29 +1264,22 @@ def per_core_enqueue(ctx):
 
     in_tpu_context = self._raise_error_for_incorrect_control_flow_context()
 
-    if not self._verify_batch_size_on_enqueue:
-      if not self._batch_size or not self._built:
+    nest.assert_same_structure(self._feature_config, features)
+
+    if not self._verify_output_shapes_on_enqueue:
+      if not self._output_shapes or not self._built:
         raise ValueError(
-            "Configured not to check batch size on each enqueue() call; please "
-            "ensure build() was called with global batch size to initialize "
+            "Configured not to check output shapes on each enqueue() call; please "
+            "ensure build() was called with output shapes to initialize "
             "the TPU for embeddings.")
     else:
-      # Should we also get batch_size from weights if they exist?
-      # Since features is assumed to be batched at the per replica batch size
-      # the returned batch size here is per replica an not global.
-      batch_size = self._get_batch_size(features, in_tpu_context)
-      if batch_size is None and not self._built:
-        raise RuntimeError("Unable to determine batch size from input features."
-                           "Please call build() with global batch size to "
-                           "initialize the TPU for embeddings.")
-      if batch_size is not None:
-        self._maybe_build(batch_size)
-        if self._batch_size != batch_size:
-          raise ValueError("Multiple calls to enqueue with different batch "
-                           "sizes {} and {}.".format(self._batch_size,
-                                                     batch_size))
+      input_shapes = self._get_input_shapes(features, in_tpu_context)
 
-    nest.assert_same_structure(self._feature_config, features)
+      self._maybe_build(input_shapes)
+      # If is already built, we still need to check if the output shapes matches
+      # with the previous ones.
+      self._check_output_shapes(
+          self._get_output_shapes_from_input_shapes(input_shapes))
 
     flat_inputs = nest.flatten(features)
     flat_weights = [None] * len(flat_inputs)
@@ -1287,7 +1304,6 @@ def generate_enqueue_ops():
         mode_override = array_ops.where_v2(training,
                                            constant_op.constant("train"),
                                            constant_op.constant("inference"))
-
         # Device ordinal is -1 here, a later rewrite will fix this once the op
         # is expanded by outside compilation.
         enqueue_op = self._generate_enqueue_op(
@@ -1321,6 +1337,7 @@ def generate_enqueue_ops():
         # the device ordinal is the last number
         device_ordinal = (
             tf_device.DeviceSpec.from_string(tpu_device).device_index)
+
         with ops.device(device_util.get_host_for_device(tpu_device)):
           enqueue_op = self._generate_enqueue_op(
               replica_inputs, replica_weights, flat_features,
@@ -1337,6 +1354,7 @@ def generate_enqueue_ops():
       if device_spec.device_type != "TPU":
         raise ValueError(
             "Non-TPU device {} passed to enqueue.".format(device))
+
       with ops.device(device_util.get_host_for_device(device)):
         enqueue_op = self._generate_enqueue_op(
             flat_inputs, flat_weights, flat_features,
@@ -1348,33 +1366,159 @@ def generate_enqueue_ops():
           _add_key_attr(enqueue_op, name)
         ops.get_default_graph().control_outputs.append(enqueue_op)
 
-  def _get_batch_size(self, tensors, in_tpu_context: bool):
-    """Gets the batch size from a nested structure of features."""
-    batch_size = None
-    for path, maybe_tensor in nest.flatten_with_joined_string_paths(tensors):
-      tensor_list = []
+  def _get_input_shapes(self, tensors,
+                        in_tpu_context: bool) -> List[TensorShape]:
+    """Get the input shapes from the input tensor."""
+    input_shapes = []
+    for (path, maybe_tensor), feature in zip(
+        nest.flatten_with_joined_string_paths(tensors),
+        nest.flatten(self._feature_config)):
       if not in_tpu_context:
-        # if we are not in a context, then this is PerReplica and we need to
-        # check each replica's batch size.
-        for replica_id in range(self._strategy.num_replicas_in_sync):
-          tensor_list.append(distribute_utils.select_replica(replica_id,
-                                                             maybe_tensor))
+        tensor = distribute_utils.select_replica(0, maybe_tensor)
       else:
-        tensor_list = [maybe_tensor]
-
-      for tensor in tensor_list:
-        if tensor.shape.rank < 1:
+        tensor = maybe_tensor
+
+      if isinstance(tensor, ops.Tensor):
+        input_shapes.append(
+            self._get_input_shape_for_tensor(tensor, feature, path))
+      elif isinstance(tensor, sparse_tensor.SparseTensor):
+        input_shapes.append(
+            self._get_input_shape_for_sparse_tensor(tensor, feature, path))
+      elif isinstance(tensor, ragged_tensor.RaggedTensor):
+        input_shapes.append(
+            self._get_input_shape_for_ragged_tensor(tensor, feature, path))
+    return input_shapes
+
+  def _get_input_shape_for_tensor(self, tensor, feature, path) -> TensorShape:
+    """Get the input shape for the dense tensor."""
+    shape = tensor.shape.as_list()
+    if len(shape) < 1:
+      raise ValueError("Only rank 1 and above dense tensor is supported,"
+                       " find rank {} sparse tensor for input {}".format(
+                           len(shape), path))
+    if shape[-1] != 1:
+      return TensorShape(shape + [1])
+    return TensorShape(shape)
+
+  def _get_input_shape_for_sparse_tensor(self, tensor, feature,
+                                         path) -> TensorShape:
+    """Get the input shape for the sparse tensor."""
+    shape = tensor.shape.as_list()
+    # Only 2 and above rank sparse tensor is supported.
+    if len(shape) < 2:
+      raise ValueError("Only rank 2 and above sparse tensor is supported,"
+                       " find rank {} sparse tensor for input {}".format(
+                           len(shape), path))
+    if not feature.output_shape and feature.max_sequence_length > 0:
+      # If the max_sequence_length is set and the output shape for FeatureConfig
+      # is not set, we modify the shape of the input feature. Only rank 2
+      # feature output shape is modified
+      if len(shape) == 2:
+        # If the sparse tensor is 2D and max_sequence_length is set,
+        # we need to add one dimension to the input feature.
+        shape.insert(len(shape) - 1, feature.max_sequence_length)
+
+    return TensorShape(shape)
+
+  def _get_input_shape_for_ragged_tensor(self, tensor, feature,
+                                         path) -> TensorShape:
+    """Get the input shape for the ragged tensor."""
+    shape = tensor.shape.as_list()
+    # Only rank 2 ragged tensor is supported.
+    if len(shape) != 2:
+      raise ValueError("Only rank 2 ragged tensor is supported,"
+                       " find rank {} ragged tensor for input {}".format(
+                           len(shape), path))
+    if not feature.output_shape and feature.max_sequence_length > 0:
+      # If the max_sequence_length is set and the output shape for FeatureConfig
+      # is not set, add the sequence length as second last dimension of
+      # the ragged tensor.
+      shape.insert(len(shape) - 1, feature.max_sequence_length)
+
+    return TensorShape(shape)
+
+  def _get_tensor_core_batch_size(self, output_shapes):
+    """Get the tensor core batch size based on the output shapes."""
+    tensor_core_batch_size = math_ops.reduce_prod(output_shapes[0])
+    for output_shape in output_shapes[1:]:
+      tensor_core_batch_size = numpy_ops.gcd(tensor_core_batch_size,
+                                             math_ops.reduce_prod(output_shape))
+    return tensor_core_batch_size
+
+  def _update_output_shapes(self, incoming_output_shapes: List[TensorShape]):
+    """Update the existing output shapes based on the new output shapes.
+
+    The existing output shapes always have higher piority than the new incoming
+    output shapes.
+    Args:
+      incoming_output_shapes: nested structure of TensorShape to override the
+        existing output shapes.
+    """
+    nest.assert_same_structure(self._output_shapes, incoming_output_shapes)
+    updated_output_shapes = []
+    for old_output_shape, incoming_output_shape in zip(self._output_shapes,
+                                                       incoming_output_shapes):
+      if old_output_shape:
+        updated_output_shapes.append(old_output_shape)
+      else:
+        updated_output_shapes.append(incoming_output_shape)
+    self._output_shapes = updated_output_shapes
+
+  def _check_output_shapes(self, incoming_output_shapes: List[TensorShape]):
+    """Check the incoming output shapes against the output shapes stored."""
+    # The incoming output shape should have the same structure with the existing
+    # output shapes.
+    nest.assert_same_structure(self._output_shapes, incoming_output_shapes)
+
+    for (path, feature), old_output_shape, incoming_output_shape in zip(
+        nest.flatten_with_joined_string_paths(self._feature_config),
+        self._output_shapes, incoming_output_shapes):
+      # First check if both shapes are not None.
+      if old_output_shape and incoming_output_shape:
+        # We skip the check when the incoming output shape is rank 1 or 2 and
+        # rank of the old output shape is larger. This can happen for
+        # (sequence) ragged tensor, we push the check down to the enqueue op.
+        if (len(incoming_output_shape) == 1 or len(incoming_output_shape)
+            == 2) and len(old_output_shape) > len(incoming_output_shape):
+          continue
+        if len(old_output_shape) != len(
+            incoming_output_shape) or not self._is_tensor_shape_match(
+                old_output_shape, incoming_output_shape):
           raise ValueError(
-              "Input {} has rank 0, rank must be at least 1.".format(path))
-        shape = tensor.shape.as_list()
-        if shape[0] is not None:
-          if batch_size is None:
-            batch_size = shape[0]
-          elif batch_size != shape[0]:
-            raise ValueError("Found multiple batch sizes {} and {}. All inputs "
-                             "must have the same batch dimensions size.".format(
-                                 batch_size, shape[0]))
-    return batch_size
+              f"Inconsistent shape founded for input feature {path}, "
+              f"Output shape is set to be {old_output_shape}, "
+              f"But got incoming output shape {incoming_output_shape}")
+
+  def _check_output_shapes_fully_defined(self):
+    """Check if the output shape is fully defined."""
+    for (path, feature), output_shape in zip(
+        nest.flatten_with_joined_string_paths(self._feature_config),
+        self._output_shapes):
+      if not output_shape.is_fully_defined():
+        raise ValueError(
+            f"Input Feature {path} has output shape set as"
+            f"{output_shape} which is not fully defined. "
+            "Please specify the fully defined shape in either FeatureConfig"
+            "or for the build method.")
+
+  def _is_tensor_shape_match(self, shape_a: TensorShape,
+                             shape_b: TensorShape) -> bool:
+    """Check if shape b matches with shape a."""
+    for s_a, s_b in zip(shape_a.as_list(), shape_b.as_list()):
+      if s_a and s_b and s_a != s_b:
+        return False
+    return True
+
+  def _get_output_shapes_from_batch_size(self, per_replica_batch_size):
+    """Get the output shapes from the batch size."""
+    output_shapes = []
+    for feature in nest.flatten(self._feature_config):
+      if not feature.output_shape and feature.max_sequence_length > 0:
+        output_shapes.append(
+            TensorShape([per_replica_batch_size, feature.max_sequence_length]))
+      else:
+        output_shapes.append(TensorShape(per_replica_batch_size))
+    return output_shapes
 
 
 @def_function.function
diff --git a/tensorflow/python/tpu/tpu_embedding_v2_correctness_test.py b/tensorflow/python/tpu/tpu_embedding_v2_correctness_test.py
index df1f232083d037..dabbd69e494041 100644
--- a/tensorflow/python/tpu/tpu_embedding_v2_correctness_test.py
+++ b/tensorflow/python/tpu/tpu_embedding_v2_correctness_test.py
@@ -32,6 +32,7 @@
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework.tensor_shape import TensorShape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import init_ops_v2
@@ -120,6 +121,33 @@ def setUp(self):
     self.feature_friends_row_lengths = [1, 3, 1, 3]
     self.resolver = None
 
+    # Basically we are expand the dims of the old feature by 1 and repeat
+    # batch size times for the first dimension.
+    def create_hight_dimensional_indices(indices):
+      indices = np.array(indices, dtype=np.int32)
+      batch_size_index = np.repeat(
+          np.arange(self.data_batch_size), len(indices)).reshape(-1, 1)
+      repeated_indices = np.tile(indices, (self.data_batch_size, 1))
+      return np.concatenate([batch_size_index, repeated_indices], axis=1)
+
+    # Create high dimensional features with shape(4, 4, 2)
+    self.feature_watched_indices_high_dimensional = create_hight_dimensional_indices(
+        self.feature_watched_indices)
+    self.feature_watched_values_high_dimensional = self.feature_watched_values * self.data_batch_size
+    self.feature_watched_row_lengths_high_dimensional = self.feature_watched_row_lengths * self.data_batch_size
+
+    # Create high dimensional features with shape(4, 4, 2)
+    self.feature_favorited_indices_high_dimensional = create_hight_dimensional_indices(
+        self.feature_favorited_indices)
+    self.feature_favorited_values_high_dimensional = self.feature_favorited_values * self.data_batch_size
+    self.feature_favorited_row_lengths_high_dimensional = self.feature_favorited_row_lengths * self.data_batch_size
+
+    # Create high dimensional features with shape(4, 4, 3)
+    self.feature_friends_indices_high_dimensional = create_hight_dimensional_indices(
+        self.feature_friends_indices)
+    self.feature_friends_values_high_dimensional = self.feature_friends_values * self.data_batch_size
+    self.feature_friends_row_lengths_high_dimensional = self.feature_friends_row_lengths * self.data_batch_size
+
   def _get_strategy(self):
     self.resolver = tpu_cluster_resolver.TPUClusterResolver(
         tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project)
@@ -145,16 +173,38 @@ def _create_strategy_and_mid_level(self, optimizer_name):
 
     return strategy, mid_level_api, optimizer
 
-  @parameterized.parameters(*itertools.product(
-      ['sgd', 'adagrad', 'adam', 'ftrl'], [True, False], [True, False]))
-  def test_embedding(self, optimizer_name, training, sparse):
+  @parameterized.parameters(
+      *itertools.product(['sgd', 'adagrad', 'adam', 'ftrl'], [True, False],
+                         [True, False], [True, False]))
+  def test_embedding(self, optimizer_name, training, sparse,
+                     is_high_dimensional):
     strategy, mid_level_api, optimizer = (
         self._create_strategy_and_mid_level(optimizer_name))
 
     if sparse:
-      dataset = self._create_sparse_dataset(strategy)
+      if is_high_dimensional:
+        dataset = self._create_high_dimensional_sparse_dataset(strategy)
+      else:
+        dataset = self._create_sparse_dataset(strategy)
     else:
-      dataset = self._create_ragged_dataset(strategy)
+      if is_high_dimensional:
+        dataset = self._create_high_dimensional_sparse_dataset(strategy)
+      else:
+        dataset = self._create_ragged_dataset(strategy)
+
+    if is_high_dimensional:
+      if sparse:
+        mid_level_api.build([
+            TensorShape([self.batch_size, self.data_batch_size, 2]),
+            TensorShape([self.batch_size, self.data_batch_size, 2]),
+            TensorShape([self.batch_size, self.data_batch_size, 3]),
+        ])
+      else:
+        mid_level_api.build([
+            TensorShape([self.batch_size, self.data_batch_size, None]),
+            TensorShape([self.batch_size, self.data_batch_size, None]),
+            TensorShape([self.batch_size, self.data_batch_size, None]),
+        ])
 
     dist = strategy.experimental_distribute_dataset(
         dataset,
@@ -193,12 +243,16 @@ def step():
     mid_level_api._retrieve_variables()
 
     # Compute sparse tensors for global batch.
-    input_data = next(iter(self._create_sparse_dataset(strategy)))
+    if is_high_dimensional:
+      input_data = next(
+          iter(self._create_high_dimensional_sparse_dataset(strategy)))
+    else:
+      input_data = next(iter(self._create_sparse_dataset(strategy)))
 
     # Check results.
     self._check_results(strategy, shard_out_val, training, input_data,
-                        mid_level_api._variables,
-                        optimizer)
+                        mid_level_api._variables, optimizer,
+                        is_high_dimensional)
 
   def _create_mid_level(self, optimizer=None):
     # Create `TPUEmbedding` object.
@@ -245,6 +299,69 @@ def _create_sparse_dataset(self, strategy, include_weights=False, weight=0.5):
     return dataset.unbatch().repeat().batch(
         self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True)
 
+  def _create_high_dimensional_sparse_dataset(self,
+                                              strategy,
+                                              include_weights=False,
+                                              weight=0.5):
+    sparse_features = (
+        sparse_tensor.SparseTensor(
+            indices=self.feature_watched_indices_high_dimensional,
+            values=self.feature_watched_values_high_dimensional,
+            dense_shape=[self.data_batch_size, self.data_batch_size, 2]),
+        sparse_tensor.SparseTensor(
+            indices=self.feature_favorited_indices_high_dimensional,
+            values=self.feature_favorited_values_high_dimensional,
+            dense_shape=[self.data_batch_size, self.data_batch_size, 2]),
+        sparse_tensor.SparseTensor(
+            indices=self.feature_friends_indices_high_dimensional,
+            values=self.feature_friends_values_high_dimensional,
+            dense_shape=[self.data_batch_size, self.data_batch_size, 3]))
+    if include_weights:
+      weights = []
+      for sparse in sparse_features:
+        values = (
+            array_ops.ones_like(sparse.values, dtype=dtypes.float32) * weight)
+        weights.append(
+            sparse_tensor.SparseTensor(
+                indices=sparse.indices,
+                values=values,
+                dense_shape=sparse.dense_shape))
+      sparse_features = (sparse_features, tuple(weights))
+
+    dataset = dataset_ops.DatasetV2.from_tensors(sparse_features)
+    # Data is batched to self.data_batch_size, rebatch to global batch size.
+    return dataset.unbatch().repeat().batch(
+        self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True)
+
+  def _create_high_dimensional_ragged_dataset(self,
+                                              strategy,
+                                              include_weights=False,
+                                              weight=0.5):
+    ragged_features = (
+        ragged_tensor.RaggedTensor(
+            row_lengths=self.feature_watched_row_lengths_high_dimensional,
+            values=self.feature_watched_values_high_dimensional),
+        ragged_tensor.RaggedTensor(
+            row_lengths=self.feature_favorited_row_lengths_high_dimensional,
+            values=self.feature_favorited_values_high_dimensional),
+        ragged_tensor.RaggedTensor(
+            row_lengths=self.feature_friends_row_lengths_high_dimensional,
+            values=self.feature_friends_values_high_dimensional))
+    if include_weights:
+      weights = []
+      for ragged in ragged_features:
+        values = (
+            array_ops.ones_like(ragged.values, dtype=dtypes.float32) * weight)
+        weights.append(
+            ragged_tensor.RaggedTensor(
+                row_lengths=ragged.row_lengths(), values=values))
+      ragged_features = (ragged_features, tuple(weights))
+
+    dataset = dataset_ops.DatasetV2.from_tensors(ragged_features)
+    # Data is batched to self.data_batch_size, rebatch to global batch size.
+    return dataset.unbatch().repeat().batch(
+        self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True)
+
   def _create_ragged_dataset(self, strategy, include_weights=False, weight=0.5):
     # Create dataset for enqueue operation
     sparse_features = self._create_sparse_data(include_weights, weight)
@@ -276,8 +393,39 @@ def input_fn(ctx):
 
     return input_fn
 
+  def _create_high_dimensional_dense_input_fn(self,
+                                              strategy,
+                                              include_weights=False,
+                                              weight=0.5):
+
+    def input_fn(ctx):
+      del ctx
+      dense_size = self.data_batch_size * 2
+      features = (
+          constant_op.constant(
+              self.feature_watched_values_high_dimensional[:dense_size],
+              shape=(self.data_batch_size, 2),
+              dtype=dtypes.int32),
+          constant_op.constant(
+              self.feature_favorited_values_high_dimensional[:dense_size],
+              shape=(self.data_batch_size, 2),
+              dtype=dtypes.int32),
+          constant_op.constant(
+              self.feature_friends_values_high_dimensional[:dense_size],
+              shape=(self.data_batch_size, 2),
+              dtype=dtypes.int32))
+      if include_weights:
+        weights = [
+            array_ops.ones_like(t, dtype=dtypes.float32) * weight
+            for t in features
+        ]
+        features = (features, tuple(weights))
+      return dataset_ops.DatasetV2.from_tensors(features).repeat()
+
+    return input_fn
+
   def _check_results(self, strategy, shard_out_val, training, input_data,
-                     table_to_variable, optimizer):
+                     table_to_variable, optimizer, is_high_dimensional):
     num_replicas = strategy.num_replicas_in_sync
 
     # Unpack the values `strategy.run()` returns.
@@ -305,16 +453,21 @@ def _check_results(self, strategy, shard_out_val, training, input_data,
     # activation_watched_gold0 and activation_favorited_gold0.
     # For favorited it is the same but in the opposite order.
     activation_watched_gold = np.concatenate(
-        (np.concatenate((np.expand_dims(activation_watched_gold0, axis=0),) *
-                        (num_replicas // 2)),
-         np.concatenate((np.expand_dims(activation_favorited_gold0, axis=0),) *
-                        (num_replicas // 2))),
-        axis=1).reshape([self.batch_size * num_replicas, 4])
+        (activation_watched_gold0, activation_favorited_gold0))
     activation_favorited_gold = np.concatenate(
-        (activation_watched_gold[self.batch_size:,],
-         activation_watched_gold[0:self.batch_size,]))
+        (activation_favorited_gold0, activation_watched_gold0))
     activation_friends_gold = np.concatenate(
-        (activation_friends_gold0,) * num_replicas)
+        (activation_friends_gold0, activation_friends_gold0))
+
+    if is_high_dimensional:
+      activation_watched_gold = np.stack([activation_watched_gold] *
+                                         self.data_batch_size)
+
+      activation_favorited_gold = np.stack([activation_favorited_gold] *
+                                           self.data_batch_size)
+
+      activation_friends_gold = np.stack([activation_friends_gold] *
+                                         self.data_batch_size)
 
     loss_gold = [loss_gold0] * num_replicas
 
@@ -329,8 +482,10 @@ def _check_results(self, strategy, shard_out_val, training, input_data,
         np.reshape(self.embedding_values, [8, 4]))
     embedding_table_user_before = np.copy(
         np.reshape(self.embedding_values, [16, 2]))
-
-    global_batch_size = self.batch_size * num_replicas
+    if is_high_dimensional:
+      global_batch_size = self.batch_size * self.data_batch_size * num_replicas
+    else:
+      global_batch_size = self.batch_size * num_replicas
     if training:
       gradient_wrt_watched_gold = (2 * activation_watched_gold /
                                    global_batch_size)
@@ -341,19 +496,20 @@ def _check_results(self, strategy, shard_out_val, training, input_data,
 
       # Calculate gradients wrt embedding tables.
       gradients_wrt_user = (
-          _compute_gradients_wrt_embedding_table(
-              global_batch_size, gradient_wrt_friends_gold,
-              embedding_table_user_before, input_data[2].indices.numpy(),
-              input_data[2].values.numpy(), self.table_user.combiner))
+          _compute_gradients_wrt_embedding_table(gradient_wrt_friends_gold,
+                                                 embedding_table_user_before,
+                                                 input_data[2].indices.numpy(),
+                                                 input_data[2].values.numpy(),
+                                                 self.table_user.combiner))
       gradients_wrt_video = (
           _compute_gradients_wrt_embedding_table(
-              global_batch_size, gradient_wrt_favorited_gold,
-              embedding_table_video_before, input_data[1].indices.numpy(),
-              input_data[1].values.numpy(), self.table_video.combiner) +
+              gradient_wrt_favorited_gold, embedding_table_video_before,
+              input_data[1].indices.numpy(), input_data[1].values.numpy(),
+              self.table_video.combiner) +
           _compute_gradients_wrt_embedding_table(
-              global_batch_size, gradient_wrt_watched_gold,
-              embedding_table_video_before, input_data[0].indices.numpy(),
-              input_data[0].values.numpy(), self.table_video.combiner))
+              gradient_wrt_watched_gold, embedding_table_video_before,
+              input_data[0].indices.numpy(), input_data[0].values.numpy(),
+              self.table_video.combiner))
 
       self._check_embedding_and_slot_variables(embedding_table_user_before,
                                                gradients_wrt_user,
@@ -460,10 +616,14 @@ def select_replica(x):
       return x[replica_id].numpy()
     return nest.map_structure(select_replica, structured)
 
-  def test_dense_lookup(self):
+  @parameterized.parameters([True, False])
+  def test_dense_lookup(self, is_high_dimensional):
     strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
 
-    input_fn = self._create_dense_input_fn(strategy)
+    if is_high_dimensional:
+      input_fn = self._create_high_dimensional_dense_input_fn(strategy)
+    else:
+      input_fn = self._create_dense_input_fn(strategy)
     dist = strategy.distribute_datasets_from_function(
         input_fn,
         options=distribute_lib.InputOptions(experimental_fetch_to_device=False))
@@ -487,6 +647,15 @@ def step():
     golden = ((numpy_videos[self.feature_watched_values[-2:]],
                numpy_videos[self.feature_favorited_values[-2:]],
                numpy_users[self.feature_friends_values[-2:]]))
+    if is_high_dimensional:
+      dense_size = self.data_batch_size * 2
+      golden = ((numpy_videos[
+          self.feature_watched_values_high_dimensional[:dense_size]].reshape(
+              self.data_batch_size, 2, -1), numpy_videos[
+                  self.feature_favorited_values_high_dimensional[:dense_size]]
+                 .reshape(self.data_batch_size, 2, -1), numpy_users[
+                     self.feature_friends_values_high_dimensional[:dense_size]]
+                 .reshape(self.data_batch_size, 2, -1)))
     self.assertAllClose(shard0, golden)
 
   @parameterized.parameters([True, False])
@@ -581,17 +750,12 @@ def tpu_fn():
     self.assertAllClose(golden, after_update)
 
 
-def _compute_gradients_wrt_embedding_table(batch_size,
-                                           gradient_wrt_activation,
-                                           embedding_table,
-                                           feature_indices,
-                                           feature_values,
-                                           combiner,
-                                           max_sequence_length=0):
+def _compute_gradients_wrt_embedding_table(gradient_wrt_activation,
+                                           embedding_table, feature_indices,
+                                           feature_values, combiner):
   """Compute gradients wrt embedding_table.
 
   Args:
-    batch_size: `int`, batch size.
     gradient_wrt_activation: `np.array` with shape `batch_size` by
       embedding `dimension`.
     embedding_table: `np.array` with shape `vocabulary_size` by embedding
@@ -599,7 +763,6 @@ def _compute_gradients_wrt_embedding_table(batch_size,
     feature_indices: `indices` as used to construct `SparseTensor`.
     feature_values: `values` as used to construct `SparseTensor`.
     combiner: `String`, 'mean' or 'sum'.
-    max_sequence_length: If non-zero, a sequence feature with the given length.
 
   Returns:
     Gradients wrt `embedding_table`, an `np.array`s with shape
@@ -611,23 +774,17 @@ def _compute_gradients_wrt_embedding_table(batch_size,
   """
   if combiner not in ('mean', 'sum'):
     raise ValueError('`combiner` must be mean or sum; got {}.'.format(combiner))
-  grads = []
-  for i in range(batch_size):
-    grad = np.zeros_like(embedding_table)
-    count = 0
-    for (batch_i, seq_index), vocabulary_id in zip(feature_indices,
-                                                   feature_values):
-      if batch_i == i:
-        count += 1
-        if max_sequence_length > 0:
-          if seq_index < max_sequence_length:
-            grad[vocabulary_id, :] += gradient_wrt_activation[i, seq_index, :]
-        else:
-          grad[vocabulary_id, :] += gradient_wrt_activation[i, :]
-    if combiner == 'mean' and not max_sequence_length:
-      grad = grad / count
-    grads.append(grad)
-  return np.stack(grads)
+  grads_shape = gradient_wrt_activation.shape[:-1] + embedding_table.shape
+  grads = np.zeros(shape=grads_shape)
+  count = np.zeros(shape=grads_shape)
+  for feature_indice, vocabulary_id in zip(feature_indices, feature_values):
+    batch_index = tuple(feature_indice[:-1])
+    grads[batch_index][vocabulary_id] += gradient_wrt_activation[batch_index]
+    count[batch_index] += 1
+  count[count == 0] = 1
+  if combiner == 'mean':
+    grads = grads / count
+  return np.reshape(grads, (-1, *embedding_table.shape))
 
 
 def _unpack(strategy, per_replica_output):
@@ -642,21 +799,15 @@ def _get_total_loss_tensor(activations):
     losses.append(
         math_ops.reduce_mean(
             math_ops.reduce_sum(
-                gen_math_ops.squared_difference(activation, 0), 1)))
+                gen_math_ops.squared_difference(activation, 0), axis=-1)))
   total_loss = array_ops.expand_dims_v2(sum(losses), 0)
   return total_loss
 
 
 def _compute_loss(activation_watched, activation_favorited, activation_friends):
-  watched_loss = np.mean(np.sum(activation_watched**2, axis=1))
-  if len(activation_favorited.shape) == 2:
-    favorited_loss = np.mean(np.sum(activation_favorited**2, axis=1))
-  else:
-    favorited_loss = np.mean(np.sum(activation_favorited**2, axis=(1, 2)))
-  if len(activation_friends.shape) == 2:
-    friends_loss = np.mean(np.sum(activation_friends**2, axis=1))
-  else:
-    friends_loss = np.mean(np.sum(activation_friends**2, axis=(1, 2)))
+  watched_loss = np.mean(np.sum(activation_watched**2, axis=-1))
+  favorited_loss = np.mean(np.sum(activation_favorited**2, axis=-1))
+  friends_loss = np.mean(np.sum(activation_friends**2, axis=-1))
   loss = watched_loss + favorited_loss + friends_loss
   return loss
 
diff --git a/tensorflow/python/tpu/tpu_embedding_v2_test.py b/tensorflow/python/tpu/tpu_embedding_v2_test.py
index d7249b9d993db2..d852f06d620ab8 100644
--- a/tensorflow/python/tpu/tpu_embedding_v2_test.py
+++ b/tensorflow/python/tpu/tpu_embedding_v2_test.py
@@ -36,6 +36,7 @@
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_spec
+from tensorflow.python.framework.tensor_shape import TensorShape
 from tensorflow.python.module import module
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_math_ops
@@ -458,7 +459,11 @@ def test_apply():
 
   def test_pass_none_to_apply_gradients(self):
     strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
-    mid_level_api.build(self.batch_size)
+    mid_level_api.build([
+        TensorShape((self.batch_size, 2)),
+        TensorShape((self.batch_size, 2)),
+        TensorShape((self.batch_size, 3))
+    ])
     dataset = self._create_sparse_dataset(strategy)
     data = next(
         iter(
@@ -655,9 +660,7 @@ def step():
       mid_level_api.enqueue(features, training=False)
       return strategy.run(step)
 
-    with self.assertRaisesRegex(
-        ValueError, 'Found both SparseTensors and RaggedTensors'):
-      test_fn()
+    test_fn()
 
   def test_enqueue_incorrect_structure_for_features(self):
     strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
@@ -839,7 +842,11 @@ def test_enqueue_with_weights(self, ragged):
     else:
       dataset = self._create_sparse_dataset(strategy, include_weights=True,
                                             weight=weight)
-      mid_level_api.build(self.batch_size)
+      mid_level_api.build([
+          TensorShape((self.batch_size, 2)),
+          TensorShape((self.batch_size, 2)),
+          TensorShape((self.batch_size, 3))
+      ])
 
     dataset_iter = iter(
         strategy.experimental_distribute_dataset(
@@ -882,7 +889,11 @@ def test_enqueue_with_outside_compilation(self, use_mlir):
       config.enable_mlir_bridge()
 
     strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
-    mid_level_api.build(self.batch_size)
+    mid_level_api.build([
+        TensorShape((self.batch_size, 2)),
+        TensorShape((self.batch_size, 2)),
+        TensorShape((self.batch_size, 3))
+    ])
     dataset = self._create_sparse_dataset(strategy)
     dataset_iter = iter(
         strategy.experimental_distribute_dataset(
@@ -949,7 +960,11 @@ def get_activations(features):
 
   def test_enqueue_with_outside_compilation_non_direct_input(self):
     strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
-    mid_level_api.build(self.batch_size)
+    mid_level_api.build([
+        TensorShape((self.batch_size, 2)),
+        TensorShape((self.batch_size, 2)),
+        TensorShape((self.batch_size, 3))
+    ])
     dataset = self._create_sparse_dataset(strategy)
     dataset_iter = iter(
         strategy.experimental_distribute_dataset(
@@ -973,7 +988,11 @@ def get_activations(features):
 
   def test_enqueue_with_outside_compilation_auto_mode(self):
     strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
-    mid_level_api.build(self.batch_size)
+    mid_level_api.build([
+        TensorShape((self.batch_size, 2)),
+        TensorShape((self.batch_size, 2)),
+        TensorShape((self.batch_size, 3))
+    ])
     dataset = self._create_sparse_dataset(strategy)
     dataset_iter = iter(
         strategy.experimental_distribute_dataset(
@@ -1121,13 +1140,16 @@ def _create_dense_input_fn(self, strategy, include_weights=False, weight=0.5):
 
     def input_fn(ctx):
       del ctx
-      features = (
-          constant_op.constant(self.feature_watched_values[-2:],
-                               dtype=dtypes.int32),
-          constant_op.constant(self.feature_favorited_values[-2:],
-                               dtype=dtypes.int32),
-          constant_op.constant(self.feature_friends_values[-2:],
-                               dtype=dtypes.int32))
+      features = (constant_op.constant(
+          self.feature_watched_values[-2:], shape=(2, 1), dtype=dtypes.int32),
+                  constant_op.constant(
+                      self.feature_favorited_values[-2:],
+                      shape=(2, 1),
+                      dtype=dtypes.int32),
+                  constant_op.constant(
+                      self.feature_friends_values[-2:],
+                      shape=(2, 1),
+                      dtype=dtypes.int32))
       if include_weights:
         weights = [array_ops.ones_like(t, dtype=dtypes.float32) * weight
                    for t in features]
@@ -1175,7 +1197,9 @@ def lr_function():
                   table=table_config, name='feature')},
           optimizer=optimizer)
 
-    feature = {'feature': constant_op.constant([0], dtype=dtypes.int32)}
+    feature = {
+        'feature': constant_op.constant([0], shape=(1, 1), dtype=dtypes.int32)
+    }
 
     def input_fn(ctx):
       del ctx
@@ -1314,14 +1338,14 @@ def tpu_embedding_config():
         mid_level_api = tpu_embedding_v2.TPUEmbedding(
             feature_config=feature_configs,
             optimizer=optimizer)
-      mid_level_api._batch_size = 128
+      mid_level_api._output_shapes = [TensorShape(128)] * len(feature_configs)
       return mid_level_api._create_config_proto()
 
     self.assertProtoEquals(tpu_embedding_config(), tpu_embedding_config())
 
   def test_multiple_creation(self):
-    feature_config = (tpu_embedding_v2_utils.FeatureConfig(
-        table=self.table_user, name='friends', max_sequence_length=2),)
+    feature_config = tpu_embedding_v2_utils.FeatureConfig(
+        table=self.table_user, name='friends', max_sequence_length=2)
     optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
     strategy = self._get_strategy()
     with strategy.scope():
@@ -1338,6 +1362,411 @@ def test_multiple_creation(self):
                                 'TPU is already initialized for embeddings.'):
       embedding_two.build(64)
 
+  @parameterized.parameters([True, False])
+  def test_sequence_feature(self, is_sparse):
+    seq_length = 3
+    # Set the max_seq_length in feature config
+    for feature in self.feature_config:
+      feature.max_sequence_length = seq_length
+    strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
+    if is_sparse:
+      dataset = self._create_sparse_dataset(strategy)
+    else:
+      dataset = self._create_ragged_dataset(strategy)
+    feature_iter = iter(
+        strategy.experimental_distribute_dataset(
+            dataset,
+            options=distribute_lib.InputOptions(
+                experimental_fetch_to_device=False)))
+
+    @def_function.function
+    def test_fn():
+
+      def step():
+        return mid_level_api.dequeue()
+
+      mid_level_api.enqueue(next(feature_iter), training=False)
+      return strategy.run(step)
+
+    output = test_fn()
+    self.assertEqual(
+        self._get_replica_numpy(output[0], strategy, 0).shape, (2, 3, 4))
+    self.assertEqual(
+        self._get_replica_numpy(output[1], strategy, 0).shape, (2, 3, 4))
+    self.assertEqual(
+        self._get_replica_numpy(output[2], strategy, 0).shape, (2, 3, 2))
+
+
+class TPUEmbeddingHighDimensionalTensorTest(parameterized.TestCase,
+                                            test.TestCase):
+
+  def setUp(self):
+    super(TPUEmbeddingHighDimensionalTensorTest, self).setUp()
+    self.embedding_values = np.array(list(range(32)), dtype=np.float64)
+    self.initializer = init_ops_v2.Constant(self.embedding_values)
+    # Embedding for video initialized to
+    # 0 1 2 3
+    # 4 5 6 7
+    # ...
+    self.table_video = tpu_embedding_v2_utils.TableConfig(
+        vocabulary_size=8,
+        dim=4,
+        initializer=self.initializer,
+        combiner='sum',
+        name='video')
+    # Embedding for user initialized to
+    # 0 1
+    # 2 3
+    # 4 5
+    # 6 7
+    # ...
+    self.table_user = tpu_embedding_v2_utils.TableConfig(
+        vocabulary_size=16,
+        dim=2,
+        initializer=self.initializer,
+        combiner='mean',
+        name='user')
+    self.feature_config = (tpu_embedding_v2_utils.FeatureConfig(
+        table=self.table_video, name='watched'),
+                           tpu_embedding_v2_utils.FeatureConfig(
+                               table=self.table_video, name='favorited'),
+                           tpu_embedding_v2_utils.FeatureConfig(
+                               table=self.table_user, name='friends'))
+
+    self.batch_size = 2
+    self.seq_length = 4
+    self.data_batch_size = 2
+
+    # One (global) batch of inputs
+    # ３D sparse tensor for watched:
+    # sample 0: row 0: 0
+    #           row 1: 0, 1
+    #           row 2: 0, 1
+    #           row 3: 1
+    # sample 1: row 0: 0
+    #           row 1: 0, 1
+    #           row 2: 0, 1
+    #           row 3: 1
+    # Shape: (2, 4, 2)
+    self.feature_watched_indices = [[0, 0, 0], [0, 1, 0], [0, 1, 1], [0, 2, 0],
+                                    [0, 2, 1], [0, 3, 0], [1, 0, 0], [1, 1, 0],
+                                    [1, 1, 1], [1, 2, 0], [1, 2, 1], [1, 3, 0]]
+    self.feature_watched_values = [0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1]
+    self.feature_watched_row_lengths = [1, 2, 2, 1, 1, 2, 2, 1]
+    # 3D sparse tensor for favorited:
+    # sample 0: row 0: 0, 1
+    #           row 1: 1
+    #           row 2: 0
+    #           row 3: 0, 1
+    # sample 1: row 0: 0, 1
+    #           row 1: 1
+    #           row 2: 0
+    #           row 3: 0, 1
+    # Shape: (2, 4, 2)
+    self.feature_favorited_indices = [[0, 0, 0], [0, 0, 1], [0, 1,
+                                                             0], [0, 2, 0],
+                                      [0, 3, 0], [0, 3, 1], [1, 0,
+                                                             0], [1, 0, 1],
+                                      [1, 1, 0], [1, 2, 0], [1, 3, 0],
+                                      [1, 3, 1]]
+    self.feature_favorited_values = [0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1]
+    self.feature_favorited_row_lengths = [2, 1, 1, 2, 2, 1, 1, 2]
+    # 3D sparse tensor for friends:
+    # sample 0: row 0: 3
+    #           row 1: 0, 1, 2
+    #           row 2: 3
+    #           row 3: 0, 1, 2
+    # sample 1: row 0: 3
+    #           row 1: 0, 1, 2
+    #           row 2: 3
+    #           row 3: 0, 1, 2
+    # Shape: (2, 4, 3)
+    self.feature_friends_indices = [[0, 0, 0], [0, 1, 0], [0, 1, 1], [0, 1, 2],
+                                    [0, 2, 0], [0, 3, 0], [0, 3, 1], [0, 3, 2],
+                                    [1, 0, 0], [1, 1, 0], [1, 1, 1], [1, 1, 2],
+                                    [1, 2, 0], [1, 3, 0], [1, 3, 1], [1, 3, 2]]
+    self.feature_friends_values = [
+        3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2
+    ]
+    self.feature_friends_row_lengths = [1, 3, 1, 3, 1, 3, 1, 3]
+    self.resolver = None
+
+  def _get_strategy(self):
+    self.resolver = tpu_cluster_resolver.TPUClusterResolver(
+        tpu=FLAGS.tpu, zone=FLAGS.zone, project=FLAGS.project)
+    remote.connect_to_cluster(self.resolver)
+    tpu_strategy_util.initialize_tpu_system(self.resolver)
+    strategy = tpu_strategy.TPUStrategy(self.resolver)
+    self.num_replicas = strategy.num_replicas_in_sync
+    return strategy
+
+  def _create_strategy_and_mid_level(self, optimizer_name):
+    strategy = self._get_strategy()
+
+    with strategy.scope():
+      if optimizer_name == 'sgd':
+        optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
+      elif optimizer_name == 'adagrad':
+        optimizer = tpu_embedding_v2_utils.Adagrad(learning_rate=0.1)
+      elif optimizer_name == 'adam':
+        optimizer = tpu_embedding_v2_utils.Adam(learning_rate=0.1)
+      elif optimizer_name == 'ftrl':
+        optimizer = tpu_embedding_v2_utils.FTRL(learning_rate=0.1)
+      else:
+        raise ValueError('optimizer is not recognized: ', optimizer_name)
+      mid_level_api = self._create_mid_level(optimizer=optimizer)
+
+    return strategy, mid_level_api, optimizer
+
+  def _create_mid_level(self, optimizer=None):
+    # Create `TPUEmbedding` object.
+    if optimizer is None:
+      optimizer = tpu_embedding_v2_utils.SGD(learning_rate=0.1)
+
+    return tpu_embedding_v2.TPUEmbedding(
+        feature_config=self.feature_config, optimizer=optimizer)
+
+  def _create_sparse_dataset(self, strategy, include_weights=False, weight=0.5):
+    # Create dataset for enqueue operation
+    sparse_features = (sparse_tensor.SparseTensor(
+        indices=self.feature_watched_indices,
+        values=self.feature_watched_values,
+        dense_shape=[self.data_batch_size, self.seq_length, 2]),
+                       sparse_tensor.SparseTensor(
+                           indices=self.feature_favorited_indices,
+                           values=self.feature_favorited_values,
+                           dense_shape=[
+                               self.data_batch_size, self.seq_length, 2
+                           ]),
+                       sparse_tensor.SparseTensor(
+                           indices=self.feature_friends_indices,
+                           values=self.feature_friends_values,
+                           dense_shape=[
+                               self.data_batch_size, self.seq_length, 3
+                           ]))
+    if include_weights:
+      weights = []
+      for sparse in sparse_features:
+        values = (
+            array_ops.ones_like(sparse.values, dtype=dtypes.float32) * weight)
+        weights.append(
+            sparse_tensor.SparseTensor(
+                indices=sparse.indices,
+                values=values,
+                dense_shape=sparse.dense_shape))
+      sparse_features = (sparse_features, tuple(weights))
+
+    dataset = dataset_ops.DatasetV2.from_tensors(sparse_features)
+
+    # Data is batched to self.data_batch_size, rebatch to global batch size.
+    return dataset.unbatch().repeat().batch(
+        self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True)
+
+  def _create_ragged_dataset(self, strategy, include_weights=False, weight=0.5):
+    # Create dataset for enqueue operation
+    ragged_features = (ragged_tensor.RaggedTensor.from_row_lengths(
+        row_lengths=self.feature_watched_row_lengths,
+        values=self.feature_watched_values),
+                       ragged_tensor.RaggedTensor.from_row_lengths(
+                           row_lengths=self.feature_favorited_row_lengths,
+                           values=self.feature_favorited_values),
+                       ragged_tensor.RaggedTensor.from_row_lengths(
+                           row_lengths=self.feature_friends_row_lengths,
+                           values=self.feature_friends_values))
+    if include_weights:
+      weights = []
+      for ragged in ragged_features:
+        weights.append(
+            ragged.with_values(
+                array_ops.ones_like(ragged.values, dtype=dtypes.float32) *
+                weight))
+      ragged_features = (ragged_features, tuple(weights))
+
+    dataset = dataset_ops.DatasetV2.from_tensors(ragged_features)
+
+    # Data is batched to self.data_batch_size, rebatch to global batch size.
+    return dataset.unbatch().repeat().batch(
+        self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True)
+
+  def _create_dense_dataset(self, strategy, include_weights=False, weight=0.5):
+    features = (constant_op.constant(
+        np.zeros(self.data_batch_size * self.seq_length),
+        shape=(self.data_batch_size, self.seq_length, 1),
+        dtype=dtypes.int32),
+                constant_op.constant(
+                    np.ones(self.data_batch_size * self.seq_length),
+                    shape=(self.data_batch_size, self.seq_length, 1),
+                    dtype=dtypes.int32),
+                constant_op.constant(
+                    np.zeros(self.data_batch_size * self.seq_length),
+                    shape=(self.data_batch_size, self.seq_length, 1),
+                    dtype=dtypes.int32))
+    if include_weights:
+      weights = [
+          array_ops.ones_like(t, dtype=dtypes.float32) * weight
+          for t in features
+      ]
+      features = (features, tuple(weights))
+    dataset = dataset_ops.DatasetV2.from_tensors(features)
+    return dataset.unbatch().repeat().batch(
+        self.batch_size * strategy.num_replicas_in_sync, drop_remainder=True)
+
+  def test_enqueue_dense_sparse_ragged(self):
+    strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
+
+    dense = self._create_dense_dataset(strategy)
+    dense_iter = iter(
+        strategy.experimental_distribute_dataset(
+            dense,
+            options=distribute_lib.InputOptions(
+                experimental_fetch_to_device=False)))
+
+    sparse = self._create_sparse_dataset(strategy)
+    sparse_iter = iter(
+        strategy.experimental_distribute_dataset(
+            sparse,
+            options=distribute_lib.InputOptions(
+                experimental_fetch_to_device=False)))
+
+    ragged = self._create_ragged_dataset(strategy)
+    ragged_iter = iter(
+        strategy.experimental_distribute_dataset(
+            ragged,
+            options=distribute_lib.InputOptions(
+                experimental_fetch_to_device=False)))
+
+    mid_level_api.build({
+        'watched': TensorShape([self.batch_size, self.seq_length, 1]),
+        'favorite': TensorShape([self.batch_size, self.seq_length, 2]),
+        'friends': TensorShape([self.batch_size, self.seq_length, None])
+    })
+
+    @def_function.function
+    def test_fn():
+
+      def step():
+        return mid_level_api.dequeue()
+
+      features = (next(dense_iter)[0], next(sparse_iter)[1],
+                  next(ragged_iter)[2])
+      mid_level_api.enqueue(features, training=False)
+      return strategy.run(step)
+
+    test_fn()
+
+  def test_different_input_shapes(self):
+    strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
+
+    sparse = self._create_sparse_dataset(strategy)
+    sparse_iter = iter(
+        strategy.experimental_distribute_dataset(
+            sparse,
+            options=distribute_lib.InputOptions(
+                experimental_fetch_to_device=False)))
+    # Create a feature with shape (1, 3, 1)
+    dense_feature = constant_op.constant(
+        np.zeros(3), shape=(1, 3, 1), dtype=dtypes.int32)
+    dense_dataset = dataset_ops.DatasetV2.from_tensors(
+        dense_feature).unbatch().repeat().batch(
+            1 * strategy.num_replicas_in_sync, drop_remainder=True)
+    dense_iter = iter(
+        strategy.experimental_distribute_dataset(
+            dense_dataset,
+            options=distribute_lib.InputOptions(
+                experimental_fetch_to_device=False)))
+
+    @def_function.function
+    def test_fn():
+
+      def step():
+        return mid_level_api.dequeue()
+
+      features = (next(dense_iter), next(sparse_iter)[1], next(sparse_iter)[2])
+      mid_level_api.enqueue(features, training=False)
+      return strategy.run(step)
+
+    test_fn()
+
+    self.assertEqual(
+        mid_level_api._output_shapes,
+        [TensorShape((1, 3)),
+         TensorShape((2, 4)),
+         TensorShape((2, 4))])
+    # The GCD of batch size 1 and 2 should be 1.
+    self.assertEqual(mid_level_api._config_proto.batch_size_per_tensor_core, 1)
+
+  def test_build_incorrect_output_shapes(self):
+    _, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
+    # Output shapes is set in the mid_level_api, but build with incorrect output
+    # shapes.
+    mid_level_api._output_shapes = [TensorShape((2, 4)) for _ in range(3)]
+
+    with self.assertRaisesRegex(ValueError,
+                                'Inconsistent shape founded for input feature'):
+      mid_level_api.build([TensorShape([1, 1, 1]) for _ in range(3)])
+
+  def test_enqueue_incorrect_shape_feature(self):
+    strategy, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
+
+    sparse = self._create_sparse_dataset(strategy)
+    sparse_iter = iter(
+        strategy.experimental_distribute_dataset(
+            sparse,
+            options=distribute_lib.InputOptions(
+                experimental_fetch_to_device=False)))
+
+    mid_level_api._output_shapes = [TensorShape((1, 1)) for _ in range(3)]
+    # The output shape passed to build method is consistent.
+    mid_level_api.build([TensorShape([1, 1, 1]) for _ in range(3)])
+
+    @def_function.function
+    def test_fn():
+
+      def step():
+        return mid_level_api.dequeue()
+
+      mid_level_api.enqueue(next(sparse_iter), training=False)
+      return strategy.run(step)
+
+    # Enqueued tensor has shape inconsistent with the output shape setting.
+    with self.assertRaisesRegex(ValueError,
+                                'Inconsistent shape founded for input feature'):
+      test_fn()
+
+  def test_not_fully_defined_output_shapes_in_feature_config(self):
+    _, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
+
+    # Feature config sets undefined output shapes
+    mid_level_api._output_shapes = [TensorShape(None) for _ in range(3)]
+    with self.assertRaisesRegex(ValueError, 'Input Feature'):
+      mid_level_api.build()
+
+  def test_not_fully_defined_output_shapes_for_build(self):
+    _, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
+
+    # Build with undefined output shape
+    with self.assertRaisesRegex(ValueError, 'Input Feature'):
+      mid_level_api.build([TensorShape([1, None, None]) for _ in range(3)])
+
+  def test_output_shapes_priority_over_feature_config_and_build(self):
+    _, mid_level_api, _ = self._create_strategy_and_mid_level('sgd')
+
+    # The output shapes setting in the feature config has the first priority.
+    mid_level_api._output_shapes = [TensorShape((2, 4)) for _ in range(3)]
+    mid_level_api.build([TensorShape((2, None, None)) for _ in range(3)])
+    self.assertEqual(mid_level_api._output_shapes,
+                     [TensorShape((2, 4)) for _ in range(3)])
+
+  def _get_replica_numpy(self, structured, strategy, replica_id):
+
+    def select_replica(x):
+      x = strategy.experimental_local_results(x)
+      if len(x) == 1:
+        return x.numpy()
+      return x[replica_id].numpy()
+
+    return nest.map_structure(select_replica, structured)
+
 
 def _unpack(strategy, per_replica_output):
   per_replica_output = strategy.experimental_local_results(per_replica_output)
diff --git a/tensorflow/python/tpu/tpu_embedding_v2_utils.py b/tensorflow/python/tpu/tpu_embedding_v2_utils.py
index 5b609a0c218bda..f763ce123dfc55 100644
--- a/tensorflow/python/tpu/tpu_embedding_v2_utils.py
+++ b/tensorflow/python/tpu/tpu_embedding_v2_utils.py
@@ -26,6 +26,7 @@
 from tensorflow.core.protobuf.tpu import tpu_embedding_configuration_pb2
 from tensorflow.python.distribute import sharded_variable
 from tensorflow.python.framework import ops
+from tensorflow.python.framework.tensor_shape import TensorShape
 from tensorflow.python.ops import init_ops_v2
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.tpu.ops import tpu_ops
@@ -847,6 +848,15 @@ class FeatureConfig(object):
   features will be looked up in the first table and the third feature will be
   looked up in the second table.
 
+  You can also specify the output shape for each feature. The output shape
+  should be the expected activation shape excluding the table dimension. For
+  dense and sparse tensor, the output shape should be the same as the input
+  shape excluding the last dimension. For ragged tensor, the output shape can
+  mismatch the input shape.
+
+  NOTE: The `max_sequence_length` will be only used when the input tensor has
+  rank 2 and the `output_shape` is not set in the feature config.
+
   When feeding features into `embedding.enqueue` they can be `tf.Tensor`s,
   `tf.SparseTensor`s or `tf.RaggedTensor`s. When the argument
   `max_sequence_length` is 0, the default, you should expect a output of
@@ -860,6 +870,7 @@ def __init__(self,
                table: TableConfig,
                max_sequence_length: int = 0,
                validate_weights_and_indices: bool = True,
+               output_shape: Optional[Union[List[int], TensorShape]] = None,
                name: Optional[Text] = None):
     """Feature configuration.
 
@@ -870,9 +881,14 @@ def __init__(self,
         the corresponding maximum sequence length. If the sequence is longer
         than this, it will be truncated. If 0, the feature is not a sequence
         feature.
-      validate_weights_and_indices: If true, uses safe_embedding_lookup
-        during serving which ensures there are no empty rows and all weights
-        and ids are positive at the expense of extra compute cost.
+      validate_weights_and_indices: If true, uses safe_embedding_lookup during
+        serving which ensures there are no empty rows and all weights and ids
+        are positive at the expense of extra compute cost.
+      output_shape: Optional argument to config the output shape of the feature
+        activation. If provided, the feature feeding to the `embedding.enqueue`
+        has to match the shape (for ragged tensor, the input shape and output
+        shape can mismatch). If not provided, the shape can be either provided
+        to the `embedding.build` or auto detected at the runtime.
       name: An optional name for the feature, useful for debugging.
 
     Returns:
@@ -895,6 +911,7 @@ def __init__(self,
     self.table = table
     self.max_sequence_length = max_sequence_length
     self.name = name
+    self.output_shape = TensorShape(output_shape)
 
     if not isinstance(
         validate_weights_and_indices, bool):
diff --git a/tensorflow/python/training/experimental/BUILD b/tensorflow/python/training/experimental/BUILD
index b1ac31f8317b8a..b4a6a4cb189dac 100644
--- a/tensorflow/python/training/experimental/BUILD
+++ b/tensorflow/python/training/experimental/BUILD
@@ -113,37 +113,3 @@ cuda_py_test(
         "@absl_py//absl/testing:parameterized",
     ],
 )
-
-py_library(
-    name = "loss_scaling_gradient_tape",
-    srcs = ["loss_scaling_gradient_tape.py"],
-    srcs_version = "PY3",
-    deps = [
-        ":loss_scale",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:unconnected_gradients",
-        "//tensorflow/python:util",
-        "//tensorflow/python/distribute:distribute_lib",
-        "//tensorflow/python/eager:backprop",
-    ],
-)
-
-cuda_py_test(
-    name = "loss_scaling_gradient_tape_test",
-    size = "medium",
-    srcs = ["loss_scaling_gradient_tape_test.py"],
-    shard_count = 2,
-    tags = ["no_oss"],
-    deps = [
-        ":loss_scale",
-        ":loss_scaling_gradient_tape",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:framework_test_combinations_lib",
-        "//tensorflow/python/compat:v2_compat",
-        "//tensorflow/python/distribute:mirrored_strategy",
-        "//tensorflow/python/eager:def_function",
-        "//third_party/py/numpy",
-        "@absl_py//absl/testing:parameterized",
-    ],
-)
diff --git a/tensorflow/python/training/experimental/loss_scaling_gradient_tape.py b/tensorflow/python/training/experimental/loss_scaling_gradient_tape.py
deleted file mode 100644
index 16a98407098921..00000000000000
--- a/tensorflow/python/training/experimental/loss_scaling_gradient_tape.py
+++ /dev/null
@@ -1,316 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Contains Loss Scale Gradient Tape."""
-
-from tensorflow.python.distribute import distribution_strategy_context
-from tensorflow.python.eager import backprop
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
-from tensorflow.python.training.experimental import loss_scale as loss_scale_module
-from tensorflow.python.util import nest
-
-
-def _convert_to_per_replicas(distribution, values):
-  """Converts tensors and DistributedVariables to PerReplica values.
-
-  Args:
-    distribution: The distribution strategy in effect.
-    values: A list of tensors, variables, DistributedValues, or anything else
-      that can be converted to a PerReplcia value
-
-  Returns:
-    `values`, but each element has been converted to a PerReplica value.
-  """
-  return distribution.run(
-      lambda values: [array_ops.identity(v) for v in values],
-      args=(values,)
-  )
-
-
-# TODO(reedwm): Expose this after testing it on several models.
-class LossScaleGradientTape(backprop.GradientTape):
-  """A gradient tape that scales losses and unscales resulting gradients.
-
-  Operates as a normal gradient tape, but takes in a
-  `tf.mixed_precision.experimental.LossScale` object. Losses are scaled up by
-  some amount before the gradients are calculated and the resulting gradients
-  are scaled down by the same amount.
-
-  This has no net mathematical effect, but can be used to prevent vanishing
-  gradients, for example in the case of mixed precision training.
-
-  If a DynamicLossScale object is used and non-finite gradients are encountered,
-  the loss scale will be updated and the gradients recomputed until either
-  finite gradients are encountered or the loss scale becomes 1.
-
-  This class should *not* be used with a LossScaleOptimizer, as both classes
-  update the LossScale object. Use a non-loss scaling optimizer instead.
-
-  Usage:
-  ```
-  opt = tf.keras.optimizers.SGD(1.0)
-  model_loss_scale = tf.mixed_precision.experimental.DynamicLossScale()
-
-  for step in training_steps:
-    with LossScaleGradientTape(model_loss_scale) as tape:
-      logits = ...  # Run model and get logits
-      loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
-                                                     labels=labels)
-      loss = tf.reduce_mean(loss)
-    vars = tape.watched_variables()
-    grads = tape.gradient(loss, vars)
-    opt.apply_gradients(zip(grads, vars))
-  ```
-
-  WARNING: Computing second-order (or higher) gradients with a
-  `LossScaleGradientTape` does not yet work properly when a
-  `tf.distribute.Strategy` is used. Computing second-order gradients will return
-  None instead of the gradient tensors. This only occurs when you nest multiple
-  gradient tapes under each other; if you do not nest them, this issue will not
-  occur.
-  """
-
-  def __init__(self,
-               loss_scale,
-               persistent=False,
-               watch_accessed_variables=True):
-    """Creates a new LossScaleGradientTape.
-
-    Args:
-      loss_scale: `tf.mixed_precision.experimental.LossScale` object that
-        manages what quantity to scale by. This is typically either a
-        FixedLossScale object with a constant scalar or a
-        `tf.mixed_precision.experimental.DynamicLossScale` object that will
-        adjust the scalar appropriately if any non-finite gradients are
-        encountered.
-      persistent: Boolean controlling whether a persistent gradient tape is
-        created. False by default, which means at most one call can be made to
-        the gradient() method on this object.
-      watch_accessed_variables: Boolean controlling whether the tape will
-        automatically `watch` any (trainable) variables accessed while the tape
-        is active. Defaults to True meaning gradients can be requested from any
-        result computed in the tape derived from reading a trainable `Variable`.
-        If False users must explicitly `watch` any `Variable`s they want to
-        request gradients from.
-    """
-    if not isinstance(loss_scale, loss_scale_module.LossScale):
-      raise ValueError("`loss_scale` must be an instance of LossScale, "
-                       "but got: %s" % (loss_scale,))
-    if not ops.executing_eagerly_outside_functions():
-      raise ValueError("LossScaleGradientTape is only supported in Eager mode.")
-
-    # always make a persistent tape to loop over loss scaling
-    super(LossScaleGradientTape, self).__init__(True,
-                                                watch_accessed_variables)
-    self._outer_persistent = persistent
-    self._loss_scale = loss_scale
-
-  def gradient(self,
-               target,
-               sources,
-               output_gradients=None,
-               unconnected_gradients=UnconnectedGradients.NONE):
-    """Computes the gradient using operations recorded in context of this tape.
-
-    Uses the `LossScale` object provided in the constructor to scale `target`
-    and then to unscale the resulting gradients.
-
-    Args:
-      target: a list or nested structure of Tensors or Variables to be
-        differentiated.
-      sources: a list or nested structure of Tensors or Variables. `target` will
-        be differentiated against elements in `sources`.
-      output_gradients: a list of gradients, one for each element of target.
-        Defaults to None.
-      unconnected_gradients: a value which can either hold 'none' or 'zero' and
-        alters the value which will be returned if the target and sources are
-        unconnected. The possible values and effects are detailed in
-        'UnconnectedGradients' and it defaults to 'none'.
-
-    Returns:
-      a list or nested structure of Tensors (or IndexedSlices, or None),
-      one for each element in `sources`. Returned structure is the same as
-      the structure of `sources`. If non-finite gradients are encountered
-      after dynamic scaling, the loss scale will be updated and the gradients
-      recomputed until either finite gradients are encountered or the loss scale
-      becomes 1.
-
-    Raises:
-      RuntimeError: if called inside the context of the tape, or if called more
-       than once on a non-persistent tape.
-      ValueError: if the target is a variable or if unconnected gradients is
-       called with an unknown value.
-    """
-    if self._tape is None:  # pylint: disable=access-member-before-definition
-      raise RuntimeError("GradientTape.gradient can only be called once on "
-                         "non-persistent tapes.")
-    if distribution_strategy_context.in_cross_replica_context():
-      raise ValueError("LossScaleGradientTape.gradient() must be called in a "
-                       "replica context.")
-
-    # Note: DistributionStrategy does not support running a while loop in a
-    # replica context. So, we call `_compute_gradients_until_finite` in a cross-
-    # replica context.
-    replica_context = distribution_strategy_context.get_replica_context()
-    grads = replica_context.merge_call(
-        _compute_gradients_until_finite,
-        args=(self, self._loss_scale, target, sources, output_gradients,
-              unconnected_gradients))
-
-    if not self._outer_persistent:
-      self._tape = None  # free up resources if a persistent tape was not needed
-    return grads
-
-  def jacobian(self,
-               target,
-               sources,
-               unconnected_gradients=UnconnectedGradients.NONE,
-               parallel_iterations=None,
-               experimental_use_pfor=True):
-    # TODO(reedwm): Implement this
-    raise NotImplementedError("LossScaleGradientTape.jacobian is not "
-                              "yet implemented")
-
-  def batch_jacobian(self,
-                     target,
-                     source,
-                     unconnected_gradients=UnconnectedGradients.NONE,
-                     parallel_iterations=None,
-                     experimental_use_pfor=True):
-    # TODO(reedwm): Implement this
-    raise NotImplementedError("LossScaleGradientTape.batch_jacobian is not "
-                              "yet implemented")
-
-
-def _compute_gradients_until_finite(
-    distribution, loss_scale_gradient_tapes, loss_scale, target, sources,
-    output_gradients, unconnected_gradients):
-  """Compute gradients and update the loss scale until the gradients are finite.
-
-  This must be called in a cross-replica context.
-
-  This is a function instead of a method of LossScaleGradientTape, as the `self`
-  parameter would be meaningless. There is one LossScaleGradientTape per
-  replica, but this function is called once total (not per replica), so there
-  cannot be a singular `self` parameter.
-
-  Args:
-    distribution: The distribution strategy in effect.
-    loss_scale_gradient_tapes: A PerReplica value of LossScaleGradientTapes.
-      Contains the LossScaleGradientTape of each replica.
-    loss_scale: The loss scale to use to scale the loss and unscale the
-      gradient.
-    target: a list or nested structure of Tensors or Variables to be
-      differentiated.
-    sources: a list or nested structure of Tensors or Variables. `target` will
-      be differentiated against elements in `sources`.
-    output_gradients: Passed to GradientTape.gradient
-    unconnected_gradients: Pass to GradientTape.gradient.
-
-  Returns:
-    The gradients of `target` with respect to `sources`.
-  """
-  # Autograph cannot convert this function, so we must use an explicit
-  # tf.while_loop.
-  # TODO(b/143572314): Fix Autograph so that it can convert this function, then
-  # replace the tf.while_loop with a Python while loop.
-
-  # For convenience, we only deal with flattened sources
-  flattened_sources = nest.flatten(sources)
-
-  # Define the initial loop variables of the while loop.
-
-  # Dummy value for initial_grads. The first iteration of the loop will
-  # overwrite `grads` to the actual gradients.
-  initial_grads = flattened_sources
-  if distribution_strategy_context.has_strategy():
-    # A while_loop requires the initial values to have the same types as the
-    # return values from the body. However, 'initial_grads' may have type
-    # 'DistributionVariable', while body returns a 'PerReplica'. While both
-    # types subclass 'DistributedValues', while_loop will still throw an error.
-    # So we convert 'initial_grads' to be PerReplica values.
-    # TODO(b/146084534): Once the bug is fixed, remove this special case.
-    initial_grads = _convert_to_per_replicas(distribution, initial_grads)
-  initial_ready_to_update = False
-  initial_is_first_iteration = True
-
-  def cond(grads, ready_to_update, is_first_iteration):
-    """The condition of the while loop."""
-    del grads
-    # Equivalent to:
-    # `is_first_iteration or (not ready_to_update and loss_scale() > 1)`
-    return math_ops.logical_or(
-        is_first_iteration,
-        math_ops.logical_and(
-            math_ops.logical_not(ready_to_update),
-            math_ops.greater(loss_scale(), 1)))
-
-  # Boolean list specifying whether each gradient is None or not. Set by body().
-  is_nones = []
-
-  def body(grads, ready_to_update, is_first_iteration):
-    """The body of the while loop."""
-    del grads, ready_to_update, is_first_iteration
-    def replica_fn(gradient_tape, target, flattened_sources, output_gradients,
-                   initial_grads):
-      """Scales the loss, computes the gradients, and unscales the gradients."""
-      loss_scale_val = loss_scale()
-      with gradient_tape:  # re-enter gradient tape so it sees the loss scaling
-        scaled_target = nest.map_structure(
-            lambda t: t * math_ops.cast(loss_scale_val, t.dtype), target)
-      scaled_grads = super(LossScaleGradientTape, gradient_tape).gradient(
-          scaled_target, flattened_sources, output_gradients,
-          unconnected_gradients)
-
-      is_nones[:] = [g is None for g in scaled_grads]
-      inv_loss_scale = 1.0 / loss_scale_val
-      grads = []  # The unscaled gradients
-      for g, initial_grad in zip(scaled_grads, initial_grads):
-        if g is not None:
-          # We call ensure_shape as shape information can be lost for certain
-          # ops, such as tf.transpose, if the op is called in a tf.function and
-          # has inputs created outside the tf.function.
-          # TODO(b/132092188): Remove ensure_shape call after this has been
-          # fixed.
-          g = array_ops.ensure_shape(g, initial_grad.shape)
-          grads.append(g * math_ops.cast(inv_loss_scale, g.dtype))
-        else:
-          # We cannot return None from a tf.while_loop, so we pass a dummy
-          # tensor instead. We use initial_grad as a dummy tensor as it has the
-          # correct shape and dtype. We replace it with None outside the while
-          # loop.
-          grads.append(initial_grad)
-      return grads
-
-    # Switch to a replica-context to compute gradients once per replica.
-    grads = distribution.run(
-        replica_fn,
-        args=(loss_scale_gradient_tapes, target, flattened_sources,
-              output_gradients, initial_grads))
-    # Check for non-finite gradients possibly resulting from scaling
-    _, ready_to_update = loss_scale.update(grads)
-    is_first_iteration = False
-    return grads, ready_to_update, is_first_iteration
-
-  grads, _, _ = control_flow_ops.while_loop(
-      cond, body, [initial_grads, initial_ready_to_update,
-                   initial_is_first_iteration],
-      )
-  grads = [None if is_none else g for g, is_none in zip(grads, is_nones)]
-  grads = nest.pack_sequence_as(sources, grads)
-  return grads
diff --git a/tensorflow/python/training/experimental/loss_scaling_gradient_tape_test.py b/tensorflow/python/training/experimental/loss_scaling_gradient_tape_test.py
deleted file mode 100644
index 65e6141db5f5ad..00000000000000
--- a/tensorflow/python/training/experimental/loss_scaling_gradient_tape_test.py
+++ /dev/null
@@ -1,537 +0,0 @@
-# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for lsgt.LossScaleGradientTape."""
-from absl.testing import parameterized
-import numpy as np
-from tensorflow.python.compat import v2_compat
-from tensorflow.python.distribute import distribution_strategy_context
-from tensorflow.python.distribute import mirrored_strategy
-from tensorflow.python.distribute import values
-from tensorflow.python.eager import context
-from tensorflow.python.eager import def_function
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import test_combinations
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-from tensorflow.python.training.experimental import loss_scale as loss_scale_module
-from tensorflow.python.training.experimental import loss_scaling_gradient_tape as lsgt
-from tensorflow.python.util import nest
-
-
-# If called outside any strategy.scope() calls, this will return the default
-# strategy.
-default_strategy_fn = distribution_strategy_context.get_strategy
-
-
-def create_mirrored_strategy():
-  if context.num_gpus() >= 1:
-    return mirrored_strategy.MirroredStrategy(['cpu:0', 'gpu:0'])
-  else:
-    return mirrored_strategy.MirroredStrategy(['cpu:0'])
-
-
-class LossScaleGradientTapeTest(test.TestCase, parameterized.TestCase):
-
-  def _run_with_strategy(self, run_fn, strategy, use_tf_function=False):
-    """Runs `run_fn` under the DistributionStrategy `strategy`.
-
-    Runs `run_fn` with `strategy.run`. Returns a list of the
-    return values of `run_fn`, one per replica.
-
-    Args:
-      run_fn: The function to run.
-      strategy: The DistributionStrategy to run `run_fn` with.
-      use_tf_function: If True, call `run_fn` under a tf.function.
-
-    Returns:
-      A list of tensors, each being the return value of `run_fn` from one
-      replica. If a nested structure is returned from `run_fn`, returns a
-      nested structure, where each element is a list of tensors.
-    """
-    strategy_fn = lambda: strategy.run(run_fn)
-    if use_tf_function:
-      strategy_fn = def_function.function(strategy_fn)
-
-    results = strategy_fn()
-
-    def convert_tensor_to_list(tensor):
-      if isinstance(tensor, values.DistributedValues):
-        return strategy.experimental_local_results(tensor)
-      else:
-        return [tensor]
-    return nest.map_structure(convert_tensor_to_list, results)
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      use_tf_function=[True, False]
-  ))
-  def test_basic_tapes(self, loss_scale, strategy_fn, use_tf_function):
-    loss_scale = loss_scale(32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x = variables.Variable(3.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        y = x * x
-      return g.gradient(y, x)
-    dy_dx_list = self._run_with_strategy(run_fn, strategy, use_tf_function)
-    self.assertEqual(loss_scale(), 32)
-    for dy_dx in dy_dx_list:
-      self.assertEqual(dy_dx, 6.0)
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      use_tf_function=[True, False]
-  ))
-  def test_output_gradients(self, loss_scale, strategy_fn, use_tf_function):
-    loss_scale = loss_scale(32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x = variables.Variable(3.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        y = x * x
-      return g.gradient(y, x, output_gradients=constant_op.constant(2.0))
-    dy_dx_list = self._run_with_strategy(run_fn, strategy, use_tf_function)
-    self.assertEqual(loss_scale(), 32)
-    for dy_dx in dy_dx_list:
-      self.assertEqual(dy_dx, 12.0)
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      use_tf_function=[True, False]
-  ))
-  def test_multiple_source_types(self, loss_scale, strategy_fn,
-                                 use_tf_function):
-    loss_scale = loss_scale(32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x1 = variables.Variable(1.0)  # Distributed variable
-      x2 = variables.Variable([1.0, 2.0])  # Distributed non-scalar variable
-    x3 = variables.Variable(2.0)  # Non-distributed variable
-    x4 = constant_op.constant(2.0)  # Tensor
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        g.watch(x4)
-        y = x1 * x2 * x3 * x4
-      return g.gradient(y, [x1, x2, x3, x4])
-    x1g, x2g, x3g, x4g = self._run_with_strategy(run_fn, strategy,
-                                                 use_tf_function)
-    self.assertEqual(loss_scale(), 32)
-    for dy_dx1 in x1g:
-      self.assertEqual(dy_dx1, 12.0)
-    for dy_dx2 in x2g:
-      self.assertAllEqual(dy_dx2, [4.0, 4.0])
-    for dy_dx3 in x3g:
-      self.assertEqual(dy_dx3, 6.0)
-    for dy_dx4 in x4g:
-      self.assertEqual(dy_dx4, 6.0)
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      use_tf_function=[True, False]
-  ))
-  def test_loss_scale_of_one(self, loss_scale, strategy_fn,
-                             use_tf_function):
-    loss_scale = loss_scale(1)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x = variables.Variable(3.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        y = x * x
-      return g.gradient(y, x)
-    dy_dx_list = self._run_with_strategy(run_fn, strategy, use_tf_function)
-    self.assertEqual(loss_scale(), 1)
-    for dy_dx in dy_dx_list:
-      self.assertEqual(dy_dx, 6.0)
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-      strategy_fn=[default_strategy_fn],
-      use_tf_function=[True, False],
-      share_loss_scale=[True, False]
-  ))
-  def test_nested_tapes(self, loss_scale, strategy_fn, use_tf_function,
-                        share_loss_scale):
-    # TODO(reedwm): Support nested tapes with mirrored strategy. Currently this
-    # does not work, as the set of active gradient tapes is a thread-local
-    # variable. Mirrored strategy spawns new threads, making the outer gradient
-    # tape non-active when using the inner gradient tape.
-    outer_loss_scale = loss_scale(32)
-    inner_loss_scale = outer_loss_scale if share_loss_scale else loss_scale(32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x = variables.Variable(3.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(outer_loss_scale) as g:
-        with lsgt.LossScaleGradientTape(inner_loss_scale) as gg:
-          y = x * x
-        dy_dx = gg.gradient(y, x)
-      d2y_dx2 = g.gradient(dy_dx, x)
-      return dy_dx, d2y_dx2
-
-    dy_dx_list, d2y_dx2_list = self._run_with_strategy(run_fn, strategy_fn(),
-                                                       use_tf_function)
-    self.assertEqual(outer_loss_scale(), 32)
-    self.assertEqual(inner_loss_scale(), 32)
-    for dy_dx in dy_dx_list:
-      self.assertEqual(dy_dx, 6.0)
-    for d2y_dx2 in d2y_dx2_list:
-      self.assertEqual(d2y_dx2, 2.0)
-
-  def test_non_persistent_tapes_error(self):
-    x = variables.Variable(3.0)
-    with lsgt.LossScaleGradientTape(loss_scale_module.FixedLossScale(32),
-                                    persistent=False) as g:
-      y = x * x
-      z = y * y
-    g.gradient(z, x)
-    with self.assertRaisesRegex(RuntimeError, 'persistent'):
-      g.gradient(y, x)
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      use_tf_function=[True, False]
-  ))
-  def test_persistent_tapes(self, loss_scale, strategy_fn, use_tf_function):
-    ls = loss_scale(32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x = variables.Variable(3.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(ls, persistent=True) as g:
-        y = x * x
-        z = y * y
-      dz_dx = g.gradient(z, x)
-      dy_dx = g.gradient(y, x)
-      return dz_dx, dy_dx
-
-    dz_dx_list, dy_dx_list = self._run_with_strategy(run_fn, strategy,
-                                                     use_tf_function)
-    for dz_dx in dz_dx_list:
-      self.assertEqual(dz_dx, 108.0)
-    for dy_dx in dy_dx_list:
-      self.assertEqual(dy_dx, 6.0)
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-  ))
-  def test_nested_sources(self, loss_scale):
-    x = (variables.Variable(19.0), (variables.Variable(8.),
-                                    variables.Variable(9.)))
-    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
-      y = x * 13
-    dy_dx = g.gradient(y, x)
-    self.assertEqual(self.evaluate(dy_dx), (13., (13., 13.)))
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-  ))
-  def test_nested_targets(self, loss_scale):
-    w = variables.Variable(3.0)
-    with lsgt.LossScaleGradientTape(loss_scale(32)) as g:
-      x = w * 5
-      y = w * 7
-      z = w * 11
-    grad = g.gradient([x, (y, z)], w)
-    self.assertEqual(self.evaluate(grad), 23)
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy]
-  ))
-  def test_different_dtypes(self, loss_scale, strategy_fn):
-    loss_scale = loss_scale(32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x1 = variables.Variable(1.0, dtype='float16')
-      x2 = variables.Variable(2.0, dtype='float32')
-      x3 = variables.Variable(3.0, dtype='float64')
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        y1 = x1 * math_ops.cast(x2, 'float16') * math_ops.cast(x3, 'float16')
-        y2 = math_ops.cast(x1, 'float32') * x2 * math_ops.cast(x3, 'float32')
-        y3 = math_ops.cast(x1, 'float64') * math_ops.cast(x2, 'float64') * x3
-      return g.gradient([y1, y2, y3], [x1, x2, x3])
-    dy_dx1_list, dy_dx2_list, dy_dx3_list = self._run_with_strategy(
-        run_fn, strategy)
-    self.assertEqual(loss_scale(), 32)
-    for dy_dx1 in dy_dx1_list:
-      self.assertEqual(dy_dx1, 18.0)
-      self.assertEqual(dy_dx1.dtype, 'float16')
-    for dy_dx2 in dy_dx2_list:
-      self.assertEqual(dy_dx2, 9.0)
-      self.assertEqual(dy_dx2.dtype, 'float32')
-    for dy_dx3 in dy_dx3_list:
-      self.assertEqual(dy_dx3, 6.0)
-      self.assertEqual(dy_dx3.dtype, 'float64')
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      use_tf_function=[True, False]
-  ))
-  def test_none_gradients(self, loss_scale, strategy_fn, use_tf_function):
-    loss_scale = loss_scale(32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x1 = variables.Variable(2.0)
-      x2 = variables.Variable(2.0)
-      x3 = variables.Variable(2.0)
-      x4 = variables.Variable([2.0, 2.0])
-      x5 = constant_op.constant(2.0)
-      x6 = constant_op.constant(2.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        # x6 will have a None gradient because we do not watch it
-        g.watch(x5)
-        y = x1 * x3 * x5 * x6
-      return g.gradient(y, [x1, x2, [x3, [x4], x5], x6])
-    [x1g, x2g, [x3g, [x4g], x5g], x6g] = self._run_with_strategy(
-        run_fn, strategy, use_tf_function)
-    self.assertEqual(loss_scale(), 32)
-    for dy_dx1 in x1g:
-      self.assertEqual(dy_dx1, 8.0)
-    self.assertEqual(x2g, [None])
-    for dy_dx3 in x3g:
-      self.assertEqual(dy_dx3, 8.0)
-    self.assertEqual(x4g, [None])
-    for dy_dx5 in x5g:
-      self.assertEqual(dy_dx5, 8.0)
-    self.assertEqual(x6g, [None])
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      use_tf_function=[True, False]
-  ))
-  def test_zero_gradients(self, loss_scale, strategy_fn, use_tf_function):
-    loss_scale = loss_scale(32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x = variables.Variable(0.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        y = x * x
-      return g.gradient(y, x)
-    dy_dx_list = self._run_with_strategy(run_fn, strategy, use_tf_function)
-    self.assertEqual(loss_scale(), 32)
-    for dy_dx in dy_dx_list:
-      # Assert zero gradients are not turned into Nones
-      self.assertEqual(dy_dx, 0.0)
-
-  @test_combinations.generate(test_combinations.combine(
-      loss_scale=[loss_scale_module.FixedLossScale,
-                  loss_scale_module.DynamicLossScale],
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      non_finite_term=[np.inf, np.nan],
-  ))
-  def test_scaling_non_finite_gradient(self, loss_scale, strategy_fn,
-                                       non_finite_term):
-    loss_scale = loss_scale(32)
-    x = variables.Variable(1.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        y = x * non_finite_term
-      return g.gradient(y, x)
-
-    dy_dx_list = self._run_with_strategy(run_fn, strategy_fn())
-    check_fn = np.isposinf if non_finite_term == np.inf else np.isnan
-    for dy_dx in dy_dx_list:
-      self.assertTrue(check_fn(dy_dx))
-
-  @test_combinations.generate(test_combinations.combine(
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      non_finite_term=[np.inf, np.nan],
-      use_tf_function=[True, False],
-  ))
-  def test_dynamic_scale_to_one_on_non_finite_gradient(
-      self, strategy_fn, non_finite_term, use_tf_function):
-    loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x = variables.Variable(3.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        y = x * non_finite_term
-      g.gradient(y, x)
-
-    self._run_with_strategy(run_fn, strategy, use_tf_function)
-    self.assertEqual(self.evaluate(loss_scale()), 1.0)
-
-  @test_combinations.generate(test_combinations.combine(
-      use_tf_function=[True, False],
-  ))
-  def test_dynamic_scale_to_one_on_non_finite_gradient_on_last_replica(
-      self, use_tf_function):
-    if context.num_gpus() < 1:
-      # Requires the mirrored strategy to have two replicas: one on the CPU and
-      # one on the GPU
-      self.skipTest('Test requires at least 1 GPU')
-    loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
-    strategy = create_mirrored_strategy()
-    with strategy.scope():
-      x = variables.Variable(3.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        # The gradient will be finite on the first replica, and infinite on the
-        # second
-        rep_ctx = distribution_strategy_context.get_replica_context()
-        if rep_ctx.replica_id_in_sync_group == rep_ctx.num_replicas_in_sync - 1:
-          y = x * np.inf
-        else:
-          y = x * 2
-      return g.gradient(y, x)
-
-    replica0_grad, replica1_grad = self._run_with_strategy(
-        run_fn, strategy, use_tf_function)
-    self.assertEqual(self.evaluate(loss_scale()), 1.0)
-    self.assertEqual(replica0_grad, 2.0)
-    self.assertEqual(replica1_grad, np.inf)
-
-  @test_combinations.generate(test_combinations.combine(
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      non_finite_term=[np.inf, np.nan],
-  ))
-  def test_fixed_scaling_no_change_non_finite_gradient(self, strategy_fn,
-                                                       non_finite_term):
-    loss_scale = loss_scale_module.FixedLossScale(32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x = variables.Variable(3.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        y = x * non_finite_term
-      return g.gradient(y, x)
-
-    dy_dx_list = self._run_with_strategy(run_fn, strategy)
-    check_fn = np.isposinf if non_finite_term == np.inf else np.isnan
-    for dy_dx in dy_dx_list:
-      self.assertTrue(check_fn(self.evaluate(dy_dx)))
-    self.assertEqual(self.evaluate(loss_scale()), 32.0)
-
-  @test_combinations.generate(test_combinations.combine(
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      use_tf_function=[True, False]
-  ))
-  def test_dynamic_loss_scaling_down_loop(self, strategy_fn, use_tf_function):
-    loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x = variables.Variable(3.0)
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        y = x * (3.0 * (10**37))  # grad will be inf after scaling
-      return g.gradient(y, x)
-
-    dy_dx_list = self._run_with_strategy(run_fn, strategy, use_tf_function)
-    self.assertEqual(self.evaluate(loss_scale()), 8.0)
-    for dy_dx in dy_dx_list:
-      self.assertAllClose(self.evaluate(dy_dx), (3.0 * (10**37)), atol=1e-06)
-
-  @test_combinations.generate(test_combinations.combine(
-      strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-      use_tf_function=[True, False]
-  ))
-  def test_dynamic_loss_scaling_inf_target_post_scale(self, strategy_fn,
-                                                      use_tf_function):
-    loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32.0)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x = variables.Variable(3.0 * (10**37))
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        y = x * 3.0  # target will be inf after scaling
-      return g.gradient(y, x)
-
-    dy_dx_list = self._run_with_strategy(run_fn, strategy, use_tf_function)
-    self.assertEqual(self.evaluate(loss_scale()), 32.0)
-    for dy_dx in dy_dx_list:
-      self.assertAllClose(self.evaluate(dy_dx), 3.0)
-
-  @test_combinations.generate(
-      test_combinations.combine(
-          loss_scale=[
-              loss_scale_module.FixedLossScale,
-              loss_scale_module.DynamicLossScale
-          ],
-          strategy_fn=[default_strategy_fn, create_mirrored_strategy],
-          use_tf_function=[True, False]))
-  def test_transpose(self, loss_scale, strategy_fn, use_tf_function):
-    # Calling tf.transpose insde a tf.function can cause static shape
-    # information to be lost. This tests that LossScaleGradientTape can handle
-    # this.
-    loss_scale = loss_scale(32)
-    strategy = strategy_fn()
-    with strategy.scope():
-      x = variables.Variable(array_ops.ones((2, 3)))
-
-    def run_fn():
-      with lsgt.LossScaleGradientTape(loss_scale) as g:
-        y = array_ops.transpose(x) * 2.
-      return g.gradient(y, x)
-
-    dy_dx_list = self._run_with_strategy(run_fn, strategy, use_tf_function)
-    self.assertEqual(loss_scale(), 32)
-    for dy_dx in dy_dx_list:
-      self.assertAllEqual(dy_dx, np.full((2, 3), 2.))
-
-  def test_passing_non_loss_scale_raises_error(self):
-    with self.assertRaisesRegex(
-        ValueError,
-        '`loss_scale` must be an instance of LossScale, but got: 2.0'):
-      lsgt.LossScaleGradientTape(2.0)
-
-  def test_jacobian_raises_error(self):
-    loss_scale = loss_scale_module.FixedLossScale(2.)
-    x = variables.Variable([1.0, 2.0])
-    with lsgt.LossScaleGradientTape(loss_scale) as g:
-      y = x * 2
-    with self.assertRaisesRegex(
-        NotImplementedError,
-        'LossScaleGradientTape.jacobian is not yet implemented'):
-      g.jacobian(y, x)
-
-    x = variables.Variable([[1.0, 2.0], [3.0, 4.0]])
-    with lsgt.LossScaleGradientTape(loss_scale) as g:
-      y = x * 2
-    with self.assertRaisesRegex(
-        NotImplementedError,
-        'LossScaleGradientTape.batch_jacobian is not yet implemented'):
-      g.batch_jacobian(y, x)
-
-
-if __name__ == '__main__':
-  v2_compat.enable_v2_behavior()
-  test.main()
diff --git a/tensorflow/python/training/tracking/asset.py b/tensorflow/python/training/tracking/asset.py
index 56a4907b178a8a..0fd10c4a35a898 100644
--- a/tensorflow/python/training/tracking/asset.py
+++ b/tensorflow/python/training/tracking/asset.py
@@ -83,9 +83,9 @@ def asset_path(self):
     return self._path
 
   @classmethod
-  def _deserialize_from_proto(cls, proto, export_dir, asset_file_def,
+  def _deserialize_from_proto(cls, object_proto, export_dir, asset_file_def,
                               **unused_kwargs):
-    proto = proto.asset
+    proto = object_proto.asset
     filename = file_io.join(
         saved_model_utils.get_assets_dir(export_dir),
         asset_file_def[proto.asset_file_def_index].filename)
diff --git a/tensorflow/python/training/tracking/resource.py b/tensorflow/python/training/tracking/resource.py
index 8c85dbcedbf126..283455048c9f4e 100644
--- a/tensorflow/python/training/tracking/resource.py
+++ b/tensorflow/python/training/tracking/resource.py
@@ -273,3 +273,49 @@ def __init__(self, device=""):
       resource_tracker.add_resource(self)
     super(TrackableResource, self).__init__(device=device)
 
+
+# TODO(b/124205571,b/124092991): Solve destruction of resources.
+class RestoredResource(TrackableResource):
+  """Restored SavedResource."""
+
+  def __init__(self, device=""):
+    super(RestoredResource, self).__init__(device=device)
+
+  def _create_resource(self):
+    raise RuntimeError()
+
+  def _initialize(self):
+    raise RuntimeError()
+
+  # _list_functions_for_serialization expects Function objects, but unlike
+  # _create_resource and _initialize, _destroy_resource didn't always exist in
+  # older TrackableResource implementations, so this default stub must be a
+  # Function.
+  @def_function.function
+  def _destroy_resource(self):
+    raise RuntimeError()
+
+  def _list_functions_for_serialization(self, unused_serialization_cache):
+    # Overwrite this method to avoid the implementation of
+    # base class to re-wrap the polymorphic functions into
+    # another layer of `tf.function`.
+    functions = {
+        "_create_resource": self._create_resource,
+        "_initialize": self._initialize,
+        "_destroy_resource": self._destroy_resource,
+    }
+    return functions
+
+  @classmethod
+  def _deserialize_from_proto(cls, object_proto, dependencies, **unused_kwargs):
+    obj = cls(device=object_proto.resource.device)
+    resource_creator = dependencies.get("_create_resource")
+    if resource_creator is not None:
+      obj._create_resource = resource_creator  # pylint: disable=protected-access
+    return obj
+
+  def _add_trackable_child(self, name, value):
+    setattr(self, name, value)
+    if (isinstance(value, base.Trackable) and
+        not isinstance(value, def_function.Function)):
+      self._track_trackable(value, name)
diff --git a/tensorflow/python/training/tracking/trackable_utils.py b/tensorflow/python/training/tracking/trackable_utils.py
index 8f1f207b7f8e2a..6353ec341aec7a 100644
--- a/tensorflow/python/training/tracking/trackable_utils.py
+++ b/tensorflow/python/training/tracking/trackable_utils.py
@@ -78,7 +78,7 @@ def order_by_dependency(dependency_map):
   while to_visit:
     x = to_visit.pop(0)
     reversed_dependency_arr.append(x)
-    for dep in dependency_map[x]:
+    for dep in set(dependency_map[x]):
       edges = reverse_dependency_map[dep]
       edges.remove(x)
       if not edges:
diff --git a/tensorflow/python/training/tracking/util.py b/tensorflow/python/training/tracking/util.py
index 16efdff9fcf26a..1915f9566d4d8b 100644
--- a/tensorflow/python/training/tracking/util.py
+++ b/tensorflow/python/training/tracking/util.py
@@ -1252,11 +1252,13 @@ def save(self, file_prefix, checkpoint_number=None, session=None,
       feed_dict[file_prefix_tensor] = file_prefix
     else:
       with ops.device("/cpu:0"):
-        file_prefix_tensor = constant_op.constant(
+        file_prefix_tensor = ops.convert_to_tensor(
             file_prefix, dtype=dtypes.string)
       object_graph_tensor = None
 
-    file_io.recursive_create_dir(os.path.dirname(file_prefix))
+    if not tensor_util.is_tensor(file_prefix):
+      file_io.recursive_create_dir(os.path.dirname(file_prefix))
+
     save_path, new_feed_additions = self._save_cached_when_graph_building(
         file_prefix_tensor, object_graph_tensor, options)
     if new_feed_additions:
diff --git a/tensorflow/python/training/tracking/util_test.py b/tensorflow/python/training/tracking/util_test.py
index 3df97a3a2a96f9..fb4c835b74533d 100644
--- a/tensorflow/python/training/tracking/util_test.py
+++ b/tensorflow/python/training/tracking/util_test.py
@@ -741,7 +741,8 @@ def test_restore_after_adding_empty_trackable_data_structure(self):
     load_status.assert_existing_objects_matched().run_restore_ops()
 
   @test_util.run_in_graph_and_eager_modes
-  def test_write_checkpoint_from_function(self):
+  def test_write_checkpoint_path_str_from_function(self):
+
     checkpoint_prefix = os.path.join(self.get_temp_dir(), "ckpt")
     save_checkpoint = trackable_utils.Checkpoint(v=variables_lib.Variable(1.))
 
@@ -769,6 +770,58 @@ def _write_checkpoint():
     status.run_restore_ops()
     self.assertEqual(3., self.evaluate(load_checkpoint.v))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_write_checkpoint_path_tensor_from_function(self):
+    # Same as the previous test, but the path is a tensor not a python string.
+    checkpoint_prefix = os.path.join(self.get_temp_dir(), "ckpt")
+
+    checkpoint_prefix_tensor = constant_op.constant(checkpoint_prefix)
+
+    save_checkpoint = trackable_utils.Checkpoint(v=variables_lib.Variable(1.))
+
+    @def_function.function
+    def _write_checkpoint(prefix):
+      save_path = save_checkpoint.write(prefix)
+      return save_path
+
+    self.evaluate([save_checkpoint.v.initializer])
+    self.evaluate(_write_checkpoint(checkpoint_prefix_tensor))
+    load_checkpoint = trackable_utils.Checkpoint(v=variables_lib.Variable(0.))
+    # Use read() instead of restore() which allows us to check that all
+    # existing objects were loaded.
+    status = load_checkpoint.read(checkpoint_prefix)
+    status.assert_existing_objects_matched()
+    status.assert_consumed()
+    status.run_restore_ops()
+    self.assertEqual(1., self.evaluate(load_checkpoint.v))
+    self.evaluate(save_checkpoint.v.assign(3.))
+    self.evaluate(_write_checkpoint(checkpoint_prefix_tensor))
+    self.evaluate(save_checkpoint.v.assign(0.))
+    status = load_checkpoint.read(checkpoint_prefix)
+    status.assert_existing_objects_matched()
+    status.assert_consumed()
+    status.run_restore_ops()
+    self.assertEqual(3., self.evaluate(load_checkpoint.v))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_write_checkpoint_path_tensor_does_not_exist_from_function(self):
+    # Same as the previous test, but the path is a tensor not a python string.
+    checkpoint_prefix = os.path.join(
+        self.get_temp_dir(), "DOES_NOT_EXIST", "ckpt")
+
+    checkpoint_prefix_tensor = constant_op.constant(checkpoint_prefix)
+
+    save_checkpoint = trackable_utils.Checkpoint(v=variables_lib.Variable(1.))
+
+    @def_function.function
+    def _write_checkpoint(prefix):
+      save_path = save_checkpoint.write(prefix)
+      return save_path
+
+    self.evaluate([save_checkpoint.v.initializer])
+    with self.assertRaises(errors_impl.NotFoundError):
+      self.evaluate(_write_checkpoint(checkpoint_prefix_tensor))
+
   def test_inititialize_with_data_structures(self):
     checkpoint = trackable_utils.Checkpoint(
         a=[variables_lib.Variable(0.), variables_lib.Variable(1.)],
diff --git a/tensorflow/python/util/BUILD b/tensorflow/python/util/BUILD
index 3d42eaf8eda2da..128b370632fd20 100644
--- a/tensorflow/python/util/BUILD
+++ b/tensorflow/python/util/BUILD
@@ -369,6 +369,7 @@ tf_py_test(
     name = "tf_stack_test",
     srcs = ["tf_stack_test.py"],
     python_version = "PY3",
+    tags = ["no_oss_py310"],  # b/207152102
     deps = [
         ":tf_export",
         ":tf_stack",
@@ -524,6 +525,7 @@ tf_py_test(
     srcs = ["nest_test.py"],
     main = "nest_test.py",
     python_version = "PY3",
+    tags = ["no_oss_py310"],  # b/207151276
     deps = [":nest_test_main_lib"],
 )
 
diff --git a/tensorflow/python/util/lock_util.py b/tensorflow/python/util/lock_util.py
index a73a23bbb72c41..6832011e155093 100644
--- a/tensorflow/python/util/lock_util.py
+++ b/tensorflow/python/util/lock_util.py
@@ -99,7 +99,7 @@ def release(self, group_id):
     self._ready.acquire()
     self._group_member_counts[group_id] -= 1
     if self._group_member_counts[group_id] == 0:
-      self._ready.notifyAll()
+      self._ready.notify_all()
     self._ready.release()
 
   def _another_group_active(self, group_id):
diff --git a/tensorflow/stream_executor/cuda/cuda_asm_compiler.cc b/tensorflow/stream_executor/cuda/cuda_asm_compiler.cc
index 64b8ebbc543d24..a76e718f96c6f8 100644
--- a/tensorflow/stream_executor/cuda/cuda_asm_compiler.cc
+++ b/tensorflow/stream_executor/cuda/cuda_asm_compiler.cc
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <tuple>
-
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/stream_executor/gpu/asm_compiler.h"
 #include "tensorflow/stream_executor/gpu/gpu_diagnostics.h"
@@ -37,24 +35,20 @@ namespace stream_executor {
 
 port::StatusOr<std::vector<uint8>> LinkGpuAsm(
     gpu::GpuContext* context, std::vector<CubinOrPTXImage> images) {
-  const bool linking_supported = [] {
+  const port::Status linking_supported = [] {
     if (CUDA_VERSION < 11030) {
-      return true;
-    }
-    auto version_or_status = gpu::Diagnostician::FindKernelDriverVersion();
-    if (!version_or_status.ok()) {
-      LOG(WARNING) << "Couldn't read CUDA driver version.";
-      return false;
+      return port::Status::OK();
     }
-    std::tuple<int, int, int> version = *version_or_status;
-    if (CUDA_VERSION < 11040) return version >= std::make_tuple(465, 19, 1);
-    if (CUDA_VERSION < 11050) return version >= std::make_tuple(470, 82, 1);
-    return version >= std::make_tuple(495, 29, 5);
+    int driver_cuda_version;
+    // Get the highest version of CUDA supported by this driver.
+    RETURN_IF_CUDA_ERROR(cuDriverGetVersion(&driver_cuda_version));
+    return driver_cuda_version >= CUDA_VERSION
+               ? port::Status::OK()
+               : tensorflow::errors::Unimplemented(
+                     "CUDA version unsupported by NVIDIA driver version.");
   }();
 
-  if (!linking_supported) {
-    return tensorflow::errors::Unimplemented("Linking is unsupported");
-  }
+  TF_RETURN_IF_ERROR(linking_supported);
 
   gpu::ScopedActivateContext activation(context);
 
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 210239cd6f3aaa..fac20f0f393389 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -219,11 +219,20 @@ class ScopedCublasMathMode {
 };
 #endif  // CUDA_VERSION >= 9000
 
+static const char *const kCublasNotInitializedExplanation =
+    "Failure to initialize cublas may be due to OOM (cublas needs some free "
+    "memory when you initialize it, and your deep-learning framework may have "
+    "preallocated more than its fair share), or may be because this binary was "
+    "not built with support for the GPU in your machine.";
+
 bool CUDABlas::Init() {
   gpu::ScopedActivateExecutorContext sac{parent_};
   cublasStatus_t ret = cublasCreate(&blas_);
   if (ret != CUBLAS_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to create cublas handle: " << ToString(ret);
+    if (ret == CUBLAS_STATUS_NOT_INITIALIZED) {
+      LOG(ERROR) << kCublasNotInitializedExplanation;
+    }
     return false;
   }
 
@@ -231,6 +240,9 @@ bool CUDABlas::Init() {
   ret = cublasLtCreate(&blasLt_);
   if (ret != CUBLAS_STATUS_SUCCESS) {
     LOG(ERROR) << "failed to create cublasLt handle: " << ToString(ret);
+    if (ret == CUBLAS_STATUS_NOT_INITIALIZED) {
+      LOG(ERROR) << kCublasNotInitializedExplanation;
+    }
     return false;
   }
 #endif  // CUDA_VERSION >= 11000
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 43e59bb6e57970..e0a6050abc7a18 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -25,8 +25,10 @@ limitations under the License.
 #include <functional>
 #include <limits>
 #include <memory>
+#include <string>
 #include <tuple>
 #include <type_traits>
+#include <utility>
 
 #include "google/protobuf/wrappers.pb.h"
 #include "absl/types/optional.h"
@@ -783,12 +785,12 @@ class AlgorithmDesc {
   typedef int64_t Index;
   AlgorithmDesc() : AlgorithmDesc(0, false, absl::nullopt) {}
   explicit AlgorithmDesc(AlgorithmProto proto) : proto_(std::move(proto)) {}
-  AlgorithmDesc(Index a, bool use_tensor_ops)
-      : AlgorithmDesc(a, use_tensor_ops, absl::nullopt) {}
-  AlgorithmDesc(Index a, bool use_tensor_ops,
+  AlgorithmDesc(Index algo_id, bool use_tensor_ops)
+      : AlgorithmDesc(algo_id, use_tensor_ops, absl::nullopt) {}
+  AlgorithmDesc(Index algo_id, bool use_tensor_ops,
                 absl::optional<uint64_t> workspace_size) {
     proto_.set_is_cudnn_frontend(false);
-    proto_.set_algo_id(a);
+    proto_.set_algo_id(algo_id);
     proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH
                                         : AlgorithmProto::DEFAULT_MATH);
     if (workspace_size) {
diff --git a/tensorflow/stream_executor/rocm/rocsolver_wrapper.h b/tensorflow/stream_executor/rocm/rocsolver_wrapper.h
index fcdf1ecd102aa9..c13a778d5590f4 100644
--- a/tensorflow/stream_executor/rocm/rocsolver_wrapper.h
+++ b/tensorflow/stream_executor/rocm/rocsolver_wrapper.h
@@ -1,4 +1,4 @@
-/* Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -98,7 +98,7 @@ namespace wrap {
   __macro(rocsolver_cunmqr)                 \
   __macro(rocsolver_zunmqr)                \
   __macro(rocsolver_cungqr)                \
-  __macro(rocsolver_zungqr)                
+  __macro(rocsolver_zungqr)
 // clang-format on
 
 FOREACH_ROCSOLVER_API(ROCSOLVER_API_WRAPPER)
diff --git a/tensorflow/stream_executor/tpu/tpu_executor_c_api.h b/tensorflow/stream_executor/tpu/tpu_executor_c_api.h
index b758d6a1ef2fa6..563406ccb373b6 100644
--- a/tensorflow/stream_executor/tpu/tpu_executor_c_api.h
+++ b/tensorflow/stream_executor/tpu/tpu_executor_c_api.h
@@ -291,6 +291,10 @@ void TpuHostLocation_Cores(SE_TpuTopology_Host* tpu_host_location,
                            TpuCoreTypeEnum tpu_core_type,
                            SE_TpuTopology_Core** cores);
 
+// Async collective offloading.
+void TpuAsyncCollectiveOffloadHelper_Init();
+void TpuAsyncCollectiveOffloadHelper_Shutdown();
+
 // C API for XLA::Compiler interface
 
 TFTPU_CAPI_EXPORT Tpu_Compiler* TpuCompiler_New();
@@ -540,6 +544,9 @@ struct TfTpu_ExecutorApiFn {
 
   TFTPU_ADD_FN_IN_STRUCT(XlaShapeToTpuShapeRepresentation);
   TFTPU_ADD_FN_IN_STRUCT(XlaShapeToTpuPaddedShape);
+
+  TFTPU_ADD_FN_IN_STRUCT(TpuAsyncCollectiveOffloadHelper_Init);
+  TFTPU_ADD_FN_IN_STRUCT(TpuAsyncCollectiveOffloadHelper_Shutdown);
 };
 }
 
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index ec445ab1718a81..f87e655aaeec9d 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -763,6 +763,132 @@ def tf_cc_shared_object(
             visibility = visibility,
         )
 
+# copybara:comment_begin(oss only)
+def tf_cc_shared_library(
+        name,
+        srcs = [],
+        static_deps = [],
+        deps = [],
+        data = [],
+        copts = [],
+        linkopts = lrt_if_needed(),
+        additional_linker_inputs = [],
+        linkstatic = True,
+        framework_so = tf_binary_additional_srcs(),
+        soversion = None,
+        per_os_targets = False,  # TODO(rostam): Should be deprecated.
+        win_def_file = None,
+        visibility = None):
+    """Configures the shared object file for TensorFlow."""
+    if soversion != None:
+        suffix = "." + str(soversion).split(".")[0]
+        longsuffix = "." + str(soversion)
+    else:
+        suffix = ""
+        longsuffix = ""
+
+    if per_os_targets:
+        names = [
+            (
+                pattern % (name, ""),
+                pattern % (name, suffix),
+                pattern % (name, longsuffix),
+            )
+            for pattern in SHARED_LIBRARY_NAME_PATTERNS
+        ]
+    else:
+        names = [(
+            name,
+            name + suffix,
+            name + longsuffix,
+        )]
+
+    for name_os, name_os_major, name_os_full in names:
+        # Windows DLLs cannot be versioned.
+        if name_os.endswith(".dll"):
+            name_os_major = name_os
+            name_os_full = name_os
+
+        soname = name_os_major.split("/")[-1]
+
+        data_extra = []
+        if framework_so != []:
+            data_extra = tf_binary_additional_data_deps()
+
+        cc_library_name = name_os_full + "_cclib"
+        cc_library(
+            name = cc_library_name,
+            srcs = srcs + framework_so,
+            deps = deps,
+            copts = copts,
+            linkstatic = linkstatic,
+            win_def_file = win_def_file,
+        )
+
+        cc_shared_library_name = name_os_full + "_ccsharedlib"
+        shared_lib_name = name_os_full + "_sharedlibname"
+        cc_shared_library(
+            name = cc_shared_library_name,
+            roots = [cc_library_name],
+            static_deps = static_deps,
+            data = data + data_extra,
+            shared_lib_name = shared_lib_name,
+            user_link_flags = linkopts + _rpath_user_link_flags(shared_lib_name) + select({
+                clean_dep("//tensorflow:ios"): [
+                    "-Wl,-install_name,@rpath/" + soname,
+                ],
+                clean_dep("//tensorflow:macos"): [
+                    "-Wl,-install_name,@rpath/" + soname,
+                ],
+                clean_dep("//tensorflow:windows"): [],
+                "//conditions:default": [
+                    "-Wl,-soname," + soname,
+                ],
+            }),
+            additional_linker_inputs = additional_linker_inputs,
+            visibility = visibility,
+        )
+        native.alias(
+            name = shared_lib_name,
+            actual = cc_shared_library_name,
+            visibility = visibility,
+        )
+        filegroup(
+            name = name_os_full,
+            srcs = [shared_lib_name],
+            output_group = "custom_name_shared_library",
+        )
+
+        if name_os != name_os_major:
+            native.genrule(
+                name = name_os + "_sym",
+                outs = [name_os],
+                srcs = [name_os_major],
+                output_to_bindir = 1,
+                cmd = "ln -sf $$(basename $<) $@",
+            )
+            native.genrule(
+                name = name_os_major + "_sym",
+                outs = [name_os_major],
+                srcs = [name_os_full],
+                output_to_bindir = 1,
+                cmd = "ln -sf $$(basename $<) $@",
+            )
+
+    flat_names = [item for sublist in names for item in sublist]
+    if name not in flat_names:
+        native.filegroup(
+            name = name,
+            srcs = select({
+                clean_dep("//tensorflow:windows"): [":%s.dll" % (name)],
+                clean_dep("//tensorflow:macos"): [":lib%s%s.dylib" % (name, longsuffix)],
+                "//conditions:default": [":lib%s.so%s" % (name, longsuffix)],
+            }),
+            visibility = visibility,
+        )
+
+# copybara:comment_end
+
 # Links in the framework shared object
 # (//third_party/tensorflow:libtensorflow_framework.so) when not building
 # statically. Also adds linker options (rpaths) so that the framework shared
@@ -1500,10 +1626,16 @@ def tf_java_test(
         kernels = [],
         *args,
         **kwargs):
+    cc_library_name = name + "_cclib"
+    cc_library(
+        # TODO(b/183579145): Remove when cc_shared_library supports CcInfo or JavaInfo providers .
+        name = cc_library_name,
+        srcs = tf_binary_additional_srcs(fullversion = True) + tf_binary_dynamic_kernel_dsos() + tf_binary_dynamic_kernel_deps(kernels),
+    )
     native.java_test(
         name = name,
         srcs = srcs,
-        deps = deps + tf_binary_additional_srcs(fullversion = True) + tf_binary_dynamic_kernel_dsos() + tf_binary_dynamic_kernel_deps(kernels),
+        deps = deps + [cc_library_name],
         *args,
         **kwargs
     )
@@ -2882,8 +3014,177 @@ def pybind_extension(
         compatible_with = compatible_with,
     )
 
-# buildozer: enable=function-docstring-args
+# copybara:comment_begin(oss only)
+def pybind_ccsharedlib_extension(
+        name,
+        srcs,
+        module_name,
+        hdrs = [],
+        static_deps = [],
+        deps = [],
+        additional_exported_symbols = [],
+        compatible_with = None,
+        copts = [],
+        data = [],
+        defines = [],
+        deprecation = None,
+        features = [],
+        link_in_framework = False,
+        licenses = None,
+        linkopts = [],
+        pytype_deps = [],
+        pytype_srcs = [],
+        restricted_to = None,
+        srcs_version = "PY3",
+        testonly = None,
+        visibility = None):
+    """Builds a generic Python extension module."""
+    _ignore = [module_name]
+    p = name.rfind("/")
+    if p == -1:
+        sname = name
+        prefix = ""
+    else:
+        sname = name[p + 1:]
+        prefix = name[:p + 1]
+    so_file = "%s%s.so" % (prefix, sname)
+    pyd_file = "%s%s.pyd" % (prefix, sname)
+    exported_symbols = [
+        "init%s" % sname,
+        "init_%s" % sname,
+        "PyInit_%s" % sname,
+    ] + additional_exported_symbols
+
+    exported_symbols_file = "%s-exported-symbols.lds" % name
+    version_script_file = "%s-version-script.lds" % name
+
+    exported_symbols_output = "\n".join(["_%s" % symbol for symbol in exported_symbols])
+    version_script_output = "\n".join([" %s;" % symbol for symbol in exported_symbols])
+
+    native.genrule(
+        name = name + "_exported_symbols",
+        outs = [exported_symbols_file],
+        cmd = "echo '%s' >$@" % exported_symbols_output,
+        output_licenses = ["unencumbered"],
+        visibility = ["//visibility:private"],
+        testonly = testonly,
+    )
+
+    native.genrule(
+        name = name + "_version_script",
+        outs = [version_script_file],
+        cmd = "echo '{global:\n%s\n local: *;};' >$@" % version_script_output,
+        output_licenses = ["unencumbered"],
+        visibility = ["//visibility:private"],
+        testonly = testonly,
+    )
 
+    # TODO(rostam): Add libtensorflow_framework.so to `dynamic_deps`.
+    if link_in_framework:
+        srcs += tf_binary_additional_srcs()
+
+    cc_library_name = so_file + "_cclib"
+    cc_library(
+        name = cc_library_name,
+        hdrs = hdrs,
+        srcs = srcs,
+        deps = deps,
+        compatible_with = compatible_with,
+        copts = copts + [
+            "-fno-strict-aliasing",
+            "-fexceptions",
+        ] + select({
+            clean_dep("//tensorflow:windows"): [],
+            "//conditions:default": [
+                "-fvisibility=hidden",
+            ],
+        }),
+        defines = defines,
+        features = features + ["-use_header_modules"],
+        restricted_to = restricted_to,
+    )
+    cc_shared_library_name = name + "_ccsharedlib"
+    cc_shared_library(
+        name = cc_shared_library_name,
+        roots = [cc_library_name],
+        static_deps = static_deps,
+        data = data,
+        additional_linker_inputs = [
+            exported_symbols_file,
+            version_script_file,
+        ],
+        compatible_with = compatible_with,
+        deprecation = deprecation,
+        features = features + ["-use_header_modules"],
+        licenses = licenses,
+        restricted_to = restricted_to,
+        shared_lib_name = so_file,
+        testonly = testonly,
+        user_link_flags = linkopts + _rpath_user_link_flags(name) + select({
+            clean_dep("//tensorflow:macos"): [
+                # TODO: the -w suppresses a wall of harmless warnings about hidden typeinfo symbols
+                # not being exported.  There should be a better way to deal with this.
+                "-Wl,-w",
+                "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file,
+            ],
+            clean_dep("//tensorflow:windows"): [],
+            "//conditions:default": [
+                "-Wl,--version-script",
+                "$(location %s)" % version_script_file,
+            ],
+        }),
+        visibility = visibility,
+    )
+    native.alias(
+        name = so_file,
+        actual = cc_shared_library_name,
+        compatible_with = compatible_with,
+        deprecation = deprecation,
+        features = features + ["-use_header_modules"],
+        restricted_to = restricted_to,
+        testonly = testonly,
+        visibility = visibility,
+    )
+
+    # Solution to avoid the error "variable '$<' : more than one input file."
+    filegroup_name = name + "_filegroup"
+    filegroup(
+        name = filegroup_name,
+        srcs = [so_file],
+        output_group = "custom_name_shared_library",
+    )
+
+    native.genrule(
+        name = name + "_pyd_copy",
+        srcs = [filegroup_name],
+        outs = [pyd_file],
+        cmd = "cp $< $@",
+        output_to_bindir = True,
+        visibility = visibility,
+        deprecation = deprecation,
+        restricted_to = restricted_to,
+        compatible_with = compatible_with,
+        testonly = testonly,
+    )
+    native.py_library(
+        name = name,
+        data = select({
+            "@org_tensorflow//tensorflow:windows": [pyd_file],
+            "//conditions:default": [so_file],
+        }) + pytype_srcs,
+        deps = pytype_deps,
+        srcs_version = srcs_version,
+        licenses = licenses,
+        testonly = testonly,
+        visibility = visibility,
+        deprecation = deprecation,
+        restricted_to = restricted_to,
+        compatible_with = compatible_with,
+    )
+
+# copybara:comment_end
+
+# buildozer: enable=function-docstring-args
 def tf_python_pybind_extension(
         name,
         srcs,
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-feature-config.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-feature-config.pbtxt
index 2e3efbd0ac0c47..71ae8d89baded7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-feature-config.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-feature-config.pbtxt
@@ -4,6 +4,6 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'table\', \'max_sequence_length\', \'validate_weights_and_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'table\', \'max_sequence_length\', \'validate_weights_and_indices\', \'output_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-t-p-u-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-t-p-u-embedding.pbtxt
index 5535cd503b312f..db3691a460abe9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-t-p-u-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.tpu.experimental.embedding.-t-p-u-embedding.pbtxt
@@ -18,7 +18,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'per_replica_batch_size\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'per_replica_input_shapes\', \'per_replica_batch_size\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "dequeue"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-feature-config.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-feature-config.pbtxt
index 2e3efbd0ac0c47..71ae8d89baded7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-feature-config.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-feature-config.pbtxt
@@ -4,6 +4,6 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'table\', \'max_sequence_length\', \'validate_weights_and_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], "
+    argspec: "args=[\'self\', \'table\', \'max_sequence_length\', \'validate_weights_and_indices\', \'output_shape\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-t-p-u-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-t-p-u-embedding.pbtxt
index 5535cd503b312f..db3691a460abe9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-t-p-u-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.tpu.experimental.embedding.-t-p-u-embedding.pbtxt
@@ -18,7 +18,7 @@ tf_class {
   }
   member_method {
     name: "build"
-    argspec: "args=[\'self\', \'per_replica_batch_size\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'per_replica_input_shapes\', \'per_replica_batch_size\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "dequeue"
diff --git a/tensorflow/tools/ci_build/devtoolset/build_devtoolset.sh b/tensorflow/tools/ci_build/devtoolset/build_devtoolset.sh
index beb495feb8711a..243ca350e1883e 100755
--- a/tensorflow/tools/ci_build/devtoolset/build_devtoolset.sh
+++ b/tensorflow/tools/ci_build/devtoolset/build_devtoolset.sh
@@ -184,7 +184,7 @@ esac
 # TODO(klimek): Automate linking in all non-gcc / non-kernel include
 # directories.
 mkdir -p "/${TARGET}/usr/include/x86_64-linux-gnu"
-PYTHON_VERSIONS=("python3.6m" "python3.7m" "python3.8" "python3.9" "python3.10")
+PYTHON_VERSIONS=("python3.7m" "python3.8" "python3.9" "python3.10")
 for v in "${PYTHON_VERSIONS[@]}"; do
   ln -s "/usr/local/include/${v}" "/${TARGET}/usr/include/x86_64-linux-gnu/${v}"
 done
diff --git a/tensorflow/tools/ci_build/install/install_bazel.sh b/tensorflow/tools/ci_build/install/install_bazel.sh
index 063e1f97bace86..093e6f74519560 100755
--- a/tensorflow/tools/ci_build/install/install_bazel.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel.sh
@@ -15,7 +15,7 @@
 # ==============================================================================
 
 # Select bazel version.
-BAZEL_VERSION="3.7.2"
+BAZEL_VERSION="4.2.1"
 
 set +e
 local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
index 01520124dd9bb5..16546b90ecca33 100755
--- a/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
+++ b/tensorflow/tools/ci_build/install/install_bazel_from_source.sh
@@ -18,7 +18,7 @@
 # It will compile bazel from source and install it in /usr/local/bin
 
 # Select bazel version.
-BAZEL_VERSION="3.7.2"
+BAZEL_VERSION="4.2.1"
 
 set +e
 local_bazel_ver=$(bazel version 2>&1 | grep -i label | awk '{print $3}')
diff --git a/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh b/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh
index d3973ca5c3f385..2c4169c079296e 100644
--- a/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh
+++ b/tensorflow/tools/ci_build/presubmit/macos/py37_cc/build.sh
@@ -49,6 +49,9 @@ function run_build () {
     --remote_timeout=600 \
     -- //tensorflow/python/... # ${DEFAULT_BAZEL_TARGETS} -//tensorflow/java/... -//tensorflow/lite/java/...
 
+  # Print build time statistics, including critical path.
+  "${BAZEL_WRAPPER_PATH}" analyze-profile "${KOKORO_ARTIFACTS_DIR}/profile.json"
+
   # Copy log to output to be available to GitHub
   ls -la "$(bazel info output_base)/java.log"
   cp "$(bazel info output_base)/java.log" "${KOKORO_ARTIFACTS_DIR}/"
diff --git a/tensorflow/tools/ci_build/presubmit/ubuntu_16/android/build.sh b/tensorflow/tools/ci_build/presubmit/ubuntu_16/android/build.sh
index 3ec40e1950dd66..9513a60ceee0b8 100644
--- a/tensorflow/tools/ci_build/presubmit/ubuntu_16/android/build.sh
+++ b/tensorflow/tools/ci_build/presubmit/ubuntu_16/android/build.sh
@@ -29,7 +29,7 @@ function run_build () {
   export NDK_HOME=$ANDROID_NDK_HOME
   export ANDROID_SDK_HOME="/opt/android-sdk/current"
   export ANDROID_API_LEVEL="23"
-  export ANDROID_BUILD_TOOLS_VERSION="28.0.0"
+  export ANDROID_BUILD_TOOLS_VERSION="30.0.0"
 
   ANDROID_OUT=android.out
   ANDROID_OUT_TARGET=gen_android_out
diff --git a/tensorflow/tools/ci_build/presubmit/ubuntu_16/cpu_py39_full/build.sh b/tensorflow/tools/ci_build/presubmit/ubuntu_16/cpu_py39_full/build.sh
index 477db4c57d59af..00d62991591557 100644
--- a/tensorflow/tools/ci_build/presubmit/ubuntu_16/cpu_py39_full/build.sh
+++ b/tensorflow/tools/ci_build/presubmit/ubuntu_16/cpu_py39_full/build.sh
@@ -36,6 +36,7 @@ tag_filters="-no_oss,-oss_serial,-gpu,-tpu,-benchmark-test""$(maybe_skip_v1)"
 # Run bazel test command.
 "${BAZEL_WRAPPER_PATH}" \
   test \
+  --profile="${KOKORO_ARTIFACTS_DIR}/profile.json" \
   --config=rbe_cpu_linux \
   --config=rbe_linux_py3 \
   --python_path="/usr/bin/python3.9" \
diff --git a/tensorflow/tools/ci_build/presubmit/ubuntu_16/gpu_py39_full/build.sh b/tensorflow/tools/ci_build/presubmit/ubuntu_16/gpu_py39_full/build.sh
index fa70ce435f39ed..4b2113b77d9de8 100644
--- a/tensorflow/tools/ci_build/presubmit/ubuntu_16/gpu_py39_full/build.sh
+++ b/tensorflow/tools/ci_build/presubmit/ubuntu_16/gpu_py39_full/build.sh
@@ -36,6 +36,7 @@ source tensorflow/tools/ci_build/build_scripts/DEFAULT_TEST_TARGETS.sh
 # Run bazel test command.
 "${BAZEL_WRAPPER_PATH}" \
   test \
+  --profile="${KOKORO_ARTIFACTS_DIR}/profile.json" \
   --config=rbe_linux_cuda_nvcc_py39 \
   --config=tensorflow_testing_rbe_linux \
   --test_tag_filters="${tag_filters}" \
diff --git a/tensorflow/tools/ci_build/presubmit/ubuntu_16/sanity/build.sh b/tensorflow/tools/ci_build/presubmit/ubuntu_16/sanity/build.sh
index 9da4e82458edd9..a45d70d00edce5 100644
--- a/tensorflow/tools/ci_build/presubmit/ubuntu_16/sanity/build.sh
+++ b/tensorflow/tools/ci_build/presubmit/ubuntu_16/sanity/build.sh
@@ -67,6 +67,7 @@ EOF
   "${BAZEL_WRAPPER_PATH}" \
     --host_jvm_args=-Dbazel.DigestFunction=SHA256 \
     test \
+    --profile="${KOKORO_ARTIFACTS_DIR}/profile.json" \
     --test_output=all \
     tensorflow/tools/ci_build:${SANITY_OUT_TARGET}
 
diff --git a/tensorflow/tools/ci_build/release/common.sh b/tensorflow/tools/ci_build/release/common.sh
index 2eb85268d897a7..9a7e8a96a430df 100644
--- a/tensorflow/tools/ci_build/release/common.sh
+++ b/tensorflow/tools/ci_build/release/common.sh
@@ -17,7 +17,7 @@
 
 # Keep in sync with tensorflow_estimator and configure.py.
 # LINT.IfChange
-LATEST_BAZEL_VERSION=3.7.2
+LATEST_BAZEL_VERSION=4.2.1
 # LINT.ThenChange(
 #   //tensorflow/opensource_only/configure.py,
 #   //tensorflow_estimator/google/kokoro/common.sh,
@@ -426,12 +426,13 @@ function test_xml_summary_exit {
   exit "${RETVAL}"
 }
 
+# Note: The Docker-based Ubuntu TF-nightly jobs do not use this list. They use
+# https://github.com/tensorflow/build/blob/master/tf_sig_build_dockerfiles/devel.usertools/wheel_verification.bats
+# instead. See go/tf-devinfra/docker.
 # CPU size
 MAC_CPU_MAX_WHL_SIZE=225M
-LINUX_CPU_MAX_WHL_SIZE=200M
 WIN_CPU_MAX_WHL_SIZE=170M
 # GPU size
-LINUX_GPU_MAX_WHL_SIZE=500M
 WIN_GPU_MAX_WHL_SIZE=345M
 
 function test_tf_whl_size() {
@@ -445,12 +446,6 @@ function test_tf_whl_size() {
     # Check MAC CPU whl size.
     if [[ "$WHL_PATH" == *"-macos"* ]] && [[ $(find $WHL_PATH -type f -size +${MAC_CPU_MAX_WHL_SIZE}) ]]; then
         echo "Mac CPU whl size has exceeded ${MAC_CPU_MAX_WHL_SIZE}. To keep
-within pypi's CDN distribution limit, we must not exceed that threshold."
-      return 1
-    fi
-    # Check Linux CPU whl size.
-    if [[ "$WHL_PATH" == *"-manylinux"* ]] && [[ $(find $WHL_PATH -type f -size +${LINUX_CPU_MAX_WHL_SIZE}) ]]; then
-        echo "Linux CPU whl size has exceeded ${LINUX_CPU_MAX_WHL_SIZE}. To keep
 within pypi's CDN distribution limit, we must not exceed that threshold."
       return 1
     fi
@@ -460,14 +455,7 @@ within pypi's CDN distribution limit, we must not exceed that threshold."
 within pypi's CDN distribution limit, we must not exceed that threshold."
       return 1
     fi
-  # Check GPU whl size
   elif [[ "$WHL_PATH" == *"_gpu"* ]]; then
-    # Check Linux GPU whl size.
-    if [[ "$WHL_PATH" == *"-manylinux"* ]] && [[ $(find $WHL_PATH -type f -size +${LINUX_GPU_MAX_WHL_SIZE}) ]]; then
-        echo "Linux GPU whl size has exceeded ${LINUX_GPU_MAX_WHL_SIZE}. To keep
-within pypi's CDN distribution limit, we must not exceed that threshold."
-      return 1
-    fi
     # Check Windows GPU whl size.
     if [[ "$WHL_PATH" == *"-win"* ]] && [[ $(find $WHL_PATH -type f -size +${WIN_GPU_MAX_WHL_SIZE}) ]]; then
         echo "Windows GPU whl size has exceeded ${WIN_GPU_MAX_WHL_SIZE}. To keep
diff --git a/tensorflow/tools/ci_build/release/common_win.bat b/tensorflow/tools/ci_build/release/common_win.bat
index 539821f2584c1f..c0f6f8d11936e9 100644
--- a/tensorflow/tools/ci_build/release/common_win.bat
+++ b/tensorflow/tools/ci_build/release/common_win.bat
@@ -47,7 +47,7 @@ SET PATH=%CUDNN_INSTALL_PATH%\bin;%PATH%
 @REM Setup Bazel
 @REM
 :: Download Bazel from github and make sure its found in PATH.
-SET BAZEL_VERSION=3.7.2
+SET BAZEL_VERSION=4.2.1
 md C:\tools\bazel\
 wget -q https://github.com/bazelbuild/bazel/releases/download/%BAZEL_VERSION%/bazel-%BAZEL_VERSION%-windows-x86_64.exe -O C:/tools/bazel/bazel.exe
 SET PATH=C:\tools\bazel;%PATH%
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile
index 9364fb65581675..c66107bb102b46 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8-jupyter.Dockerfile
@@ -97,7 +97,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Build and install bazel
-ENV BAZEL_VERSION 3.7.2
+ENV BAZEL_VERSION 4.2.1
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile
index 023fa719a0f8bb..48036db2e217e4 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/arm64v8/devel-cpu-arm64v8.Dockerfile
@@ -97,7 +97,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Build and install bazel
-ENV BAZEL_VERSION 3.7.2
+ENV BAZEL_VERSION 4.2.1
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
index 1b87c68e22bb2b..f4360f1601cb58 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
@@ -94,7 +94,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
index 614b2a92c58b6a..3e677e9f7e7391 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
@@ -94,7 +94,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
index eaea422605770a..8ec526a6e454c0 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
@@ -139,7 +139,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
index b1bb1cd4e64fdd..146735c684e3f5 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
@@ -139,7 +139,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-jupyter.Dockerfile
index df6108aedab665..0ed45f65003eff 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-jupyter.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
 RUN sed -i 's#/usr/bin/python#/usr/bin/python2#g' /usr/bin/yum /usr/libexec/urlgrabber-ext-down
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpi-horovod-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpi-horovod-jupyter.Dockerfile
index 10d9c54ff8b7a6..6e8b363832d4bf 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpi-horovod-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpi-horovod-jupyter.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
 RUN sed -i 's#/usr/bin/python#/usr/bin/python2#g' /usr/bin/yum /usr/libexec/urlgrabber-ext-down
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpi-horovod.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpi-horovod.Dockerfile
index f5af16c9921d18..9717fb9c526980 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpi-horovod.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpi-horovod.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
 RUN sed -i 's#/usr/bin/python#/usr/bin/python2#g' /usr/bin/yum /usr/libexec/urlgrabber-ext-down
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpich-horovod-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpich-horovod-jupyter.Dockerfile
index d10bcbc3f3fee5..e875c58bd37e2d 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpich-horovod-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpich-horovod-jupyter.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
 RUN sed -i 's#/usr/bin/python#/usr/bin/python2#g' /usr/bin/yum /usr/libexec/urlgrabber-ext-down
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpich-horovod.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpich-horovod.Dockerfile
index 606fcb0779c8f1..93f449a7e34e67 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpich-horovod.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel-mpich-horovod.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
 RUN sed -i 's#/usr/bin/python#/usr/bin/python2#g' /usr/bin/yum /usr/libexec/urlgrabber-ext-down
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel.Dockerfile
index 2c83920349d541..f49ec6918ccb57 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-7-devel.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
 RUN sed -i 's#/usr/bin/python#/usr/bin/python2#g' /usr/bin/yum /usr/libexec/urlgrabber-ext-down
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-jupyter.Dockerfile
index fed78cc2d77d67..393272db61d53c 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-jupyter.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
     ln -sf $(which ${PYTHON}) /usr/bin/python
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpi-horovod-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpi-horovod-jupyter.Dockerfile
index eb61e7cf3b4ddf..2b1edbdc798be3 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpi-horovod-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpi-horovod-jupyter.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
     ln -sf $(which ${PYTHON}) /usr/bin/python
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpi-horovod.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpi-horovod.Dockerfile
index a7cd7e9e0ef817..6cc4ac01cc9634 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpi-horovod.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpi-horovod.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
     ln -sf $(which ${PYTHON}) /usr/bin/python
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpich-horovod-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpich-horovod-jupyter.Dockerfile
index 8ffc7a36f3ed2e..1b663a9c7390c1 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpich-horovod-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpich-horovod-jupyter.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
     ln -sf $(which ${PYTHON}) /usr/bin/python
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpich-horovod.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpich-horovod.Dockerfile
index 5b7cbd77814b7f..188c153cce28fb 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpich-horovod.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel-mpich-horovod.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
     ln -sf $(which ${PYTHON}) /usr/bin/python
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel.Dockerfile
index abd9773c3bacef..6805a36c700e82 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/centos-8-devel.Dockerfile
@@ -83,7 +83,7 @@ RUN ln -sf $(which ${PYTHON}) /usr/local/bin/python && \
     ln -sf $(which ${PYTHON}) /usr/bin/python
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-jupyter.Dockerfile
index 1e7507261e9142..40642fea0599e8 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-jupyter.Dockerfile
@@ -84,7 +84,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpi-horovod-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpi-horovod-jupyter.Dockerfile
index 957fcee8a77191..71f5f246eb8638 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpi-horovod-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpi-horovod-jupyter.Dockerfile
@@ -84,7 +84,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpi-horovod.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpi-horovod.Dockerfile
index 8db14e4247f0fe..e91091b919faa2 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpi-horovod.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpi-horovod.Dockerfile
@@ -84,7 +84,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpich-horovod-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpich-horovod-jupyter.Dockerfile
index 4d93eb5f1dfa04..91bc6ac271a345 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpich-horovod-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpich-horovod-jupyter.Dockerfile
@@ -84,7 +84,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpich-horovod.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpich-horovod.Dockerfile
index aa8b1c8859c522..8ca3dfb9c2aa4c 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpich-horovod.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel-mpich-horovod.Dockerfile
@@ -84,7 +84,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel.Dockerfile
index 76fa5052c8e45d..ed4bb0b5e09776 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-16.04-devel.Dockerfile
@@ -84,7 +84,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-jupyter.Dockerfile
index 4992db22468d8d..0fb5ba94169806 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-jupyter.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpi-horovod-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpi-horovod-jupyter.Dockerfile
index eeb716664f52dc..76f6865b768d89 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpi-horovod-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpi-horovod-jupyter.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpi-horovod.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpi-horovod.Dockerfile
index 4cc9b3161214a9..9c619e693cb64c 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpi-horovod.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpi-horovod.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpich-horovod-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpich-horovod-jupyter.Dockerfile
index fabfaf0344892a..4abbbb9b1d51fd 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpich-horovod-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpich-horovod-jupyter.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpich-horovod.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpich-horovod.Dockerfile
index 9fd24387f6cb57..4b1819be3a4574 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpich-horovod.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel-mpich-horovod.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel.Dockerfile
index 995bbc01b5bab7..6b531a74f6e19b 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-18.04-devel.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-jupyter.Dockerfile
index 4992db22468d8d..0fb5ba94169806 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-jupyter.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpi-horovod-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpi-horovod-jupyter.Dockerfile
index eeb716664f52dc..76f6865b768d89 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpi-horovod-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpi-horovod-jupyter.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpi-horovod.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpi-horovod.Dockerfile
index 4cc9b3161214a9..9c619e693cb64c 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpi-horovod.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpi-horovod.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpich-horovod-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpich-horovod-jupyter.Dockerfile
index fabfaf0344892a..4abbbb9b1d51fd 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpich-horovod-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpich-horovod-jupyter.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpich-horovod.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpich-horovod.Dockerfile
index 9fd24387f6cb57..4b1819be3a4574 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpich-horovod.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel-mpich-horovod.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel.Dockerfile
index 995bbc01b5bab7..6b531a74f6e19b 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/onednn/ubuntu-20.04-devel.Dockerfile
@@ -73,7 +73,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
index 253ff253464076..375c6a14606900 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le-jupyter.Dockerfile
@@ -92,7 +92,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Build and install bazel
-ENV BAZEL_VERSION 3.7.2
+ENV BAZEL_VERSION 4.2.1
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
index 067b01fc72d39e..59e8bdd8638688 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-cpu-ppc64le.Dockerfile
@@ -92,7 +92,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Build and install bazel
-ENV BAZEL_VERSION 3.7.2
+ENV BAZEL_VERSION 4.2.1
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
index 2296e9abeb5558..f4af623803d81d 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile
@@ -133,7 +133,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Build and install bazel
-ENV BAZEL_VERSION 3.7.2
+ENV BAZEL_VERSION 4.2.1
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
index c4af3088b084b8..7f990cb8a9a9bd 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile
@@ -133,7 +133,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Build and install bazel
-ENV BAZEL_VERSION 3.7.2
+ENV BAZEL_VERSION 4.2.1
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/dockerfiles/partials/onednn/centos/bazel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/onednn/centos/bazel.partial.Dockerfile
index 99166007f66d48..8cd34cf771fef0 100644
--- a/tensorflow/tools/dockerfiles/partials/onednn/centos/bazel.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/onednn/centos/bazel.partial.Dockerfile
@@ -1,5 +1,5 @@
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/partials/onednn/ubuntu/bazel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/onednn/ubuntu/bazel.partial.Dockerfile
index c1d597b8e169f1..7115434a81396c 100644
--- a/tensorflow/tools/dockerfiles/partials/onednn/ubuntu/bazel.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/onednn/ubuntu/bazel.partial.Dockerfile
@@ -2,7 +2,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
     curl
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     curl -fSsL -o /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     curl -fSsL -o /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
index ac3d23c3747e81..4f5545d2cf5b3c 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
@@ -23,7 +23,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Install bazel
-ARG BAZEL_VERSION=3.7.2
+ARG BAZEL_VERSION=4.2.1
 RUN mkdir /bazel && \
     wget -O /bazel/installer.sh "https://github.com/bazelbuild/bazel/releases/download/${BAZEL_VERSION}/bazel-${BAZEL_VERSION}-installer-linux-x86_64.sh" && \
     wget -O /bazel/LICENSE.txt "https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE" && \
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile
index 969445c6274651..7be985c0e710d2 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild-arm64v8.partial.Dockerfile
@@ -26,7 +26,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Build and install bazel
-ENV BAZEL_VERSION 3.7.2
+ENV BAZEL_VERSION 4.2.1
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
index 0cf475d9deb13c..ed2d8d5ab49c9d 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazelbuild.partial.Dockerfile
@@ -21,7 +21,7 @@ RUN python3 -m pip --no-cache-dir install \
     enum34
 
 # Build and install bazel
-ENV BAZEL_VERSION 3.7.2
+ENV BAZEL_VERSION 4.2.1
 WORKDIR /
 RUN mkdir /bazel && \
     cd /bazel && \
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 36d279c52ce846..af27150da8cf67 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -64,6 +64,8 @@ py_test(
     deps = [
         ":tf_doctest_lib",
         "//tensorflow:tensorflow_py",
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python/distribute:distribute_lib",
         "//third_party/py/numpy",
         "@absl_py//absl/flags",
         "@absl_py//absl/testing:absltest",
@@ -114,6 +116,8 @@ py_test(
     deps = [
         ":tf_doctest_lib",
         "//tensorflow:tensorflow_py",
+        "//tensorflow/python:logging_ops",
+        "//tensorflow/python/distribute:distribute_lib",
         "//third_party/py/numpy",
         "@absl_py//absl/flags",
         "@absl_py//absl/testing:absltest",
@@ -138,6 +142,33 @@ py_test(
     ],
 )
 
+py_library(
+    name = "fenced_doctest_lib",
+    srcs = ["fenced_doctest_lib.py"],
+    deps = [
+        ":tf_doctest_lib",
+        "@astor_archive//:astor",
+    ],
+)
+
+py_test(
+    name = "fenced_doctest_test",
+    srcs = ["fenced_doctest_test.py"],
+    data = glob(["**/create_model.md"]),
+    tags = [
+        "no_oss",
+        "no_pip",
+        "noasan",
+        "nomsan",
+        "notsan",
+    ],
+    deps = [
+        ":fenced_doctest_lib",
+        "@absl_py//absl/testing:absltest",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_library(
     name = "doc_controls",
     srcs = ["doc_controls.py"],
diff --git a/tensorflow/tools/docs/fenced_doctest_lib.py b/tensorflow/tools/docs/fenced_doctest_lib.py
new file mode 100644
index 00000000000000..6e992f3fffa99a
--- /dev/null
+++ b/tensorflow/tools/docs/fenced_doctest_lib.py
@@ -0,0 +1,240 @@
+# Lint as: python3
+# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Run doctests for tensorflow."""
+
+import ast
+import doctest
+import os
+import re
+import textwrap
+from typing import Any, Callable, Dict, Iterable, Optional
+
+import astor
+
+from tensorflow.tools.docs import tf_doctest_lib
+
+
+def load_from_files(
+    files,
+    globs: Optional[Dict[str, Any]] = None,
+    set_up: Optional[Callable[[Any], None]] = None,
+    tear_down: Optional[Callable[[Any], None]] = None) -> doctest.DocFileSuite:
+  """Creates a doctest suite from the the files list.
+
+  Args:
+    files: A list of file paths to test.
+    globs: The global namespace the tests are run in.
+    set_up: Run before each test, recieves the test as argument.
+    tear_down: Run after each test, recieves the test as argument.
+
+  Returns:
+    A DocFileSuite containing the tests.
+  """
+  if globs is None:
+    globs = {}
+
+  # __fspath__ isn't respected everywhere in doctest so convert paths to
+  # strings.
+  files = [os.fspath(f) for f in files]
+
+  globs['_print_if_not_none'] = _print_if_not_none
+  # Ref: https://docs.python.org/3/library/doctest.html#doctest.DocFileSuite
+  return doctest.DocFileSuite(
+      *files,
+      module_relative=False,
+      parser=FencedCellParser(fence_label='python'),
+      globs=globs,
+      setUp=set_up,
+      tearDown=tear_down,
+      checker=FencedCellOutputChecker(),
+      optionflags=(doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE
+                   | doctest.IGNORE_EXCEPTION_DETAIL
+                   | doctest.DONT_ACCEPT_BLANKLINE),
+  )
+
+
+class FencedCellOutputChecker(tf_doctest_lib.TfDoctestOutputChecker):
+  """TfDoctestChecker with a different warning message."""
+  MESSAGE = textwrap.dedent("""\n
+        ##############################################################
+        # Check the documentation (go/g3doctest) on how to write
+        # testable g3docs.
+        ##############################################################
+        """)
+
+
+class FencedCellParser(doctest.DocTestParser):
+  """Implements test parsing for ``` fenced cells.
+
+  https://docs.python.org/3/library/doctest.html#doctestparser-objects
+
+  The `get_examples` method recieves a string and returns an
+  iterable of `doctest.Example` objects.
+  """
+  patched = False
+
+  def __init__(self, fence_label='python'):
+    super().__init__()
+
+    if not self.patched:
+      # The default doctest compiles in "single" mode. The fenced block may
+      # contain multiple statements. The `_patch_compile` function fixes the
+      # compile mode.
+      doctest.compile = _patch_compile
+      print(
+          textwrap.dedent("""
+          *********************************************************************
+          * Caution: `fenced_doctest` patches `doctest.compile` don't use this
+          *   in the same binary as any other doctests.
+          *********************************************************************
+          """))
+      type(self).patched = True
+
+    # Match anything, except if the look-behind sees a closing fence.
+    no_fence = '(.(?<!```))*?'
+    self.fence_cell_re = re.compile(
+        rf"""
+        ^(                             # After a newline
+            \s*```\s*({fence_label})\n   # Open a labeled ``` fence
+            (?P<doctest>{no_fence})      # Match anything except a closing fence
+            \n\s*```\s*(\n|$)            # Close the fence.
+        )
+        (                              # Optional!
+            [\s\n]*                      # Any number of blank lines.
+            ```\s*\n                     # Open ```
+            (?P<output>{no_fence})       # Anything except a closing fence
+            \n\s*```                     # Close the fence.
+        )?
+        """,
+        # Multiline so ^ matches after a newline
+        re.MULTILINE |
+        # Dotall so `.` matches newlines.
+        re.DOTALL |
+        # Verbose to allow comments/ignore-whitespace.
+        re.VERBOSE)
+
+  def get_examples(self,
+                   string: str,
+                   name: str = '<string>') -> Iterable[doctest.Example]:
+    # Check for a file-level skip comment.
+    if re.search('<!--.*?doctest.*?skip.*?all.*?-->', string, re.IGNORECASE):
+      return
+
+    for match in self.fence_cell_re.finditer(string):
+      if re.search('doctest.*skip', match.group(0), re.IGNORECASE):
+        continue
+
+      groups = match.groupdict()
+
+      source = textwrap.dedent(groups['doctest'])
+      want = groups['output']
+      if want is not None:
+        want = textwrap.dedent(want)
+
+      yield doctest.Example(
+          lineno=string[:match.start()].count('\n') + 1,
+          source=source,
+          want=want)
+
+
+def _print_if_not_none(obj):
+  """Print like a notebook: Show the repr if the object is not None.
+
+  `_patch_compile` Uses this on the final expression in each cell.
+
+  This way the outputs feel like notebooks.
+
+  Args:
+    obj: the object to print.
+  """
+  if obj is not None:
+    print(repr(obj))
+
+
+def _patch_compile(source,
+                   filename,
+                   mode,
+                   flags=0,
+                   dont_inherit=False,
+                   optimize=-1):
+  """Patch `doctest.compile` to make doctest to behave like a notebook.
+
+  Default settings for doctest are configured to run like a repl: one statement
+  at a time. The doctest source uses `compile(..., mode="single")`
+
+  So to let doctest act like a notebook:
+
+  1. We need `mode="exec"` (easy)
+  2. We need the last expression to be printed (harder).
+
+  To print the last expression, just wrap the last expression in
+  `_print_if_not_none(expr)`. To detect the last expression use `AST`.
+  if the last node is an expression modify the ast to to call
+  `_print_if_not_none` on it, convert the ast back to source and compile that.
+
+  https://docs.python.org/3/library/functions.html#compile
+
+  Args:
+    source: Can either be a normal string, a byte string, or an AST object.
+    filename: Argument should give the file from which the code was read; pass
+      some recognizable value if it wasn’t read from a file ('<string>' is
+      commonly used).
+    mode: [Ignored] always use exec.
+    flags: Compiler options.
+    dont_inherit: Compiler options.
+    optimize: Compiler options.
+
+  Returns:
+    The resulting code object.
+  """
+  # doctest passes some dummy string as the file name, AFAICT
+  # but tf.function freaks-out if this doesn't look like a
+  # python file name.
+  del filename
+  # Doctest always passes "single" here, you need exec for multiple lines.
+  del mode
+
+  source_ast = ast.parse(source)
+
+  final = source_ast.body[-1]
+  if isinstance(final, ast.Expr):
+    # Wrap the final expression as `_print_if_not_none(expr)`
+    print_it = ast.Expr(
+        lineno=-1,
+        col_offset=-1,
+        value=ast.Call(
+            func=ast.Name(
+                id='_print_if_not_none',
+                ctx=ast.Load(),
+                lineno=-1,
+                col_offset=-1),
+            lineno=-1,
+            col_offset=-1,
+            args=[final],  # wrap the final Expression
+            keywords=[]))
+    source_ast.body[-1] = print_it
+
+    # It's not clear why this step is necessary. `compile` is supposed to handle
+    # AST directly.
+    source = astor.to_source(source_ast)
+
+  return compile(
+      source,
+      filename='dummy.py',
+      mode='exec',
+      flags=flags,
+      dont_inherit=dont_inherit,
+      optimize=optimize)
diff --git a/tensorflow/tools/docs/fenced_doctest_test.py b/tensorflow/tools/docs/fenced_doctest_test.py
new file mode 100644
index 00000000000000..4878b3c031a836
--- /dev/null
+++ b/tensorflow/tools/docs/fenced_doctest_test.py
@@ -0,0 +1,241 @@
+# Lint as: python3
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for fenced_doctest."""
+from typing import List, Optional, Tuple
+
+from absl.testing import absltest
+from absl.testing import parameterized
+
+from tensorflow.tools.docs import fenced_doctest_lib
+
+EXAMPLES = [
+    # pyformat: disable
+    ('simple', [('code', None)], """
+     Hello
+
+     ``` python
+     code
+     ```
+
+     Goodbye
+     """),
+    ('output', [('code', 'result')], """
+     Hello
+
+     ``` python
+     code
+     ```
+
+     ```
+     result
+     ```
+
+     Goodbye
+     """),
+    ('not-output', [('code', None)], """
+     Hello
+
+     ``` python
+     code
+     ```
+
+     ``` bash
+     result
+     ```
+
+     Goodbye
+     """),
+    ('first', [('code', None)], """
+     ``` python
+     code
+     ```
+
+     Goodbye
+     """[1:]),
+    ('last', [('code', None)], """
+     Hello
+
+     ``` python
+     code
+     ```"""),
+    ('last_output', [('code', 'result')], """
+     Hello
+
+     ``` python
+     code
+     ```
+
+     ```
+     result
+     ```"""),
+    ('skip-unlabeled', [], """
+     Hello
+
+     ```
+     skip
+     ```
+
+     Goodbye
+     """),
+    ('skip-wrong-label', [], """
+     Hello
+
+     ``` sdkfjgsd
+     skip
+     ```
+
+     Goodbye
+     """),
+    ('doctest_skip', [], """
+     Hello
+
+     ``` python
+     doctest: +SKIP
+     ```
+
+     Goodbye
+     """),
+    ('skip_all', [], """
+     <!-- doctest: skip-all -->
+
+     Hello
+
+     ``` python
+     a
+     ```
+
+     ``` python
+     b
+     ```
+
+     Goodbye
+     """),
+    ('two', [('a', None), ('b', None)], """
+     Hello
+
+     ``` python
+     a
+     ```
+
+     ``` python
+     b
+     ```
+
+     Goodbye
+     """),
+    ('two-outputs', [('a', 'A'), ('b', 'B')], """
+     Hello
+
+     ``` python
+     a
+     ```
+
+     ```
+     A
+     ```
+
+     ``` python
+     b
+     ```
+
+     ```
+     B
+     ```
+
+     Goodbye
+     """),
+    ('list', [('a', None), ('b', 'B'), ('c', 'C'), ('d', None)], """
+     Hello
+
+     ``` python
+     a
+     ```
+
+     ``` python
+     b
+     ```
+
+     ```
+     B
+     ```
+
+     List:
+     * first
+
+       ``` python
+       c
+       ```
+
+       ```
+       C
+       ```
+
+       ``` python
+       d
+       ```
+     * second
+
+
+     Goodbye
+     """),
+    ('multiline', [('a\nb', 'A\nB')], """
+     Hello
+
+     ``` python
+     a
+     b
+     ```
+
+     ```
+     A
+     B
+     ```
+
+     Goodbye
+     """)
+]
+
+ExampleTuples = List[Tuple[str, Optional[str]]]
+
+
+class G3DoctestTest(parameterized.TestCase):
+
+  def _do_test(self, expected_example_tuples, string):
+    parser = fenced_doctest_lib.FencedCellParser(fence_label='python')
+
+    example_tuples = []
+    for example in parser.get_examples(string, name=self._testMethodName):
+      source = example.source.rstrip('\n')
+      want = example.want
+      if want is not None:
+        want = want.rstrip('\n')
+      example_tuples.append((source, want))
+
+    self.assertEqual(expected_example_tuples, example_tuples)
+
+  @parameterized.named_parameters(*EXAMPLES)
+  def test_parser(self, expected_example_tuples: ExampleTuples, string: str):
+    self._do_test(expected_example_tuples, string)
+
+  @parameterized.named_parameters(*EXAMPLES)
+  def test_parser_no_blanks(self, expected_example_tuples: ExampleTuples,
+                            string: str):
+    string = string.replace('\n\n', '\n')
+    self._do_test(expected_example_tuples, string)
+
+
+if __name__ == '__main__':
+  absltest.main()
diff --git a/tensorflow/tools/docs/tf_doctest_lib.py b/tensorflow/tools/docs/tf_doctest_lib.py
index 1929b982305201..023c3c9198f029 100644
--- a/tensorflow/tools/docs/tf_doctest_lib.py
+++ b/tensorflow/tools/docs/tf_doctest_lib.py
@@ -98,10 +98,7 @@ def __call__(self, string):
 
 
 class TfDoctestOutputChecker(doctest.OutputChecker, object):
-  """Changes the `want` and `got` strings.
-
-  This allows it to be customized before they are compared.
-  """
+  """Customizes how `want` and `got` are compared, see `check_output`."""
 
   def __init__(self, *args, **kwargs):
     super(TfDoctestOutputChecker, self).__init__(*args, **kwargs)
@@ -121,6 +118,12 @@ def _tf_tensor_numpy_output(self, string):
     modified_string = self._NUMPY_OUTPUT_RE.sub(r'\1', string)
     return modified_string, modified_string != string
 
+  MESSAGE = textwrap.dedent("""\n
+        #############################################################
+        Check the documentation (https://www.tensorflow.org/community/contribute/docs_ref) on how to
+        write testable docstrings.
+        #############################################################""")
+
   def check_output(self, want, got, optionflags):
     """Compares the docstring output to the output gotten by running the code.
 
@@ -152,9 +155,12 @@ def check_output(self, want, got, optionflags):
     # If the docstring's output is empty and there is some output generated
     # after running the snippet, return True. This is because if the user
     # doesn't want to display output, respect that over what the doctest wants.
-    if not want and got:
+    if got and not want:
       return True
 
+    if want is None:
+      want = ''
+
     # Replace python's addresses with ellipsis (`...`) since it can change on
     # each execution.
     want = self._ADDRESS_RE.sub('at ...>', want)
@@ -201,13 +207,7 @@ def output_difference(self, example, got, optionflags):
         got.append("\n\nCAUTION: tf_doctest doesn't work if *some* of the "
                    "*float output* is hidden with a \"...\".")
 
-    message = textwrap.dedent("""\n
-        #############################################################
-        Check the documentation
-        (https://www.tensorflow.org/community/contribute/docs_ref) on how to write testable docstrings.
-        #############################################################""")
-
-    got.append(message)
+    got.append(self.MESSAGE)
     got = '\n'.join(got)
     return (super(TfDoctestOutputChecker,
                   self).output_difference(example, got, optionflags))
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 6c0b2d14ff18b5..9b81a1f887d5ec 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -139,7 +139,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/python/kernel_tests/random:util",
     "//tensorflow/python/kernel_tests/signal:test_util",
     "//tensorflow/python/kernel_tests/sparse_ops:sparse_xent_op_test_base",
-    "//tensorflow/python/profiler:traceme",
+    "//tensorflow/python/profiler:trace",
     "//tensorflow/python/saved_model:saved_model",
     "//tensorflow/python/tools:tools_pip",
     "//tensorflow/python/tools/api/generator:create_python_api",
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index b22e09d72a36f8..f7cd51cad30194 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -44,7 +44,6 @@ load("//third_party/sobol_data:workspace.bzl", sobol_data = "repo")
 load("//third_party/vulkan_headers:workspace.bzl", vulkan_headers = "repo")
 
 # Import external repository rules.
-load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_file")
 load("@bazel_tools//tools/build_defs/repo:java.bzl", "java_import_external")
 load("@io_bazel_rules_closure//closure:defs.bzl", "filegroup_external")
 load("@tf_runtime//:dependencies.bzl", "tfrt_dependencies")
@@ -134,9 +133,9 @@ def _tf_repositories():
     # LINT.IfChange
     tf_http_archive(
         name = "XNNPACK",
-        sha256 = "aa775069fcc7086ce691ab57d6c124187e607117ef1ca33f441354a78dfbfe02",
-        strip_prefix = "XNNPACK-b4cde5aebb7676fc85825dab737a6d0dc60a0e23",
-        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/b4cde5aebb7676fc85825dab737a6d0dc60a0e23.zip"),
+        sha256 = "e3dcbb9278a44532ada93dad3b5fcd90d74bf18353d04dfa8868df8029bff693",
+        strip_prefix = "XNNPACK-0d6a1194ff36f59c79089336d7a3b446c3c6a39d",
+        urls = tf_mirror_urls("https://github.com/google/XNNPACK/archive/0d6a1194ff36f59c79089336d7a3b446c3c6a39d.zip"),
     )
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/xnnpack.cmake)
 
@@ -799,40 +798,38 @@ def _tf_repositories():
     # https://github.com/bazelbuild/rules_apple/releases
     tf_http_archive(
         name = "build_bazel_rules_apple",
-        sha256 = "ee9e6073aeb5a65c100cb9c44b0017c937706a4ae03176e14a7e78620a198079",
-        strip_prefix = "rules_apple-5131f3d46794bf227d296c82f30c2499c9de3c5b",
-        urls = tf_mirror_urls("https://github.com/bazelbuild/rules_apple/archive/5131f3d46794bf227d296c82f30c2499c9de3c5b.tar.gz"),
+        sha256 = "0052d452af7742c8f3a4e0929763388a66403de363775db7e90adecb2ba4944b",
+        urls = tf_mirror_urls("https://github.com/bazelbuild/rules_apple/releases/download/0.31.3/rules_apple.0.31.3.tar.gz"),
     )
 
     # https://github.com/bazelbuild/rules_swift/releases
     tf_http_archive(
         name = "build_bazel_rules_swift",
-        sha256 = "d0833bc6dad817a367936a5f902a0c11318160b5e80a20ece35fb85a5675c886",
-        strip_prefix = "rules_swift-3eeeb53cebda55b349d64c9fc144e18c5f7c0eb8",
-        urls = tf_mirror_urls("https://github.com/bazelbuild/rules_swift/archive/3eeeb53cebda55b349d64c9fc144e18c5f7c0eb8.tar.gz"),
+        sha256 = "8407fa0fd04a7ce1d6bb95e90b216404466f809eda459c23cb57b5fa1ef9d639",
+        urls = tf_mirror_urls("https://github.com/bazelbuild/rules_swift/releases/download/0.21.0/rules_swift.0.21.0.tar.gz"),
     )
 
     # https://github.com/bazelbuild/apple_support/releases
     tf_http_archive(
         name = "build_bazel_apple_support",
-        sha256 = "ad8ae80e93612b8151019367a3d1604d7a51c14480dae1254e10252007e8260c",
-        strip_prefix = "apple_support-501b4afb27745c4813a88ffa28acd901408014e4",
-        urls = tf_mirror_urls("https://github.com/bazelbuild/apple_support/archive/501b4afb27745c4813a88ffa28acd901408014e4.tar.gz"),
+        sha256 = "741366f79d900c11e11d8efd6cc6c66a31bfb2451178b58e0b5edc6f1db17b35",
+        urls = tf_mirror_urls("https://github.com/bazelbuild/apple_support/releases/download/0.10.0/apple_support.0.10.0.tar.gz"),
     )
 
     # https://github.com/apple/swift-protobuf/releases
     tf_http_archive(
         name = "com_github_apple_swift_swift_protobuf",
-        strip_prefix = "swift-protobuf-1.6.0/",
-        sha256 = "4ccf6e5ea558e8287bf6331f9f6e52b3c321fca5f1d181d03680f415c32a6bba",
-        urls = tf_mirror_urls("https://github.com/apple/swift-protobuf/archive/1.6.0.zip"),
+        strip_prefix = "swift-protobuf-1.18.0/",
+        sha256 = "6b96d07bfbfa1334909eeb1430c69a93af71c961695b0a5f3536d087a58d8e41",
+        urls = tf_mirror_urls("https://github.com/apple/swift-protobuf/archive/1.18.0.zip"),
     )
 
     # https://github.com/google/xctestrunner/releases
-    http_file(
+    tf_http_archive(
         name = "xctestrunner",
-        executable = 1,
-        urls = tf_mirror_urls("https://github.com/google/xctestrunner/releases/download/0.2.9/ios_test_runner.par"),
+        strip_prefix = "xctestrunner-0.2.15",
+        sha256 = "b789cf18037c8c28d17365f14925f83b93b1f7dabcabb80333ae4331cf0bcb2f",
+        urls = tf_mirror_urls("https://github.com/google/xctestrunner/archive/refs/tags/0.2.15.tar.gz"),
     )
 
     tf_http_archive(
diff --git a/tensorflow/workspace3.bzl b/tensorflow/workspace3.bzl
index 640e17332e1d67..09b2d030da4bf1 100644
--- a/tensorflow/workspace3.bzl
+++ b/tensorflow/workspace3.bzl
@@ -17,11 +17,11 @@ def workspace():
 
     http_archive(
         name = "tf_toolchains",
-        sha256 = "d72b2e52baf0592f5b94347b128ef75422fc22f63dfcf2d5fd46bc732cab052b",
-        strip_prefix = "toolchains-1.3.0",
+        sha256 = "e5bf20273961b3e47967a3a3a7812add55ad4732a4efbbdf045c273541599bd0",
+        strip_prefix = "toolchains-1.3.2",
         urls = [
-            "http://mirror.tensorflow.org/github.com/tensorflow/toolchains/archive/v1.3.0.tar.gz",
-            "https://github.com/tensorflow/toolchains/archive/v1.3.0.tar.gz",
+            "http://mirror.tensorflow.org/github.com/tensorflow/toolchains/archive/v1.3.2.tar.gz",
+            "https://github.com/tensorflow/toolchains/archive/v1.3.2.tar.gz",
         ],
     )
 
diff --git a/third_party/compute_library/LICENSE b/third_party/compute_library/LICENSE
deleted file mode 100644
index 1bb90563e9478e..00000000000000
--- a/third_party/compute_library/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2017-2021 Arm Limited
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/third_party/eigen3/workspace.bzl b/third_party/eigen3/workspace.bzl
index f004e6bd998cb1..9782907cf5e818 100644
--- a/third_party/eigen3/workspace.bzl
+++ b/third_party/eigen3/workspace.bzl
@@ -7,8 +7,8 @@ def repo():
 
     # Attention: tools parse and update these lines.
     # LINT.IfChange
-    EIGEN_COMMIT = "cfdb3ce3f018166a2cb0bfa8b18599c914bf447e"
-    EIGEN_SHA256 = "c69a5a803d2ae2e4e15a5f6e76bd15f21b416c0d21d70336cca4f27b08a38cf5"
+    EIGEN_COMMIT = "085c2fc5d53f391afcccce21c45e15f61c827ab1"
+    EIGEN_SHA256 = "cd72f0a56a95d85cb8a0160f4adc7fea72da49fbb7351ebb31c4e67e1a5fc8bd"
     # LINT.ThenChange(//tensorflow/lite/tools/cmake/modules/eigen.cmake)
 
     tf_http_archive(
diff --git a/third_party/hexagon/BUILD b/third_party/hexagon/BUILD
index 719cc9100e7b19..696a9c59b0c001 100644
--- a/third_party/hexagon/BUILD
+++ b/third_party/hexagon/BUILD
@@ -27,7 +27,7 @@ exports_files(glob(["hexagon/**/*.so"]))
 cc_library(
     name = "hexagon_nn_header",
     hdrs = [
-        "hexagon/hexagon_nn.h",
+        "@hexagon_nn//:hexagon/hexagon_nn.h",
     ],
     tags = [
         "manual",
@@ -38,11 +38,100 @@ cc_library(
 cc_library(
     name = "hexagon_nn_ops",
     hdrs = [
-        "hexagon/hexagon_nn_ops.h",
-        "hexagon/ops.def",
+        "@hexagon_nn//:hexagon/hexagon_nn_ops.h",
+        "@hexagon_nn//:hexagon/ops.def",
     ],
     tags = [
         "manual",
         "nobuilder",
     ],
 )
+
+cc_library(
+    name = "remote",
+    hdrs = [
+        "@hexagon_nn//:hexagon/remote.h",
+        "@hexagon_nn//:hexagon/remote64.h",
+    ],
+    tags = [
+        "manual",
+        "nobuilder",
+    ],
+)
+
+cc_library(
+    name = "rpcmem",
+    srcs = [
+        "@hexagon_nn//:hexagon/rpcmem_stub.c",
+    ],
+    hdrs = [
+        "@hexagon_nn//:hexagon/rpcmem.h",
+    ],
+    deps = [
+        ":AEEStdDef",
+    ],
+)
+
+cc_library(
+    name = "hexagon_soc",
+    hdrs = [
+        "@hexagon_nn//:hexagon/hexnn_soc_defines.h",
+    ],
+    tags = [
+        "manual",
+        "nobuilder",
+    ],
+)
+
+cc_library(
+    name = "AEEStdDef",
+    hdrs = [
+        "@hexagon_nn//:hexagon/AEEStdDef.h",
+    ],
+    tags = [
+        "manual",
+        "nobuilder",
+    ],
+)
+
+cc_library(
+    name = "AEEStdErr",
+    hdrs = [
+        "@hexagon_nn//:hexagon/AEEStdErr.h",
+    ],
+)
+
+# The files are included in another .c files, so we add the src files as textual_hdrs
+# to avoid compiling them and have linking errors for multiple symbols.
+cc_library(
+    name = "hexagon_stub",
+    textual_hdrs = [
+        "@hexagon_nn//:hexagon/hexagon_nn_domains_stub.c",
+        "@hexagon_nn//:hexagon/hexagon_nn_stub.c",
+    ],
+)
+
+# This rule uses the smart/graph wrapper interfaces.
+cc_library(
+    name = "hexagon_nn",
+    srcs = [
+        "@hexagon_nn//:hexagon/hexagon_nn.h",
+        "@hexagon_nn//:hexagon/hexnn_dsp_api.h",
+        "@hexagon_nn//:hexagon/hexnn_dsp_api_impl.c",
+        "@hexagon_nn//:hexagon/hexnn_dsp_domains_api.h",
+        "@hexagon_nn//:hexagon/hexnn_dsp_domains_api_impl.c",
+        "@hexagon_nn//:hexagon/hexnn_dsp_smart_wrapper_api.c",
+        "@hexagon_nn//:hexagon/hexnn_dsp_smart_wrapper_api.h",
+        "@hexagon_nn//:hexagon/hexnn_graph_wrapper.cpp",
+        "@hexagon_nn//:hexagon/hexnn_graph_wrapper.hpp",
+        "@hexagon_nn//:hexagon/hexnn_graph_wrapper_interface.h",
+        "@hexagon_nn//:hexagon/hexnn_soc_defines.h",
+    ],
+    deps = [
+        ":AEEStdErr",
+        ":hexagon_stub",
+        ":remote",
+        ":rpcmem",
+    ],
+    alwayslink = 1,
+)
diff --git a/third_party/hexagon/LICENSE b/third_party/hexagon/LICENSE
deleted file mode 100644
index 624e16e8fbf1c4..00000000000000
--- a/third_party/hexagon/LICENSE
+++ /dev/null
@@ -1,35 +0,0 @@
-
-/*
- * Copyright (c) 2016-2019, The Linux Foundation. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted (subject to the limitations in the
- * disclaimer below) provided that the following conditions are met:
- *
- *    * Redistributions of source code must retain the above copyright
- *      notice, this list of conditions and the following disclaimer.
- *
- *    * Redistributions in binary form must reproduce the above
- *      copyright notice, this list of conditions and the following
- *      disclaimer in the documentation and/or other materials provided
- *      with the distribution.
- *
- *    * Neither the name of The Linux Foundation nor the names of its
- *      contributors may be used to endorse or promote products derived
- *      from this software without specific prior written permission.
- *
- * NO EXPRESS OR IMPLIED LICENSES TO ANY PARTY'S PATENT RIGHTS ARE
- * GRANTED BY THIS LICENSE. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
- * HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
- * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
- * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
- * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
- * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
- * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- *
- */
diff --git a/third_party/hexagon/workspace.bzl b/third_party/hexagon/workspace.bzl
index 016b2831ce36de..614c13960f7a92 100644
--- a/third_party/hexagon/workspace.bzl
+++ b/third_party/hexagon/workspace.bzl
@@ -7,11 +7,11 @@ load("//third_party:repo.bzl", "tf_http_archive")
 def repo():
     tf_http_archive(
         name = "hexagon_nn",
-        sha256 = "6eaf6d8eabfcb3486753c68f22fd6c1eabf8fae28bf52e4ebea815e9daf67257",
+        sha256 = "f577b4c150b72e11e9dfb3f9d14f9772ba8fe460f7d65c84a7327ea9bef44d8e",
         urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/storage.cloud.google.com/download.tensorflow.org/tflite/hexagon_nn_headers_v1.20.0.3.tgz",
+            "https://storage.googleapis.com/mirror.tensorflow.org/storage.cloud.google.com/download.tensorflow.org/tflite/hexagon_nn_headers_v1.20.0.9.tgz",
             # Repeated to bypass 'at least two urls' check. TODO(karimnosseir): add original source of this package.
-            "https://storage.googleapis.com/mirror.tensorflow.org/storage.cloud.google.com/download.tensorflow.org/tflite/hexagon_nn_headers_v1.20.0.3.tgz",
+            "https://storage.googleapis.com/mirror.tensorflow.org/storage.cloud.google.com/download.tensorflow.org/tflite/hexagon_nn_headers_v1.20.0.9.tgz",
         ],
         build_file = "//third_party/hexagon:BUILD",
     )
diff --git a/third_party/llvm/setup.bzl b/third_party/llvm/setup.bzl
index b9aaeef03ed4c0..e9b4270f8ea328 100644
--- a/third_party/llvm/setup.bzl
+++ b/third_party/llvm/setup.bzl
@@ -2,6 +2,18 @@
 
 load("@llvm-raw//utils/bazel:configure.bzl", "llvm_configure", "llvm_disable_optional_support_deps")
 
+# The subset of LLVM targets that TensorFlow cares about.
+_LLVM_TARGETS = [
+    "AArch64",
+    "AMDGPU",
+    "ARM",
+    "NVPTX",
+    "PowerPC",
+    "RISCV",
+    "SystemZ",
+    "X86",
+]
+
 def llvm_setup(name):
     # Disable terminfo and zlib that are bundled with LLVM.
     llvm_disable_optional_support_deps()
@@ -10,4 +22,5 @@ def llvm_setup(name):
     llvm_configure(
         name = name,
         repo_mapping = {"@python_runtime": "@local_config_python"},
+        targets = _LLVM_TARGETS,
     )
diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl
index e790fa9c26a0e9..cfd4d4dc939843 100644
--- a/third_party/llvm/workspace.bzl
+++ b/third_party/llvm/workspace.bzl
@@ -4,8 +4,8 @@ load("//third_party:repo.bzl", "tf_http_archive")
 
 def repo(name):
     """Imports LLVM."""
-    LLVM_COMMIT = "b927aa69bf2fd50ecf33e3f5ec853eb3c70312c5"
-    LLVM_SHA256 = "0b7e47bd9a62b7c14d6f4c4b1cf8fdb2fba5996e6b24917deb07d9f636f7ce36"
+    LLVM_COMMIT = "311dd55c9eb9342b1c889f6db7728f15b05378bb"
+    LLVM_SHA256 = "04c17432764a4ccf9ef114e32c376eb197798bffe27b338c2354339bb5b810a7"
 
     tf_http_archive(
         name = name,
diff --git a/third_party/protobuf/protobuf.patch b/third_party/protobuf/protobuf.patch
index c1b57588427fd4..57ba8ddaa5db54 100644
--- a/third_party/protobuf/protobuf.patch
+++ b/third_party/protobuf/protobuf.patch
@@ -1,5 +1,5 @@
 diff --git a/BUILD b/BUILD
-index dbae719..87dc384 100644
+index dbae719ff..4e276c854 100644
 --- a/BUILD
 +++ b/BUILD
 @@ -23,7 +23,7 @@ config_setting(
@@ -11,7 +11,15 @@ index dbae719..87dc384 100644
  
  ################################################################################
  # Protobuf Runtime Library
-@@ -143,6 +143,7 @@ cc_library(
+@@ -100,6 +100,7 @@ LINK_OPTS = select({
+ 
+ load(
+     ":protobuf.bzl",
++    "adapt_proto_library",
+     "cc_proto_library",
+     "py_proto_library",
+     "internal_copied_filegroup",
+@@ -143,6 +144,7 @@ cc_library(
      copts = COPTS,
      includes = ["src/"],
      linkopts = LINK_OPTS,
@@ -19,7 +27,7 @@ index dbae719..87dc384 100644
      visibility = ["//visibility:public"],
  )
  
-@@ -213,6 +214,7 @@ cc_library(
+@@ -213,6 +215,7 @@ cc_library(
      copts = COPTS,
      includes = ["src/"],
      linkopts = LINK_OPTS,
@@ -27,11 +35,51 @@ index dbae719..87dc384 100644
      visibility = ["//visibility:public"],
      deps = [":protobuf_lite"] + PROTOBUF_DEPS,
  )
+@@ -255,13 +258,15 @@ filegroup(
+     visibility = ["//visibility:public"],
+ )
+ 
+-cc_proto_library(
++adapt_proto_library(
++    name = "cc_wkt_protos_genproto",
++    deps = [proto + "_proto" for proto in WELL_KNOWN_PROTO_MAP.keys()],
++    visibility = ["//visibility:public"],
++)
++
++cc_library(
+     name = "cc_wkt_protos",
+-    srcs = WELL_KNOWN_PROTOS,
+-    include = "src",
+-    default_runtime = ":protobuf",
+-    internal_bootstrap_hack = 1,
+-    protoc = ":protoc",
++    deprecation = "Only for backward compatibility. Do not use.",
+     visibility = ["//visibility:public"],
+ )
+ 
+@@ -978,10 +983,10 @@ cc_library(
+ 
+ proto_lang_toolchain(
+     name = "cc_toolchain",
++    blacklisted_protos = [proto + "_proto" for proto in WELL_KNOWN_PROTO_MAP.keys()],
+     command_line = "--cpp_out=$(OUT)",
+     runtime = ":protobuf",
+     visibility = ["//visibility:public"],
+-    blacklisted_protos = [":_internal_wkt_protos_genrule"],
+ )
+ 
+ proto_lang_toolchain(
 diff --git a/protobuf.bzl b/protobuf.bzl
-index e065332..92ae3b4 100644
+index e0653321f..4156a1275 100644
 --- a/protobuf.bzl
 +++ b/protobuf.bzl
-@@ -85,6 +85,8 @@ def _proto_gen_impl(ctx):
+@@ -1,4 +1,5 @@
+ load("@bazel_skylib//lib:versions.bzl", "versions")
++load("@rules_proto//proto:defs.bzl", "ProtoInfo")
+ 
+ def _GetPath(ctx, path):
+     if ctx.label.workspace_root:
+@@ -85,6 +86,8 @@ def _proto_gen_impl(ctx):
      for dep in ctx.attr.deps:
          import_flags += dep.proto.import_flags
          deps += dep.proto.deps
@@ -40,8 +88,88 @@ index e065332..92ae3b4 100644
  
      if not ctx.attr.gen_cc and not ctx.attr.gen_py and not ctx.executable.plugin:
          return struct(
+@@ -222,6 +225,29 @@ Args:
+   outs: a list of labels of the expected outputs from the protocol compiler.
+ """
+ 
++def _adapt_proto_library_impl(ctx):
++    deps = [dep[ProtoInfo] for dep in ctx.attr.deps]
++
++    srcs = [src for dep in deps for src in dep.direct_sources]
++    return struct(
++        proto = struct(
++            srcs = srcs,
++            import_flags = ["-I{}".format(path) for dep in deps for path in dep.transitive_proto_path.to_list()],
++            deps = srcs,
++        ),
++    )
++
++adapt_proto_library = rule(
++    implementation = _adapt_proto_library_impl,
++    attrs = {
++        "deps": attr.label_list(
++            mandatory = True,
++            providers = [ProtoInfo],
++        ),
++    },
++    doc = "Adapts `proto_library` from `@rules_proto` to be used with `{cc,py}_proto_library` from this file.",
++)
++
+ def cc_proto_library(
+         name,
+         srcs = [],
+@@ -229,7 +255,6 @@ def cc_proto_library(
+         cc_libs = [],
+         include = None,
+         protoc = "@com_google_protobuf//:protoc",
+-        internal_bootstrap_hack = False,
+         use_grpc_plugin = False,
+         default_runtime = "@com_google_protobuf//:protobuf",
+         **kargs):
+@@ -247,41 +272,17 @@ def cc_proto_library(
+           cc_library.
+       include: a string indicating the include path of the .proto files.
+       protoc: the label of the protocol compiler to generate the sources.
+-      internal_bootstrap_hack: a flag indicate the cc_proto_library is used only
+-          for bootstraping. When it is set to True, no files will be generated.
+-          The rule will simply be a provider for .proto files, so that other
+-          cc_proto_library can depend on it.
+       use_grpc_plugin: a flag to indicate whether to call the grpc C++ plugin
+           when processing the proto files.
+       default_runtime: the implicitly default runtime which will be depended on by
+           the generated cc_library target.
+       **kargs: other keyword arguments that are passed to cc_library.
+-
+     """
+ 
+     includes = []
+     if include != None:
+         includes = [include]
+ 
+-    if internal_bootstrap_hack:
+-        # For pre-checked-in generated files, we add the internal_bootstrap_hack
+-        # which will skip the codegen action.
+-        proto_gen(
+-            name = name + "_genproto",
+-            srcs = srcs,
+-            deps = [s + "_genproto" for s in deps],
+-            includes = includes,
+-            protoc = protoc,
+-            visibility = ["//visibility:public"],
+-        )
+-
+-        # An empty cc_library to make rule dependency consistent.
+-        native.cc_library(
+-            name = name,
+-            **kargs
+-        )
+-        return
+-
+     grpc_cpp_plugin = None
+     if use_grpc_plugin:
+         grpc_cpp_plugin = "//external:grpc_cpp_plugin"
 diff --git a/python/google/protobuf/pyext/message.cc b/python/google/protobuf/pyext/message.cc
-index 3530a9b..c31fa8f 100644
+index 3530a9b37..c31fa8fcc 100644
 --- a/python/google/protobuf/pyext/message.cc
 +++ b/python/google/protobuf/pyext/message.cc
 @@ -2991,8 +2991,12 @@ bool InitProto2MessageModule(PyObject *m) {
@@ -58,6 +186,3 @@ index 3530a9b..c31fa8f 100644
      if (collections == NULL) {
        return false;
      }
--- 
-2.33.0
-
diff --git a/third_party/tf_runtime/workspace.bzl b/third_party/tf_runtime/workspace.bzl
index 2fdd9d4a0a75b9..246f98e368c968 100644
--- a/third_party/tf_runtime/workspace.bzl
+++ b/third_party/tf_runtime/workspace.bzl
@@ -6,8 +6,8 @@ def repo():
     """Imports TFRT."""
 
     # Attention: tools parse and update these lines.
-    TFRT_COMMIT = "33fe78c623180cafec88bacb6ab43abc2c7f0fb4"
-    TFRT_SHA256 = "9f125298c23d5f3c68a4009b598c37d0f9f33502698d2198c73cc91989b03173"
+    TFRT_COMMIT = "7a6dcd91d1439a974eb3abacdaf4b0714cb60829"
+    TFRT_SHA256 = "0015b3cc84b7a67ec21753bc3c00b14fbfb4de41dc33373e3fa24d0d04d54b07"
 
     tf_http_archive(
         name = "tf_runtime",