Refactor Inference API and rename it to Serverless Inference Endpoints (#7295)

* changes

* changes

* add changeset

* add changeset

* changes

* all pipelines

* format

* clean

* add examples

* fix audio classification

* format

* format

* fix all pipelines

* fixes

* fixes

* fix tabular

* add changeset

* added future

---------

Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
abidlabs and gradio-pr-bot committed Feb 6, 2024
1 parent 733ca26 commit aea14c4
Showing 8 changed files with 371 additions and 397 deletions.
5 changes: 5 additions & 0 deletions .changeset/tired-suns-judge.md
@@ -0,0 +1,5 @@
---
"gradio": minor
---

feat:Refactor Inference API and rename it to Serverless Inference Endpoints
589 changes: 241 additions & 348 deletions gradio/external.py

Large diffs are not rendered by default.

125 changes: 108 additions & 17 deletions gradio/external_utils.py
@@ -1,14 +1,15 @@
"""Utility function for gradio/external.py"""
"""Utility function for gradio/external.py, designed for internal use."""

from __future__ import annotations

import base64
import math
import operator
import re
import warnings
from typing import Dict, List, Tuple

import httpx
import yaml
from huggingface_hub import InferenceClient

from gradio import components

@@ -17,7 +18,7 @@
##################


def get_tabular_examples(model_name: str) -> Dict[str, List[float]]:
def get_tabular_examples(model_name: str) -> dict[str, list[float]]:
readme = httpx.get(f"https://huggingface.co/{model_name}/resolve/main/README.md")
if readme.status_code != 200:
warnings.warn(f"Cannot load examples from README for {model_name}", UserWarning)
@@ -39,7 +40,7 @@ def get_tabular_examples(model_name: str) -> Dict[str, List[float]]:
"See the README.md here: https://huggingface.co/scikit-learn/tabular-playground/blob/main/README.md "
"for a reference on how to provide example data to your model."
)
# replace nan with string NaN for inference API
# replace nan with string NaN for inference Endpoints
for data in example_data.values():
for i, val in enumerate(data):
if isinstance(val, float) and math.isnan(val):
@@ -48,8 +49,8 @@ def get_tabular_examples(model_name: str) -> Dict[str, List[float]]:


def cols_to_rows(
example_data: Dict[str, List[float]],
) -> Tuple[List[str], List[List[float]]]:
example_data: dict[str, list[float]],
) -> tuple[list[str], list[list[float]]]:
headers = list(example_data.keys())
n_rows = max(len(example_data[header] or []) for header in headers)
data = []
@@ -65,7 +66,7 @@ def cols_to_rows(
return headers, data


def rows_to_cols(incoming_data: Dict) -> Dict[str, Dict[str, Dict[str, List[str]]]]:
def rows_to_cols(incoming_data: dict) -> dict[str, dict[str, dict[str, list[str]]]]:
data_column_wise = {}
for i, header in enumerate(incoming_data["headers"]):
data_column_wise[header] = [str(row[i]) for row in incoming_data["data"]]
@@ -77,14 +78,43 @@ def rows_to_cols(incoming_data: Dict) -> Dict[str, Dict[str, Dict[str, List[str]
##################


def postprocess_label(scores: Dict) -> Dict:
sorted_pred = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
return {
"label": sorted_pred[0][0],
"confidences": [
{"label": pred[0], "confidence": pred[1]} for pred in sorted_pred
],
}
def postprocess_label(scores: list[dict[str, str | float]]) -> dict:
return {c["label"]: c["score"] for c in scores}


def postprocess_mask_tokens(scores: list[dict[str, str | float]]) -> dict:
return {c["token_str"]: c["score"] for c in scores}


def postprocess_question_answering(answer: dict) -> tuple[str, dict]:
return answer["answer"], {answer["answer"]: answer["score"]}


def postprocess_visual_question_answering(scores: list[dict[str, str | float]]) -> dict:
return {c["answer"]: c["score"] for c in scores}


def zero_shot_classification_wrapper(client: InferenceClient):
def zero_shot_classification_inner(input: str, labels: str, multi_label: bool):
return client.zero_shot_classification(
input, labels.split(","), multi_label=multi_label
)

return zero_shot_classification_inner


def sentence_similarity_wrapper(client: InferenceClient):
def sentence_similarity_inner(input: str, sentences: str):
return client.sentence_similarity(input, sentences.split("\n"))

return sentence_similarity_inner


def text_generation_wrapper(client: InferenceClient):
def text_generation_inner(input: str):
return input + client.text_generation(input)

return text_generation_inner
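
A hedged usage sketch of the wrapper above: the model id is illustrative, and treating the returned callable as an `Interface` prediction function is an assumption about how `gradio/external.py` (whose diff is not rendered on this page) consumes it.

```python
from huggingface_hub import InferenceClient

# Illustrative model id; any text-generation model on the Hub would work.
client = InferenceClient(model="gpt2")
generate = text_generation_wrapper(client)

# The inner function echoes the prompt and appends the generated continuation,
# which is the string a downstream gr.Interface would display.
print(generate("Once upon a time"))
```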


def encode_to_base64(r: httpx.Response) -> str:
@@ -113,12 +143,73 @@ def encode_to_base64(r: httpx.Response) -> str:
return new_base64


def format_ner_list(input_string: str, ner_groups: list[dict[str, str | int]]):
if len(ner_groups) == 0:
return [(input_string, None)]

output = []
end = 0
prev_end = 0

for group in ner_groups:
entity, start, end = group["entity_group"], group["start"], group["end"]
output.append((input_string[prev_end:start], None))
output.append((input_string[start:end], entity))
prev_end = end

output.append((input_string[end:], None))
return output
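
A quick hedged illustration of the output format (the entity spans are invented for the example): the function interleaves unlabeled and labeled substrings as `(text, label)` tuples, which is the format `gr.HighlightedText` accepts.

```python
ner_groups = [
    {"entity_group": "PER", "start": 0, "end": 5},
    {"entity_group": "LOC", "start": 15, "end": 20},
]

# Splits the input into (text, label) tuples, using None for unlabeled spans.
print(format_ner_list("Alice moved to Paris", ner_groups))
# [('', None), ('Alice', 'PER'), (' moved to ', None), ('Paris', 'LOC'), ('', None)]
```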


def token_classification_wrapper(client: InferenceClient):
def token_classification_inner(input: str):
ner_list = client.token_classification(input)
return format_ner_list(input, ner_list) # type: ignore

return token_classification_inner


def chatbot_preprocess(text, state):
if not state:
return text, [], []
return (
text,
state["conversation"]["generated_responses"],
state["conversation"]["past_user_inputs"],
)


def chatbot_postprocess(response):
chatbot_history = list(
zip(
response["conversation"]["past_user_inputs"],
response["conversation"]["generated_responses"],
)
)
return chatbot_history, response


def tabular_wrapper(client: InferenceClient, pipeline: str):
# This wrapper is needed to handle an issue in the InferenceClient where the model name is not
# automatically loaded when using the tabular_classification and tabular_regression methods.
# See: https://github.com/huggingface/huggingface_hub/issues/2015
def tabular_inner(data):
assert pipeline in ["tabular_classification", "tabular_regression"]
assert client.model is not None
if pipeline == "tabular_classification":
return client.tabular_classification(data, model=client.model)
else:
return client.tabular_regression(data, model=client.model)

return tabular_inner
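
A hedged sketch of how the workaround above behaves when called directly. The model id and column data are illustrative, and `InferenceClient.tabular_classification` expects a column-oriented dict; how `external.py` shapes the Dataframe payload is not shown in this diff.

```python
from huggingface_hub import InferenceClient

# Illustrative model id; any tabular-classification model on the Hub would work.
client = InferenceClient(model="scikit-learn/tabular-playground")
predict = tabular_wrapper(client, "tabular_classification")

# The wrapper passes model=client.model explicitly, working around the
# huggingface_hub issue linked in the comment above.
print(predict({"age": ["39", "52"], "fare": ["7.25", "71.28"]}))
```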


##################
# Helper function for cleaning up an Interface loaded from HF Spaces
##################


def streamline_spaces_interface(config: Dict) -> Dict:
def streamline_spaces_interface(config: dict) -> dict:
"""Streamlines the interface config dictionary to remove unnecessary keys."""
config["inputs"] = [
components.get_component_instance(component)
18 changes: 0 additions & 18 deletions gradio/utils.py
@@ -386,24 +386,6 @@ def same_children_recursive(children1, chidren2):
return True


def format_ner_list(input_string: str, ner_groups: list[dict[str, str | int]]):
if len(ner_groups) == 0:
return [(input_string, None)]

output = []
end = 0
prev_end = 0

for group in ner_groups:
entity, start, end = group["entity_group"], group["start"], group["end"]
output.append((input_string[prev_end:start], None))
output.append((input_string[start:end], entity))
prev_end = end

output.append((input_string[end:], None))
return output


def delete_none(_dict: dict, skip_value: bool = False) -> dict:
"""
Delete keys whose values are None from a dictionary
@@ -12,9 +12,9 @@ The Hugging Face Hub is a central platform that has hundreds of thousands of [mo
Gradio has multiple features that make it extremely easy to leverage existing models and Spaces on the Hub. This guide walks through these features.


## Demos with the Hugging Face Inference API
## Demos with the Hugging Face Inference Endpoints

Hugging Face has a free service called the [Inference API](https://huggingface.co/inference-api), which allows you to send HTTP requests to models in the Hub. For transformers or diffusers-based models, the API can be 2 to 10 times faster than running the inference yourself. The API is free (rate limited), and you can switch to dedicated [Inference Endpoints](https://huggingface.co/pricing) when you want to use it in production. Gradio integrates directly with the Hugging Face Inference API so that you can create a demo simply by specifying a model's name (e.g. `Helsinki-NLP/opus-mt-en-es`), like this:
Hugging Face has a service called [Serverless Inference Endpoints](https://huggingface.co/docs/api-inference/index), which allows you to send HTTP requests to models on the Hub. The API includes a generous free tier, and you can switch to [dedicated Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated) when you want to use it in production. Gradio integrates directly with Serverless Inference Endpoints so that you can create a demo simply by specifying a model's name (e.g. `Helsinki-NLP/opus-mt-en-es`), like this:

```python
import gradio as gr
@@ -24,11 +24,11 @@ demo = gr.load("Helsinki-NLP/opus-mt-en-es", src="models")
demo.launch()
```

For any Hugging Face model supported in the Inference API, Gradio automatically infers the expected input and output and make the underlying server calls, so you don't have to worry about defining the prediction function.
For any Hugging Face model supported in Inference Endpoints, Gradio automatically infers the expected input and output and makes the underlying server calls, so you don't have to worry about defining the prediction function.

Notice that we just specify the model name and state that the `src` should be `models` (Hugging Face's Model Hub). There is no need to install any dependencies (except `gradio`) since you are not loading the model on your computer.
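
If the model is gated or you want authenticated (higher rate limit) requests, `gr.load()` also accepts a Hugging Face token. A minimal sketch; the token value is a placeholder, and `hf_token` assumes you have generated an access token on huggingface.co:

```python
import gradio as gr

demo = gr.load(
    "Helsinki-NLP/opus-mt-en-es",
    src="models",
    hf_token="hf_xxx",  # placeholder; substitute your own access token
)
demo.launch()
```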

You might notice that the first inference takes about 20 seconds. This happens since the Inference API is loading the model in the server. You get some benefits afterward:
You might notice that the first inference takes a little bit longer. This happens because Inference Endpoints is loading the model on the server. You get some benefits afterward:

- The inference will be much faster.
- The server caches your requests.
@@ -78,7 +78,7 @@ with gr.Blocks() as demo:
demo.launch()
```

Notice that we use `gr.load()`, the same method we used to load models using the Inference API. However, here we specify that the `src` is `spaces` (Hugging Face Spaces).
Notice that we use `gr.load()`, the same method we used to load models using Inference Endpoints. However, here we specify that the `src` is `spaces` (Hugging Face Spaces).

Note: loading a Space in this way may result in slight differences from the original Space. In particular, any attributes that apply to the entire Blocks, such as the theme or custom CSS/JS, will not be loaded. You can copy these properties from the Space you are loading into your own `Blocks` object.
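
For reference, the simplest form of loading a Space looks like the sketch below (the Space name is illustrative; substitute any public Space):

```python
import gradio as gr

# Loads the remote Space's UI and prediction function into a local demo.
demo = gr.load("gradio/en2fr", src="spaces")
demo.launch()
```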

@@ -126,7 +126,7 @@ The previous code produces the following interface, which you can try right here

That's it! Let's recap the various ways Gradio and Hugging Face work together:

1. You can build a demo around the Inference API without having to load the model easily using `gr.load()`.
1. You can build a demo around Inference Endpoints without having to load the model, by using `gr.load()`.
2. You host your Gradio demo on Hugging Face Spaces, either using the GUI or entirely in Python.
3. You can load demos from Hugging Face Spaces to remix and create new Gradio demos using `gr.load()`.
4. You can convert a `transformers` pipeline into a Gradio demo using `from_pipeline()`.
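
A minimal sketch of point 4 (the model id is illustrative; any pipeline task that Gradio knows how to map to components should work):

```python
import gradio as gr
from transformers import pipeline

# Illustrative sentiment-analysis pipeline; from_pipeline infers the
# matching input and output components automatically.
pipe = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
demo = gr.Interface.from_pipeline(pipe)
demo.launch()
```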
@@ -52,11 +52,11 @@ demo.launch()

<gradio-app space="Helsinki-NLP/opus-mt-en-es"></gradio-app>

## Using Hugging Face Inference API
## Using Hugging Face Inference Endpoints

Hugging Face offers a free service called the [Inference API](https://huggingface.co/inference-api), which lets you send HTTP requests to models on the Hub. For transformers- or diffusers-based models, the API can be 2 to 10 times faster than running the inference yourself. The API is free (rate limited), and you can switch to dedicated [Inference Endpoints](https://huggingface.co/pricing) when you want to use it in production.
Hugging Face offers a free service called [Serverless Inference Endpoints](https://huggingface.co/inference-api), which lets you send HTTP requests to models on the Hub. For transformers- or diffusers-based models, the API can be 2 to 10 times faster than running the inference yourself. The API is free (rate limited), and you can switch to dedicated [Inference Endpoints](https://huggingface.co/pricing) when you want to use it in production.

Let's try the same demo using the Inference API instead of loading the model ourselves. For Hugging Face models supported by the Inference API, Gradio automatically infers the expected inputs and outputs and makes the underlying server calls, so you don't have to worry about defining a prediction function. Here's the code example!
Let's try the same demo using the Inference API instead of loading the model ourselves. For Hugging Face models supported by Inference Endpoints, Gradio automatically infers the expected inputs and outputs and makes the underlying server calls, so you don't have to worry about defining a prediction function. Here's the code example!

```python
import gradio as gr
11 changes: 7 additions & 4 deletions test/test_external.py
@@ -13,7 +13,8 @@
import gradio as gr
from gradio.context import Context
from gradio.exceptions import GradioVersionIncompatibleError, InvalidApiNameError
from gradio.external import TooManyRequestsError, cols_to_rows, get_tabular_examples
from gradio.external import TooManyRequestsError
from gradio.external_utils import cols_to_rows, get_tabular_examples

"""
WARNING: These tests have an external dependency: namely that Hugging Face's
@@ -204,7 +205,7 @@ def test_sentiment_model(self):
def test_image_classification_model(self):
io = gr.load(name="models/google/vit-base-patch16-224")
try:
assert io("gradio/test_data/lion.jpg")["label"] == "lion"
assert io("gradio/test_data/lion.jpg")["label"].startswith("lion")
except TooManyRequestsError:
pass

@@ -291,7 +292,9 @@ def test_text_to_image_model(self):
io = gr.load("models/osanseviero/BigGAN-deep-128")
try:
filename = io("chest")
assert filename.endswith(".jpg") or filename.endswith(".jpeg")
assert filename.lower().endswith(".jpg") or filename.lower().endswith(
".jpeg"
)
except TooManyRequestsError:
pass

@@ -491,7 +494,7 @@ def test_load_blocks_with_default_values():
)
def test_can_load_tabular_model_with_different_widget_data(hypothetical_readme):
with patch(
"gradio.external.get_tabular_examples", return_value=hypothetical_readme
"gradio.external_utils.get_tabular_examples", return_value=hypothetical_readme
):
io = gr.load("models/scikit-learn/tabular-playground")
check_dataframe(io.config)
2 changes: 1 addition & 1 deletion test/test_utils.py
@@ -12,14 +12,14 @@
from typing_extensions import Literal

from gradio import EventData, Request
from gradio.external_utils import format_ner_list
from gradio.utils import (
abspath,
append_unique_suffix,
assert_configs_are_equivalent_besides_ids,
check_function_inputs_match,
colab_check,
delete_none,
format_ner_list,
get_continuous_fn,
get_extension_from_file_path_or_url,
get_type_hints,
