Refactor Inference API and rename it to Serverless Inference Endpoints (#7295)

* changes

* changes

* add changeset

* add changeset

* changes

* all pipelines

* format

* clean

* add examples

* fix audio classification

* format

* format

* fix all pipelines

* fixes

* fixes

* fix tabular

* add changeset

* added future

---------

Co-authored-by: gradio-pr-bot <gradio-pr-bot@users.noreply.github.com>
abidlabs and gradio-pr-bot committed Feb 6, 2024
1 parent 733ca26 commit aea14c4
Showing 8 changed files with 371 additions and 397 deletions.
5 changes: 5 additions & 0 deletions .changeset/tired-suns-judge.md
@@ -0,0 +1,5 @@
---
"gradio": minor
---

feat:Refactor Inference API and rename it to Serverless Inference Endpoints
589 changes: 241 additions & 348 deletions gradio/external.py

Large diffs are not rendered by default.

125 changes: 108 additions & 17 deletions gradio/external_utils.py
@@ -1,14 +1,15 @@
"""Utility function for gradio/external.py"""
"""Utility function for gradio/external.py, designed for internal use."""

from __future__ import annotations

import base64
import math
import operator
import re
import warnings
from typing import Dict, List, Tuple

import httpx
import yaml
from huggingface_hub import InferenceClient

from gradio import components

@@ -17,7 +18,7 @@
##################


def get_tabular_examples(model_name: str) -> Dict[str, List[float]]:
def get_tabular_examples(model_name: str) -> dict[str, list[float]]:
readme = httpx.get(f"https://huggingface.co/{model_name}/resolve/main/README.md")
if readme.status_code != 200:
warnings.warn(f"Cannot load examples from README for {model_name}", UserWarning)
@@ -39,7 +40,7 @@ def get_tabular_examples(model_name: str) -> Dict[str, List[float]]:
"See the README.md here: https://huggingface.co/scikit-learn/tabular-playground/blob/main/README.md "
"for a reference on how to provide example data to your model."
)
# replace nan with string NaN for inference API
# replace nan with string NaN for inference Endpoints
for data in example_data.values():
for i, val in enumerate(data):
if isinstance(val, float) and math.isnan(val):
@@ -48,8 +49,8 @@ def get_tabular_examples(model_name: str) -> Dict[str, List[float]]:


def cols_to_rows(
example_data: Dict[str, List[float]],
) -> Tuple[List[str], List[List[float]]]:
example_data: dict[str, list[float]],
) -> tuple[list[str], list[list[float]]]:
headers = list(example_data.keys())
n_rows = max(len(example_data[header] or []) for header in headers)
data = []
@@ -65,7 +66,7 @@ def cols_to_rows(
return headers, data


def rows_to_cols(incoming_data: Dict) -> Dict[str, Dict[str, Dict[str, List[str]]]]:
def rows_to_cols(incoming_data: dict) -> dict[str, dict[str, dict[str, list[str]]]]:
data_column_wise = {}
for i, header in enumerate(incoming_data["headers"]):
data_column_wise[header] = [str(row[i]) for row in incoming_data["data"]]
@@ -77,14 +78,43 @@ def rows_to_cols(incoming_data: Dict) -> Dict[str, Dict[str, Dict[str, List[str]
##################


def postprocess_label(scores: Dict) -> Dict:
sorted_pred = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
return {
"label": sorted_pred[0][0],
"confidences": [
{"label": pred[0], "confidence": pred[1]} for pred in sorted_pred
],
}
def postprocess_label(scores: list[dict[str, str | float]]) -> dict:
return {c["label"]: c["score"] for c in scores}


def postprocess_mask_tokens(scores: list[dict[str, str | float]]) -> dict:
return {c["token_str"]: c["score"] for c in scores}


def postprocess_question_answering(answer: dict) -> tuple[str, dict]:
return answer["answer"], {answer["answer"]: answer["score"]}


def postprocess_visual_question_answering(scores: list[dict[str, str | float]]) -> dict:
return {c["answer"]: c["score"] for c in scores}


def zero_shot_classification_wrapper(client: InferenceClient):
def zero_shot_classification_inner(input: str, labels: str, multi_label: bool):
return client.zero_shot_classification(
input, labels.split(","), multi_label=multi_label
)

return zero_shot_classification_inner


def sentence_similarity_wrapper(client: InferenceClient):
def sentence_similarity_inner(input: str, sentences: str):
return client.sentence_similarity(input, sentences.split("\n"))

return sentence_similarity_inner


def text_generation_wrapper(client: InferenceClient):
def text_generation_inner(input: str):
return input + client.text_generation(input)

return text_generation_inner
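
A hedged usage sketch of the wrapper above: the model id is illustrative, and treating the returned callable as an `Interface` prediction function is an assumption about how `gradio/external.py` (whose diff is not rendered on this page) consumes it.

```python
from huggingface_hub import InferenceClient

# Illustrative model id; any text-generation model on the Hub would work.
client = InferenceClient(model="gpt2")
generate = text_generation_wrapper(client)

# The inner function echoes the prompt and appends the generated continuation,
# which is the string a downstream gr.Interface would display.
print(generate("Once upon a time"))
```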


def encode_to_base64(r: httpx.Response) -> str:
@@ -113,12 +143,73 @@ def encode_to_base64(r: httpx.Response) -> str:
return new_base64


def format_ner_list(input_string: str, ner_groups: list[dict[str, str | int]]):
if len(ner_groups) == 0:
return [(input_string, None)]

output = []
end = 0
prev_end = 0

for group in ner_groups:
entity, start, end = group["entity_group"], group["start"], group["end"]
output.append((input_string[prev_end:start], None))
output.append((input_string[start:end], entity))
prev_end = end

output.append((input_string[end:], None))
return output
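
A quick hedged illustration of the output format (the entity spans are invented for the example): the function interleaves unlabeled and labeled substrings as `(text, label)` tuples, which is the format `gr.HighlightedText` accepts.

```python
ner_groups = [
    {"entity_group": "PER", "start": 0, "end": 5},
    {"entity_group": "LOC", "start": 15, "end": 20},
]

# Splits the input into (text, label) tuples, using None for unlabeled spans.
print(format_ner_list("Alice moved to Paris", ner_groups))
# [('', None), ('Alice', 'PER'), (' moved to ', None), ('Paris', 'LOC'), ('', None)]
```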


def token_classification_wrapper(client: InferenceClient):
def token_classification_inner(input: str):
ner_list = client.token_classification(input)
return format_ner_list(input, ner_list) # type: ignore

return token_classification_inner


def chatbot_preprocess(text, state):
if not state:
return text, [], []
return (
text,
state["conversation"]["generated_responses"],
state["conversation"]["past_user_inputs"],
)


def chatbot_postprocess(response):
chatbot_history = list(
zip(
response["conversation"]["past_user_inputs"],
response["conversation"]["generated_responses"],
)
)
return chatbot_history, response


def tabular_wrapper(client: InferenceClient, pipeline: str):
# This wrapper is needed to handle an issue in the InferenceClient where the model name is not
# automatically loaded when using the tabular_classification and tabular_regression methods.
# See: https://github.com/huggingface/huggingface_hub/issues/2015
def tabular_inner(data):
assert pipeline in ["tabular_classification", "tabular_regression"]
assert client.model is not None
if pipeline == "tabular_classification":
return client.tabular_classification(data, model=client.model)
else:
return client.tabular_regression(data, model=client.model)

return tabular_inner
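
A hedged sketch of how the workaround above behaves when called directly. The model id and column data are illustrative, and `InferenceClient.tabular_classification` expects a column-oriented dict; how `external.py` shapes the Dataframe payload is not shown in this diff.

```python
from huggingface_hub import InferenceClient

# Illustrative model id; any tabular-classification model on the Hub would work.
client = InferenceClient(model="scikit-learn/tabular-playground")
predict = tabular_wrapper(client, "tabular_classification")

# The wrapper passes model=client.model explicitly, working around the
# huggingface_hub issue linked in the comment above.
print(predict({"age": ["39", "52"], "fare": ["7.25", "71.28"]}))
```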


##################
# Helper function for cleaning up an Interface loaded from HF Spaces
##################


def streamline_spaces_interface(config: Dict) -> Dict:
def streamline_spaces_interface(config: dict) -> dict:
"""Streamlines the interface config dictionary to remove unnecessary keys."""
config["inputs"] = [
components.get_component_instance(component)
18 changes: 0 additions & 18 deletions gradio/utils.py
@@ -386,24 +386,6 @@ def same_children_recursive(children1, chidren2):
return True


def format_ner_list(input_string: str, ner_groups: list[dict[str, str | int]]):
if len(ner_groups) == 0:
return [(input_string, None)]

output = []
end = 0
prev_end = 0

for group in ner_groups:
entity, start, end = group["entity_group"], group["start"], group["end"]
output.append((input_string[prev_end:start], None))
output.append((input_string[start:end], entity))
prev_end = end

output.append((input_string[end:], None))
return output


def delete_none(_dict: dict, skip_value: bool = False) -> dict:
"""
Delete keys whose values are None from a dictionary
@@ -12,9 +12,9 @@ The Hugging Face Hub is a central platform that has hundreds of thousands of [mo
Gradio has multiple features that make it extremely easy to leverage existing models and Spaces on the Hub. This guide walks through these features.


## Demos with the Hugging Face Inference API
## Demos with the Hugging Face Inference Endpoints

Hugging Face has a free service called the [Inference API](https://huggingface.co/inference-api), which allows you to send HTTP requests to models in the Hub. For transformers or diffusers-based models, the API can be 2 to 10 times faster than running the inference yourself. The API is free (rate limited), and you can switch to dedicated [Inference Endpoints](https://huggingface.co/pricing) when you want to use it in production. Gradio integrates directly with the Hugging Face Inference API so that you can create a demo simply by specifying a model's name (e.g. `Helsinki-NLP/opus-mt-en-es`), like this:
Hugging Face has a service called [Serverless Inference Endpoints](https://huggingface.co/docs/api-inference/index), which allows you to send HTTP requests to models on the Hub. The API includes a generous free tier, and you can switch to [dedicated Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated) when you want to use it in production. Gradio integrates directly with Serverless Inference Endpoints so that you can create a demo simply by specifying a model's name (e.g. `Helsinki-NLP/opus-mt-en-es`), like this:

```python
import gradio as gr
@@ -24,11 +24,11 @@ demo = gr.load("Helsinki-NLP/opus-mt-en-es", src="models")
demo.launch()
```

For any Hugging Face model supported in the Inference API, Gradio automatically infers the expected input and output and make the underlying server calls, so you don't have to worry about defining the prediction function.
For any Hugging Face model supported in Inference Endpoints, Gradio automatically infers the expected input and output and makes the underlying server calls, so you don't have to worry about defining the prediction function.

Notice that we just specify the model name and state that the `src` should be `models` (Hugging Face's Model Hub). There is no need to install any dependencies (except `gradio`) since you are not loading the model on your computer.
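
If the model is gated or you want authenticated (higher rate limit) requests, `gr.load()` also accepts a Hugging Face token. A minimal sketch; the token value is a placeholder, and `hf_token` assumes you have generated an access token on huggingface.co:

```python
import gradio as gr

demo = gr.load(
    "Helsinki-NLP/opus-mt-en-es",
    src="models",
    hf_token="hf_xxx",  # placeholder; substitute your own access token
)
demo.launch()
```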

You might notice that the first inference takes about 20 seconds. This happens since the Inference API is loading the model in the server. You get some benefits afterward:
You might notice that the first inference takes a little bit longer. This happens because Inference Endpoints is loading the model on the server. You get some benefits afterward:

- The inference will be much faster.
- The server caches your requests.
@@ -78,7 +78,7 @@ with gr.Blocks() as demo:
demo.launch()
```

Notice that we use `gr.load()`, the same method we used to load models using the Inference API. However, here we specify that the `src` is `spaces` (Hugging Face Spaces).
Notice that we use `gr.load()`, the same method we used to load models using Inference Endpoints. However, here we specify that the `src` is `spaces` (Hugging Face Spaces).

Note: loading a Space in this way may result in slight differences from the original Space. In particular, any attributes that apply to the entire Blocks, such as the theme or custom CSS/JS, will not be loaded. You can copy these properties from the Space you are loading into your own `Blocks` object.
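
For reference, the simplest form of loading a Space looks like the sketch below (the Space name is illustrative; substitute any public Space):

```python
import gradio as gr

# Loads the remote Space's UI and prediction function into a local demo.
demo = gr.load("gradio/en2fr", src="spaces")
demo.launch()
```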

@@ -126,7 +126,7 @@ The previous code produces the following interface, which you can try right here

That's it! Let's recap the various ways Gradio and Hugging Face work together:

1. You can build a demo around the Inference API without having to load the model easily using `gr.load()`.
1. You can build a demo around Inference Endpoints without having to load the model, by using `gr.load()`.
2. You host your Gradio demo on Hugging Face Spaces, either using the GUI or entirely in Python.
3. You can load demos from Hugging Face Spaces to remix and create new Gradio demos using `gr.load()`.
4. You can convert a `transformers` pipeline into a Gradio demo using `from_pipeline()`.
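
A minimal sketch of point 4 (the model id is illustrative; any pipeline task that Gradio knows how to map to components should work):

```python
import gradio as gr
from transformers import pipeline

# Illustrative sentiment-analysis pipeline; from_pipeline infers the
# matching input and output components automatically.
pipe = pipeline(
    "text-classification",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
demo = gr.Interface.from_pipeline(pipe)
demo.launch()
```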
@@ -52,11 +52,11 @@ demo.launch()

<gradio-app space="Helsinki-NLP/opus-mt-en-es"></gradio-app>

## Using Hugging Face Inference API
## Using Hugging Face Inference Endpoints

Hugging Face offers a free service called the [Inference API](https://huggingface.co/inference-api), which lets you send HTTP requests to models on the Hub. For transformers- or diffusers-based models, the API can be 2 to 10 times faster than running the inference yourself. The API is free (rate limited), and you can switch to dedicated [Inference Endpoints](https://huggingface.co/pricing) when you want to use it in production.
Hugging Face offers a free service called [Serverless Inference Endpoints](https://huggingface.co/inference-api), which lets you send HTTP requests to models on the Hub. For transformers- or diffusers-based models, the API can be 2 to 10 times faster than running the inference yourself. The API is free (rate limited), and you can switch to dedicated [Inference Endpoints](https://huggingface.co/pricing) when you want to use it in production.

Let's try the same demo using the Inference API instead of loading the model ourselves. For Hugging Face models supported by the Inference API, Gradio automatically infers the expected inputs and outputs and makes the underlying server calls, so you don't have to worry about defining a prediction function. Here's the code example!
Let's try the same demo using the Inference API instead of loading the model ourselves. For Hugging Face models supported by Inference Endpoints, Gradio automatically infers the expected inputs and outputs and makes the underlying server calls, so you don't have to worry about defining a prediction function. Here's the code example!

```python
import gradio as gr
11 changes: 7 additions & 4 deletions test/test_external.py
@@ -13,7 +13,8 @@
import gradio as gr
from gradio.context import Context
from gradio.exceptions import GradioVersionIncompatibleError, InvalidApiNameError
from gradio.external import TooManyRequestsError, cols_to_rows, get_tabular_examples
from gradio.external import TooManyRequestsError
from gradio.external_utils import cols_to_rows, get_tabular_examples

"""
WARNING: These tests have an external dependency: namely that Hugging Face's
@@ -204,7 +205,7 @@ def test_sentiment_model(self):
def test_image_classification_model(self):
io = gr.load(name="models/google/vit-base-patch16-224")
try:
assert io("gradio/test_data/lion.jpg")["label"] == "lion"
assert io("gradio/test_data/lion.jpg")["label"].startswith("lion")
except TooManyRequestsError:
pass

@@ -291,7 +292,9 @@ def test_text_to_image_model(self):
io = gr.load("models/osanseviero/BigGAN-deep-128")
try:
filename = io("chest")
assert filename.endswith(".jpg") or filename.endswith(".jpeg")
assert filename.lower().endswith(".jpg") or filename.lower().endswith(
".jpeg"
)
except TooManyRequestsError:
pass

@@ -491,7 +494,7 @@ def test_load_blocks_with_default_values():
)
def test_can_load_tabular_model_with_different_widget_data(hypothetical_readme):
with patch(
"gradio.external.get_tabular_examples", return_value=hypothetical_readme
"gradio.external_utils.get_tabular_examples", return_value=hypothetical_readme
):
io = gr.load("models/scikit-learn/tabular-playground")
check_dataframe(io.config)
2 changes: 1 addition & 1 deletion test/test_utils.py
@@ -12,14 +12,14 @@
from typing_extensions import Literal

from gradio import EventData, Request
from gradio.external_utils import format_ner_list
from gradio.utils import (
abspath,
append_unique_suffix,
assert_configs_are_equivalent_besides_ids,
check_function_inputs_match,
colab_check,
delete_none,
format_ner_list,
get_continuous_fn,
get_extension_from_file_path_or_url,
get_type_hints,
