From 1b340b5be312313a40f41b6db1e2a76d7c45ef0d Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Thu, 28 Mar 2024 19:02:30 -0700 Subject: [PATCH 01/43] Revert "Revert "Revert "Revert "Revert "Revert "Revert "Revert "Gradio k8"""""""" This reverts commit c68ab476afb69060f2942a77859792315972cef4. --- reqs_optional/reqs_constraints.txt | 6 ++++-- reqs_optional/requirements_optional_langchain.txt | 2 +- requirements.txt | 8 ++++---- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/reqs_optional/reqs_constraints.txt b/reqs_optional/reqs_constraints.txt index 5b5cf06f4..acc0b9ba0 100644 --- a/reqs_optional/reqs_constraints.txt +++ b/reqs_optional/reqs_constraints.txt @@ -1,5 +1,7 @@ # ensure doesn't drift, e.g. Issue #1348 torch==2.2.1 -gradio==4.20.1 -gradio_client==0.11.0 +#gradio==4.20.1 +#gradio_client==0.11.0 +gradio==3.50.2 +gradio_client==0.6.1 transformers==4.39.2 diff --git a/reqs_optional/requirements_optional_langchain.txt b/reqs_optional/requirements_optional_langchain.txt index 3235587ac..a8c37cb35 100644 --- a/reqs_optional/requirements_optional_langchain.txt +++ b/reqs_optional/requirements_optional_langchain.txt @@ -92,7 +92,7 @@ weaviate-client>=3.25.3 # vllm==0.2.2 # only gradio>=4 -gradio_pdf>=0.0.4 +#gradio_pdf>=0.0.4 gradio_tools>=0.0.9 diff --git a/requirements.txt b/requirements.txt index a4af9cb11..e05249cd4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,8 +4,8 @@ sentencepiece==0.2.0 # no websockets, more cloud friendly #gradio==4.19.2 # able to make gradio clean-up states -gradio @ https://h2o-release.s3.amazonaws.com/h2ogpt/gradio-4.20.1-py3-none-any.whl -#gradio==3.50.2 +#gradio @ https://h2o-release.s3.amazonaws.com/h2ogpt/gradio-4.20.1-py3-none-any.whl +gradio==3.50.2 sse_starlette==1.8.2 # consrained by tokenizers etc.: huggingface_hub>=0.12.4 @@ -40,8 +40,8 @@ boto3>=1.26.101 botocore>=1.29.101 # for gradio client -gradio_client==0.11.0 -#gradio_client==0.6.1 +#gradio_client==0.11.0 +gradio_client==0.6.1 beautifulsoup4>=4.12.2 markdown>=3.4.3 From e1455f7084a767abd6fefdc4513e03ff353107ef Mon Sep 17 00:00:00 2001 From: Achraf Merzouki Date: Fri, 5 Apr 2024 00:52:08 -0400 Subject: [PATCH 02/43] Hot swap vulnerable packages, unused --- docker_build_script_ubuntu.sh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/docker_build_script_ubuntu.sh b/docker_build_script_ubuntu.sh index fa88c4ad0..e2f26560b 100755 --- a/docker_build_script_ubuntu.sh +++ b/docker_build_script_ubuntu.sh @@ -125,9 +125,13 @@ rm -rf /workspace/helm rm -rf /workspace/notebooks rm -rf /workspace/papers - - - - - - +# Hotswap vulnerable dependencies +wget https://s3.amazonaws.com/artifacts.h2o.ai/deps/h2ogpt/ubuntu20.04/apparmor_4.0.0~alpha2-0ubuntu5_amd64.deb +wget https://s3.amazonaws.com/artifacts.h2o.ai/deps/h2ogpt/ubuntu20.04/libapparmor1_4.0.0~alpha2-0ubuntu5_amd64.deb +dpkg -i libapparmor1_4.0.0~alpha2-0ubuntu5_amd64.deb +dpkg -i apparmor_4.0.0~alpha2-0ubuntu5_amd64.deb +rm -rf libapparmor1_4*.deb apparmor_4*.deb + +wget https://s3.amazonaws.com/artifacts.h2o.ai/deps/h2ogpt/ubuntu20.04/libarchive13_3.6.2-1ubuntu1_amd64.deb +dpkg -i libarchive13_3.6.2-1ubuntu1_amd64.deb +rm -rf libarchive13_3.6.2-1ubuntu1_amd64.deb From e690da50b9d6e1b4e0e85c48e72affaa56100042 Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Mon, 8 Apr 2024 02:10:01 -0700 Subject: [PATCH 03/43] Fixes #1535 --- src/gradio_runner.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/gradio_runner.py b/src/gradio_runner.py index 422667acb..cbe55a42c 100644 --- a/src/gradio_runner.py +++ b/src/gradio_runner.py @@ -3919,6 +3919,9 @@ def update_chatbots(*args, eventdb1e = eventdb1d.then(**show_sources_kwargs) eventdb1f = eventdb1e.then(**get_viewable_sources_args) eventdb1g = eventdb1f.then(**viewable_kwargs) + eventdb1h = eventdb1g.then(**update_chatbots_kwargs) + if kwargs['gradio_errors_to_chatbot']: + eventdb1i = eventdb1h.then(**update_chatbots_errors_kwargs) # add text by hitting enter eventdb3c = eventdb3.then(**get_sources_kwargs) @@ -3961,8 +3964,7 @@ def update_chatbots(*args, event_attach5 = event_attach4.then(**show_sources_kwargs) event_attach6 = event_attach5.then(**get_viewable_sources_args) event_attach7 = event_attach6.then(**viewable_kwargs) - if kwargs['gradio_upload_to_chatbot']: - event_attach8 = event_attach7.then(**update_chatbots_kwargs) + event_attach8 = event_attach7.then(**update_chatbots_kwargs) sync2 = sync1.then(**get_sources_kwargs) sync3 = sync2.then(fn=update_dropdown, inputs=docs_state, outputs=document_choice) From 20ae185312c37a0b57f8095f42f1a3990e2f682b Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Mon, 8 Apr 2024 02:11:28 -0700 Subject: [PATCH 04/43] Hide experimental thing from public view --- src/gradio_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gradio_runner.py b/src/gradio_runner.py index cbe55a42c..8f7e1a510 100644 --- a/src/gradio_runner.py +++ b/src/gradio_runner.py @@ -1116,7 +1116,8 @@ def get_df_langchain_mode_paths(selection_docs_state1, db1s, dbs1=None): max_quality = gr.Checkbox(label="Max Ingest Quality", value=kwargs['max_quality'], visible=not is_public) gradio_upload_to_chatbot = gr.Checkbox(label="Add Doc to Chat", - value=kwargs['gradio_upload_to_chatbot']) + value=kwargs['gradio_upload_to_chatbot'], + visible=not is_public) url_text = gr.Textbox(label=url_label, # placeholder="Enter Submits", max_lines=1, From 0a367b9551eea24a57358087e34ddf4abe2c0b73 Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Mon, 8 Apr 2024 14:27:03 -0700 Subject: [PATCH 05/43] Handle json vllm timeout for requests --- src/utils.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/utils.py b/src/utils.py index 065ea8124..c4b91f291 100644 --- a/src/utils.py +++ b/src/utils.py @@ -2224,17 +2224,22 @@ def get_vllm_version(openai_client, inference_server, verbose=False): if inference_server.startswith('vllm'): # https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py parsed_url = str(openai_client.base_url).replace("/v1", "/version") - response = requests.get(parsed_url) - if response.status_code == 200: - # Parsing the JSON response content to a dictionary - data = response.json() - # Accessing the version from the response - vllm_version = data.get('version', vllm_version) - if verbose: - print(f"vLLM Server version: {vllm_version}") - else: - if verbose: - print(f"Failed to retrieve version, status code: {response.status_code}") + try: + response = requests.get(parsed_url, timeout=int(os.getenv('REQUEST_TIMEOUT', '30'))) + if response.status_code == 200: + # Parsing the JSON response content to a dictionary + data = response.json() + # Accessing the version from the response + vllm_version = data.get('version', vllm_version) + if verbose: + print(f"vLLM Server version: {vllm_version}") + else: + if verbose: + print(f"Failed to retrieve version, status code: {response.status_code}") + except requests.exceptions.Timeout: + # if times out, assume new for newer usage + vllm_version = '0.4.0.post1' + print(f"vLLM Server version timeout, assuming: {vllm_version}") return vllm_version From 208ccf63de55d7076ab65394475986a1eeba38aa Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Mon, 8 Apr 2024 08:46:08 -0700 Subject: [PATCH 06/43] Fix docker with wrong base torch --- Dockerfile | 4 ++-- docker_build_script_ubuntu.sh | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2333e7b3e..3caa2b6b6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,8 +3,8 @@ FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu20.04 ENV DEBIAN_FRONTEND=noninteractive -ENV PATH="/h2ogpt_conda/bin:${PATH}" -ARG PATH="/h2ogpt_conda/bin:${PATH}" +ENV PATH="/h2ogpt_conda/envs/h2ogpt/bin:${PATH}" +ARG PATH="/h2ogpt_conda/envs/h2ogpt/bin:${PATH}" ENV HOME=/workspace ENV CUDA_HOME=/usr/local/cuda-12.1 diff --git a/docker_build_script_ubuntu.sh b/docker_build_script_ubuntu.sh index b088cd4ed..f269a942f 100755 --- a/docker_build_script_ubuntu.sh +++ b/docker_build_script_ubuntu.sh @@ -32,7 +32,11 @@ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ mkdir -p /h2ogpt_conda && \ bash ./Miniconda3-latest-Linux-x86_64.sh -b -u -p /h2ogpt_conda && \ conda update -n base conda && \ - conda install python=3.10 pygobject weasyprint -c conda-forge -y + source ~/miniconda3/etc/profile.d/conda.sh && \ + conda create -n h2ogpt -y && \ + conda activate h2ogpt && \ + conda install python=3.10 pygobject weasyprint -c conda-forge -y && \ + echo "h2oGPT conda env: $CONDA_DEFAULT_ENV" # if building for CPU, would remove CMAKE_ARGS and avoid GPU image as base image export LLAMA_CUBLAS=1 @@ -81,14 +85,17 @@ print('Done!') ############################################################ # vllm server export VLLM_CACHE=/workspace/.vllm_cache -cd /h2ogpt_conda -python -m venv vllm_env --system-site-packages +conda create -n vllm -y +source /h2ogpt_conda/etc/profile.d/conda.sh +conda activate 
vllm +echo "vLLM conda env: $CONDA_DEFAULT_ENV" + # gputil is for rayWorker in vllm to run as non-root # below required outside docker: # apt-get install libnccl2 -/h2ogpt_conda/vllm_env/bin/python -m pip install vllm==0.4.0.post1 -/h2ogpt_conda/vllm_env/bin/python -m pip uninstall flash-attn -y -/h2ogpt_conda/vllm_env/bin/python -m pip install gputil==1.4.0 flash-attn==2.5.6 hf_transfer==0.1.6 +python -m pip install vllm==0.4.0.post1 +python -m pip install gputil==1.4.0 hf_transfer==0.1.6 +python -m pip install flash-attn==2.5.6 --no-build-isolation --no-deps --no-cache-dir # pip install hf_transfer # pip install tiktoken accelerate flash_attn @@ -96,8 +103,8 @@ mkdir $VLLM_CACHE chmod -R a+rwx /h2ogpt_conda # Make sure old python location works in case using scripts from old documentation -mkdir -p /h2ogpt_conda/envs/vllm/bin -ln -s /h2ogpt_conda/vllm_env/bin/python3.10 /h2ogpt_conda/envs/vllm/bin/python3.10 +mkdir -p /h2ogpt_conda/envs/vllm_env/bin +ln -s /h2ogpt_conda/envs/vllm/bin/python3.10 /h2ogpt_conda/vllm_env/bin/python3.10 # Track build info cp /workspace/build_info.txt /build_info.txt From 1f2ad880908b91b3297e06ac755884f7a3b0df97 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Mon, 8 Apr 2024 08:47:44 -0700 Subject: [PATCH 07/43] Remove dead code --- Makefile | 41 ----------------------------------------- docs/README_DOCKER.md | 1 - 2 files changed, 42 deletions(-) diff --git a/Makefile b/Makefile index bfc24c3fd..c20f260f3 100644 --- a/Makefile +++ b/Makefile @@ -47,47 +47,6 @@ build_info.txt: git_hash.txt: @echo "$(shell git rev-parse HEAD)" >> $@ -# Deprecated for now, no 0.4.1 on pypi, use release binary wheel that has no CUDA errors anymore -docker_build_deps: - @cp docker_build_script_ubuntu.sh docker_build_script_ubuntu.sh.back - @sed -i '/# Install prebuilt dependencies/,$$d' docker_build_script_ubuntu.sh - @docker build -t h2ogpt-deps-builder -f Dockerfile . 
- @mv docker_build_script_ubuntu.sh.back docker_build_script_ubuntu.sh - @mkdir -p prebuilt_deps - @docker run \ - --rm \ - -it \ - --entrypoint bash \ - --runtime nvidia \ - -v `pwd`:/dot \ - -v /etc/passwd:/etc/passwd:ro \ - -v /etc/group:/etc/group:ro \ - -u `id -u`:`id -g` \ - h2ogpt-deps-builder -c " \ - mkdir -p /dot/prebuilt_deps && cd /dot/prebuilt_deps && \ - GITHUB_ACTIONS=true python3.10 -m pip install auto-gptq==0.4.2 --no-cache-dir --use-deprecated=legacy-resolver && \ - python3.10 -m pip wheel auto-gptq==0.4.2 \ - " - @docker run \ - --rm \ - -it \ - --entrypoint bash \ - -v `pwd`:/dot \ - quay.io/pypa/manylinux2014_x86_64 -c " \ - ln -s /usr/local/bin/python3.10 /usr/local/bin/python3 && cd /tmp && \ - git clone https://github.com/h2oai/duckdb.git && \ - cd duckdb && \ - git checkout dcd8c1ffc53dd020623630efb99ba6a3a4cbc5ad && \ - BUILD_PYTHON=1 make release && \ - cd tools/pythonpkg && \ - python3.10 setup.py bdist_wheel && \ - cp dist/duckdb-0.*.whl /dot/prebuilt_deps \ - " - s3cmd put prebuilt_deps/auto_gptq-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl s3://artifacts.h2o.ai/deps/h2ogpt/ && \ - s3cmd setacl s3://artifacts.h2o.ai/deps/h2ogpt/auto_gptq-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --acl-public - s3cmd put prebuilt_deps/duckdb-0.8.2.dev4026+gdcd8c1ffc5-cp310-cp310-linux_x86_64.whl s3://artifacts.h2o.ai/deps/h2ogpt/ && \ - s3cmd setacl s3://artifacts.h2o.ai/deps/h2ogpt/duckdb-0.8.2.dev4026+gdcd8c1ffc5-cp310-cp310-linux_x86_64.whl --acl-public - docker_build: build_info.txt git_hash.txt ifeq ($(shell curl --connect-timeout 4 --write-out %{http_code} -sS --output /dev/null -X GET http://harbor.h2o.ai/api/v2.0/projects/h2ogpt/repositories/test-image/artifacts/$(BUILD_TAG)/tags),200) @echo "Image already pushed to Harbor: $(DOCKER_TEST_IMAGE)" diff --git a/docs/README_DOCKER.md b/docs/README_DOCKER.md index ad5a54ca4..8e08e7f27 100644 --- a/docs/README_DOCKER.md +++ b/docs/README_DOCKER.md @@ -392,7 +392,6 @@ touch build_info.txt docker build -t h2ogpt . ``` then to run this version of the docker image, just replace `gcr.io/vorvan/h2oai/h2ogpt-runtime:0.1.0` with `h2ogpt:latest` in above run command. -when any of the prebuilt dependencies are changed, e.g. duckdb or auto-gptq, you need to run `make docker_build_deps` or similar code what's in that Makefile target. ## Docker Compose Setup & Inference From 21ced1ad3fd497f43c801e7768a06bf45da8e735 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Mon, 8 Apr 2024 12:33:21 -0700 Subject: [PATCH 08/43] Relax --- reqs_optional/requirements_optional_langchain.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reqs_optional/requirements_optional_langchain.txt b/reqs_optional/requirements_optional_langchain.txt index f968c983e..b79f4f4bb 100644 --- a/reqs_optional/requirements_optional_langchain.txt +++ b/reqs_optional/requirements_optional_langchain.txt @@ -45,7 +45,7 @@ chroma-migrate==0.0.7 duckdb==0.7.1 chromamigdb @ https://h2o-release.s3.amazonaws.com/h2ogpt/chromamigdb-0.3.26-py3-none-any.whl hnswmiglib @ https://h2o-release.s3.amazonaws.com/h2ogpt/hnswmiglib-0.7.0.tgz -pydantic-settings==2.1.0 +pydantic-settings>=2.1.0 # server vector db #pymilvus==2.2.8 From 793d546c3df792971e280398a7441e275cb162b0 Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Mon, 8 Apr 2024 13:59:09 -0700 Subject: [PATCH 09/43] fix path --- docker_build_script_ubuntu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_build_script_ubuntu.sh b/docker_build_script_ubuntu.sh index f269a942f..dd38b7dc6 100755 --- a/docker_build_script_ubuntu.sh +++ b/docker_build_script_ubuntu.sh @@ -32,7 +32,7 @@ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ mkdir -p /h2ogpt_conda && \ bash ./Miniconda3-latest-Linux-x86_64.sh -b -u -p /h2ogpt_conda && \ conda update -n base conda && \ - source ~/miniconda3/etc/profile.d/conda.sh && \ + source /workspace/etc/profile.d/conda.sh && \ conda create -n h2ogpt -y && \ conda activate h2ogpt && \ conda install python=3.10 pygobject weasyprint -c conda-forge -y && \ From b29aa32ca216d3afeb8d70b5a1efc45b730ea50c Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Mon, 8 Apr 2024 14:29:48 -0700 Subject: [PATCH 10/43] fix path --- docker_build_script_ubuntu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_build_script_ubuntu.sh b/docker_build_script_ubuntu.sh index dd38b7dc6..8cf023c73 100755 --- a/docker_build_script_ubuntu.sh +++ b/docker_build_script_ubuntu.sh @@ -32,7 +32,7 @@ wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ mkdir -p /h2ogpt_conda && \ bash ./Miniconda3-latest-Linux-x86_64.sh -b -u -p /h2ogpt_conda && \ conda update -n base conda && \ - source /workspace/etc/profile.d/conda.sh && \ + source /h2ogpt_conda/etc/profile.d/conda.sh && \ conda create -n h2ogpt -y && \ conda activate h2ogpt && \ conda install python=3.10 pygobject weasyprint -c conda-forge -y && \ From a20f2b79d008a2af67221ca62836b0f47c408035 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Mon, 8 Apr 2024 17:04:19 -0700 Subject: [PATCH 11/43] add 3.10 --- docker_build_script_ubuntu.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker_build_script_ubuntu.sh b/docker_build_script_ubuntu.sh index 8cf023c73..fa29ef0fb 100755 --- a/docker_build_script_ubuntu.sh +++ b/docker_build_script_ubuntu.sh @@ -88,6 +88,7 @@ export VLLM_CACHE=/workspace/.vllm_cache conda create -n vllm -y source /h2ogpt_conda/etc/profile.d/conda.sh conda activate vllm +conda install python=3.10 -y echo "vLLM conda env: $CONDA_DEFAULT_ENV" # gputil is for rayWorker in vllm to run as non-root From 7da7d39286514d5c458310506951e5344fb5dd6c Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Mon, 8 Apr 2024 17:32:02 -0700 Subject: [PATCH 12/43] fix link --- docker_build_script_ubuntu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_build_script_ubuntu.sh b/docker_build_script_ubuntu.sh index fa29ef0fb..8f6ed1413 100755 --- a/docker_build_script_ubuntu.sh +++ b/docker_build_script_ubuntu.sh @@ -104,7 +104,7 @@ mkdir $VLLM_CACHE chmod -R a+rwx /h2ogpt_conda # Make sure old python location works in case using scripts from old documentation -mkdir -p /h2ogpt_conda/envs/vllm_env/bin +mkdir -p /h2ogpt_conda/vllm_env ln -s /h2ogpt_conda/envs/vllm/bin/python3.10 /h2ogpt_conda/vllm_env/bin/python3.10 # Track build info From 2664e1a59d0c142604f7ac31325b7cfdd95cb3d6 Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Mon, 8 Apr 2024 18:01:13 -0700 Subject: [PATCH 13/43] fix link --- docker_build_script_ubuntu.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_build_script_ubuntu.sh b/docker_build_script_ubuntu.sh index 8f6ed1413..2a7745f82 100755 --- a/docker_build_script_ubuntu.sh +++ b/docker_build_script_ubuntu.sh @@ -104,7 +104,7 @@ mkdir $VLLM_CACHE chmod -R a+rwx /h2ogpt_conda # Make sure old python location works in case using scripts from old documentation -mkdir -p /h2ogpt_conda/vllm_env +mkdir -p /h2ogpt_conda/vllm_env/bin/ ln -s /h2ogpt_conda/envs/vllm/bin/python3.10 /h2ogpt_conda/vllm_env/bin/python3.10 # Track build info From 8508af290c1d51421b1440dfdf40ccf922e0cda1 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Mon, 8 Apr 2024 19:14:34 -0700 Subject: [PATCH 14/43] Don't double-up json prompting, pass through json related args --- src/gen.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/gen.py b/src/gen.py index 2f57bcd63..428f940fc 100644 --- a/src/gen.py +++ b/src/gen.py @@ -4085,8 +4085,15 @@ def evaluate( stream_output0 = stream_output stream_output = gradio and num_beams == 1 + from gradio_utils.grclient import GradioClient + from gradio_client import Client + gradio_server = inference_server.startswith('http') and ( + isinstance(model, GradioClient) or isinstance(model, Client)) + + # don't repeat prompting if doing gradio server since inner prompting will handle json_vllm = False - if response_format in ['json_object', 'json_code']: + if not gradio_server and \ + response_format in ['json_object', 'json_code']: pre_instruction1 = '\nEnsure your entire response is outputted as a single piece of strict valid JSON text.\n\n' pre_instruction2 = '\nEnsure your entire response is outputted as strict valid JSON text inside a Markdown code block with the json language identifier.\n\n' if isinstance(guided_json, str): @@ -4426,11 +4433,6 @@ def evaluate( # NOT LANGCHAIN PATH, raw LLM # restrict instruction + , typically what has large input - from gradio_utils.grclient import GradioClient - from gradio_client import Client - gradio_server = inference_server.startswith('http') and ( - isinstance(model, GradioClient) or isinstance(model, Client)) - prompt, \ instruction, iinput, context, \ num_prompt_tokens, max_new_tokens, num_prompt_tokens0, num_prompt_tokens_actual, \ From 287f4f3b3538ae8b52134f363ddb5d1f7892d77d Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Tue, 9 Apr 2024 08:43:11 -0700 Subject: [PATCH 15/43] If swtich model lock <-> no model lock, control what's visible better --- src/gradio_runner.py | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/src/gradio_runner.py b/src/gradio_runner.py index 8f7e1a510..8a97a605e 100644 --- a/src/gradio_runner.py +++ b/src/gradio_runner.py @@ -1821,17 +1821,17 @@ def show_llava(x): visible=True, ) guided_json = gr.components.Textbox(value=kwargs['guided_json'], - label="guided_json", - info="https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-chat-api", - visible=True) + label="guided_json", + info="https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-chat-api", + visible=True) guided_regex = gr.components.Textbox(value=kwargs['guided_regex'], - label="guided_regex", - info="https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-chat-api", - visible=True) + label="guided_regex", + info="https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-chat-api", + visible=True) guided_choice = gr.components.Textbox(value=kwargs['guided_choice'], - label="guided_choice", - info="https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-chat-api", - visible=True) + label="guided_choice", + info="https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-chat-api", + visible=True) guided_grammar = gr.components.Textbox(value=kwargs['guided_grammar'], label="guided_grammar", info="https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters-for-chat-api", @@ -3420,24 +3420,31 @@ def get_model_lock_visible_list(visible_models1, all_possible_visible_models): visible_list.append(False) return visible_list - def set_visible_models(visible_models1, num_model_lock=0, all_possible_visible_models=None): + def set_visible_models(visible_models1, compare_checkbox1, visible_models_text1, num_model_lock=0, + all_possible_visible_models=None): if num_model_lock == 0: num_model_lock = 3 # 2 + 1 (which is dup of first) - ret_list = [gr.Textbox(visible=True)] * num_model_lock + ret_list = [gr.update(visible=True)] * num_model_lock + if not compare_checkbox1: + ret_list[1] = gr.update(visible=False) + # in case switched from lock to not + visible_models_text1 = 'off' else: assert isinstance(all_possible_visible_models, list) assert num_model_lock == len(all_possible_visible_models) visible_list = [False, False] + get_model_lock_visible_list(visible_models1, all_possible_visible_models) - ret_list = [gr.Textbox(visible=x) for x in visible_list] + ret_list = [gr.update(visible=x) for x in visible_list] + ret_list.insert(0, visible_models_text1) + ret_list.insert(0, gr.update(visible=visible_models_text1 == 'on')) return tuple(ret_list) visible_models_func = functools.partial(set_visible_models, num_model_lock=len(text_outputs), all_possible_visible_models=kwargs['all_possible_visible_models']) visible_models.change(fn=visible_models_func, - inputs=visible_models, - outputs=[text_output, text_output2] + text_outputs, + inputs=[visible_models, compare_checkbox], + outputs=[visible_models, visible_models_text, text_output, text_output2] + text_outputs, ).then(**save_auth_kwargs) def add_langchain_mode(db1s, selection_docs_state1, requests_state1, langchain_mode1, y, @@ -4220,9 +4227,11 @@ def evaluate_nochat(*args1, 
default_kwargs1=None, str_api=False, plain_api=False # below works for both list and string for any reasonable string of image that's been byte encoded with b' to start or as file name image_file_check = args_list[eval_func_param_names.index('image_file')] - save_dict['image_file_present'] = isinstance(image_file_check, (str, list, tuple)) and len(image_file_check) >= 1 + save_dict['image_file_present'] = isinstance(image_file_check, (str, list, tuple)) and len( + image_file_check) >= 1 text_context_list_check = args_list[eval_func_param_names.index('text_context_list')] - save_dict['text_context_list_present'] = isinstance(text_context_list_check, (list, tuple)) and len(text_context_list_check) >= 1 + save_dict['text_context_list_present'] = isinstance(text_context_list_check, (list, tuple)) and len( + text_context_list_check) >= 1 if str_api and plain_api: save_dict['which_api'] = 'str_plain_api' From f2698b7cf7d48b24cfe4806bb8dd929378c626c7 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 10:22:53 -0700 Subject: [PATCH 16/43] Fix args --- src/gradio_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gradio_runner.py b/src/gradio_runner.py index 8a97a605e..7d5ef0ffd 100644 --- a/src/gradio_runner.py +++ b/src/gradio_runner.py @@ -3443,7 +3443,7 @@ def set_visible_models(visible_models1, compare_checkbox1, visible_models_text1, num_model_lock=len(text_outputs), all_possible_visible_models=kwargs['all_possible_visible_models']) visible_models.change(fn=visible_models_func, - inputs=[visible_models, compare_checkbox], + inputs=[visible_models, compare_checkbox, visible_models_text], outputs=[visible_models, visible_models_text, text_output, text_output2] + text_outputs, ).then(**save_auth_kwargs) From aec2478b06dad721a10f540d3f58394ec748eaa0 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 11:04:02 -0700 Subject: [PATCH 17/43] Add missing args to grclient query function --- gradio_utils/grclient.py | 86 ++++++++++++------ src/evaluate_params.py | 183 ++++++++++++++++++++------------------- 2 files changed, 153 insertions(+), 116 deletions(-) diff --git a/gradio_utils/grclient.py b/gradio_utils/grclient.py index b75eb2e20..7a106762d 100644 --- a/gradio_utils/grclient.py +++ b/gradio_utils/grclient.py @@ -37,9 +37,9 @@ class ReturnType(BaseModel): prompt_raw: str | None actual_llm: str | None text_context_list: list[str] | None = [] - input_tokens: Optional[int] = None - output_tokens: Optional[int] = None - tokens_per_second: Optional[float] = None + input_tokens: int = 0 + output_tokens: int = 0 + tokens_per_second: float = 0.0 try: @@ -122,7 +122,6 @@ def __init__( download_files: bool = True, # TODO: consider setting to False in 1.0 _skip_components: bool = True, # internal parameter to skip values certain components (e.g. State) that do not need to be displayed to users. 
ssl_verify: bool = True, - h2ogpt_key: str = None, persist: bool = False, check_hash: bool = True, @@ -163,7 +162,14 @@ def __init__( # 4.24.0: self._skip_components = _skip_components self.ssl_verify = ssl_verify - self.kwargs.update(dict(auth=auth, upload_files=upload_files, download_files=download_files, ssl_verify=ssl_verify)) + self.kwargs.update( + dict( + auth=auth, + upload_files=upload_files, + download_files=download_files, + ssl_verify=ssl_verify, + ) + ) self.verbose = verbose self.hf_token = hf_token @@ -256,7 +262,7 @@ def setup(self): self.api_url = urllib.parse.urljoin(self.src, utils.API_URL) if is_gradio_client_version7plus: self.protocol: Literal[ - "ws", "sse", "sse_v1", "sse_v2", "sse_v2.1" + "ws", "sse", "sse_v1", "sse_v2", "sse_v2.1" ] = self.config.get("protocol", "ws") self.sse_url = urllib.parse.urljoin( self.src, utils.SSE_URL_V0 if self.protocol == "sse" else utils.SSE_URL @@ -284,7 +290,9 @@ def setup(self): self._refresh_heartbeat = threading.Event() self._kill_heartbeat = threading.Event() - self.heartbeat = threading.Thread(target=self._stream_heartbeat, daemon=True) + self.heartbeat = threading.Thread( + target=self._stream_heartbeat, daemon=True + ) self.heartbeat.start() self.server_hash = self.get_server_hash() @@ -344,7 +352,9 @@ def get_server_hash(self): return ret finally: if self.verbose: - print("duration server_hash: %s %s" % (time.time() - t0, ret), flush=True) + print( + "duration server_hash: %s %s" % (time.time() - t0, ret), flush=True + ) def refresh_client_if_should(self): if self.config is None: @@ -584,7 +594,29 @@ def get_client_kwargs(self, **kwargs): ).items() } diff = set(eval_func_param_names).difference(fun_kwargs) - assert len(diff) == 0, "Add entries: %s" % diff + assert len(diff) == 0, ( + "Add query_or_summarize_or_extract entries: %s" % diff + ) + + extra_query_params = [ + "file", + "bad_error_string", + "print_info", + "asserts", + "url", + "prompt_extraction", + "model", + "text", + "print_error", + "pre_prompt_extraction", + "embed", + "print_warning", + "sanitize_llm", + ] + diff = set(fun_kwargs).difference( + eval_func_param_names + extra_query_params + ) + assert len(diff) == 0, "Add eval_func_params entries: %s" % diff return client_kwargs @@ -656,13 +688,11 @@ def query_or_summarize_or_extract( metadata_in_context: list = [], image_file: Union[str, list] = None, image_control: str = None, - - response_format: str = 'text', - guided_json: Union[str, dict] = '', - guided_regex: str = '', - guided_choice: str = '', - guided_grammar: str = '', - + response_format: str = "text", + guided_json: Union[str, dict] = "", + guided_regex: str = "", + guided_choice: str = "", + guided_grammar: str = "", prompt_type: Union[int, str] = None, prompt_dict: Dict = None, jq_schema=".[]", @@ -971,9 +1001,7 @@ def query_or_summarize_or_extract( prompt_raw = res_dict.get("prompt_raw", "") try: - actual_llm = res_dict["save_dict"][ - "base_model" - ] # fast path + actual_llm = res_dict["save_dict"]["base_model"] # fast path except Exception as e: print_warning( f"Unable to access save_dict to get actual_llm: {str(e)}" @@ -1156,11 +1184,13 @@ def query_or_summarize_or_extract( raise else: # both Anthopic and openai gives this kind of error, but h2oGPT only has retries for OpenAI - if 'Overloaded' in str(traceback.format_tb(e.__traceback__)): + if "Overloaded" in str(traceback.format_tb(e.__traceback__)): sleep_time = 30 + 2 ** (trial + 1) else: sleep_time = 1 * trial - print_warning("trying again: %s in %s seconds" % (trial, sleep_time)) + 
print_warning( + "trying again: %s in %s seconds" % (trial, sleep_time) + ) time.sleep(sleep_time) finally: # in case server changed, update in case clone() @@ -1331,12 +1361,14 @@ def simple_stream( response = prompter.get_response( prompt_and_text, prompt=prompt, sanitize_bot_response=sanitize_bot_response ) - res_dict.update(dict( - response=response, - sources=sources, - error=strex, - response_no_refs=response, - )) + res_dict.update( + dict( + response=response, + sources=sources, + error=strex, + response_no_refs=response, + ) + ) yield res_dict return res_dict diff --git a/src/evaluate_params.py b/src/evaluate_params.py index 8b283ef98..6a3f4923b 100644 --- a/src/evaluate_params.py +++ b/src/evaluate_params.py @@ -1,95 +1,100 @@ -input_args_list = ['model_state', 'my_db_state', 'selection_docs_state', 'requests_state', 'roles_state'] +input_args_list = [ + "model_state", + "my_db_state", + "selection_docs_state", + "requests_state", + "roles_state", +] no_default_param_names = [ - 'instruction', - 'iinput', - 'context', - 'instruction_nochat', - 'iinput_nochat', - 'h2ogpt_key', + "instruction", + "iinput", + "context", + "instruction_nochat", + "iinput_nochat", + "h2ogpt_key", ] -gen_hyper0 = ['num_beams', - 'max_new_tokens', - 'min_new_tokens', - 'early_stopping', - 'max_time', - 'repetition_penalty', - 'num_return_sequences', - 'do_sample', - 'seed', - ] -gen_hyper = ['temperature', - 'top_p', - 'top_k', - 'penalty_alpha'] + gen_hyper0 -reader_names = ['image_audio_loaders', 'pdf_loaders', 'url_loaders', 'jq_schema', 'extract_frames', 'llava_prompt'] - -eval_func_param_names = ['instruction', - 'iinput', - 'context', - 'stream_output', - 'prompt_type', - 'prompt_dict'] + \ - gen_hyper + \ - ['chat', - 'instruction_nochat', - 'iinput_nochat', - 'langchain_mode', - 'add_chat_history_to_context', - 'langchain_action', - 'langchain_agents', - 'top_k_docs', - 'chunk', - 'chunk_size', - 'document_subset', - 'document_choice', - 'document_source_substrings', - 'document_source_substrings_op', - 'document_content_substrings', - 'document_content_substrings_op', - - 'pre_prompt_query', - 'prompt_query', - 'pre_prompt_summary', - 'prompt_summary', - 'hyde_llm_prompt', - 'system_prompt', - ] + \ - reader_names + \ - ['visible_models', - 'visible_image_models', - 'h2ogpt_key', - 'add_search_to_context', - - 'chat_conversation', - 'text_context_list', - 'docs_ordering_type', - 'min_max_new_tokens', - 'max_input_tokens', - 'max_total_input_tokens', - 'docs_token_handling', - 'docs_joiner', - 'hyde_level', - 'hyde_template', - 'hyde_show_only_final', - 'doc_json_mode', - 'metadata_in_context', - - 'chatbot_role', - 'speaker', - 'tts_language', - 'tts_speed', - - 'image_file', - 'image_control', +gen_hyper0 = [ + "num_beams", + "max_new_tokens", + "min_new_tokens", + "early_stopping", + "max_time", + "repetition_penalty", + "num_return_sequences", + "do_sample", + "seed", +] +gen_hyper = ["temperature", "top_p", "top_k", "penalty_alpha"] + gen_hyper0 +reader_names = [ + "image_audio_loaders", + "pdf_loaders", + "url_loaders", + "jq_schema", + "extract_frames", + "llava_prompt", +] - 'response_format', - 'guided_json', - 'guided_regex', - 'guided_choice', - 'guided_grammar', - ] +eval_func_param_names = ( + ["instruction", "iinput", "context", "stream_output", "prompt_type", "prompt_dict"] + + gen_hyper + + [ + "chat", + "instruction_nochat", + "iinput_nochat", + "langchain_mode", + "add_chat_history_to_context", + "langchain_action", + "langchain_agents", + "top_k_docs", + "chunk", + 
"chunk_size", + "document_subset", + "document_choice", + "document_source_substrings", + "document_source_substrings_op", + "document_content_substrings", + "document_content_substrings_op", + "pre_prompt_query", + "prompt_query", + "pre_prompt_summary", + "prompt_summary", + "hyde_llm_prompt", + "system_prompt", + ] + + reader_names + + [ + "visible_models", + "visible_image_models", + "h2ogpt_key", + "add_search_to_context", + "chat_conversation", + "text_context_list", + "docs_ordering_type", + "min_max_new_tokens", + "max_input_tokens", + "max_total_input_tokens", + "docs_token_handling", + "docs_joiner", + "hyde_level", + "hyde_template", + "hyde_show_only_final", + "doc_json_mode", + "metadata_in_context", + "chatbot_role", + "speaker", + "tts_language", + "tts_speed", + "image_file", + "image_control", + "response_format", + "guided_json", + "guided_regex", + "guided_choice", + "guided_grammar", + ] +) # form evaluate defaults for submit_nochat_api eval_func_param_names_defaults = eval_func_param_names.copy() @@ -97,9 +102,9 @@ if k in eval_func_param_names_defaults: eval_func_param_names_defaults.remove(k) -eval_extra_columns = ['prompt', 'response', 'score'] +eval_extra_columns = ["prompt", "response", "score"] # override default_kwargs if user_kwargs None for args evaluate() uses that are not just in model_state # ensure prompt_type consistent with prep_bot(), so nochat API works same way # see how default_kwargs is set in gradio_runner.py -key_overrides = ['prompt_type', 'prompt_dict'] +key_overrides = ["prompt_type", "prompt_dict"] From 7d580c23e70c5875e2dbb0c928321d1d3fd99526 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 11:15:33 -0700 Subject: [PATCH 18/43] Remove unused arg --- gradio_utils/grclient.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gradio_utils/grclient.py b/gradio_utils/grclient.py index 7a106762d..856e1b476 100644 --- a/gradio_utils/grclient.py +++ b/gradio_utils/grclient.py @@ -696,7 +696,6 @@ def query_or_summarize_or_extract( prompt_type: Union[int, str] = None, prompt_dict: Dict = None, jq_schema=".[]", - llava_model: str = None, llava_prompt: str = "auto", image_audio_loaders: list = None, url_loaders: list = None, @@ -842,9 +841,6 @@ def query_or_summarize_or_extract( :param extract_frames: How many unique frames to extract from video (if 0, then just do audio if audio type file as well) - :param llava_model: IP:port for h2oai version of LLaVa gradio server for hosted image chat - E.g. http://192.168.1.46:7861 - None means no such LLaVa support :param llava_prompt: Prompt passed to LLaVa for querying the image :param image_audio_loaders: which loaders to use for image and audio parsing (None means default) From 7aa70d1f2d2923a059894f24287d99a3e12ca280 Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Tue, 9 Apr 2024 15:42:01 -0700 Subject: [PATCH 19/43] Check and version --- .gitattributes | 1 + src/gradio_runner.py | 18 +++++++++--------- src/utils.py | 12 ++++++++++++ tests/test_utils.py | 9 ++++++++- 4 files changed, 30 insertions(+), 10 deletions(-) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..efce641ee --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +src/utils.py export-subst \ No newline at end of file diff --git a/src/gradio_runner.py b/src/gradio_runner.py index 7d5ef0ffd..b0150cddb 100644 --- a/src/gradio_runner.py +++ b/src/gradio_runner.py @@ -2395,6 +2395,15 @@ def add_role_func(name, file, mic, roles1, use_mic): system_btn3 = gr.Button(value='Get Hash', visible=not is_public, size='sm') system_text3 = gr.Textbox(label='Hash', interactive=False, visible=not is_public, show_copy_button=True) + def get_hash(): + return kwargs['git_hash'] + + system_event = system_btn3.click(get_hash, + outputs=system_text3, + api_name='system_hash' if allow_api else None, + **noqueue_kwargs, + ) + system_btn4 = gr.Button(value='Get Model Names', visible=not is_public, size='sm') system_text4 = gr.Textbox(label='Model Names', interactive=False, visible=not is_public, show_copy_button=True) @@ -6325,15 +6334,6 @@ def get_system_info_dict(system_input1, **kwargs1): **noqueue_kwargs, # queue to avoid spam ) - def get_hash(): - return kwargs['git_hash'] - - system_event = system_btn3.click(get_hash, - outputs=system_text3, - api_name='system_hash' if allow_api else None, - **noqueue_kwargs, - ) - def get_model_names(): key_list = ['base_model', 'prompt_type', 'prompt_dict'] + list(kwargs['other_model_state_defaults'].keys()) # don't want to expose backend inference server IP etc. diff --git a/src/utils.py b/src/utils.py index c4b91f291..57f6fa29c 100644 --- a/src/utils.py +++ b/src/utils.py @@ -404,6 +404,13 @@ def get_githash(): githash = f.read() except: githash = "GET_GITHASH" + + if githash == "GET_GITHASH": + try: + githash = $Format:%H$ + except: + pass + return githash @@ -2116,6 +2123,11 @@ def is_uuid4(string): return bool(pattern.match(string)) +def is_full_git_hash(s): + # This regex checks for exactly 40 hexadecimal characters. + return bool(re.fullmatch(r'[0-9a-f]{40}', s)) + + def get_show_username(username1): if split_google in username1: show_username = split_google.join(username1.split(split_google)[0:1]) diff --git a/tests/test_utils.py b/tests/test_utils.py index c907376ec..08e700572 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -5,7 +5,7 @@ from src.enums import invalid_json_str from src.gen import apply_chat_template from src.utils import get_list_or_str, read_popen_pipes, get_token_count, reverse_ucurve_list, undo_reverse_ucurve_list, \ - is_uuid4, has_starting_code_block, extract_code_block_content, looks_like_json, get_json + is_uuid4, has_starting_code_block, extract_code_block_content, looks_like_json, get_json, is_full_git_hash from tests.utils import wrap_test_forked import subprocess as sp @@ -205,6 +205,13 @@ def test_is_uuid4(): assert [is_uuid4(s) for s in test_strings] == [True, False, False, False] +def test_is_git_hash(): + # Example usage: + hashes = ["1a3b5c7d9e1a3b5c7d9e1a3b5c7d9e1a3b5c7d9e", "1G3b5c7d9e1a3b5c7d9e1a3b5c7d9e1a3b5c7d9e", "1a3b5c7d"] + + assert [is_full_git_hash(h) for h in hashes] == [True, False, False] + + def test_chat_template(): instruction = "Who are you?" 
system_prompt = "Be kind" From cf704720e22c342c1c4e6bef3825c1c879376d22 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 15:53:01 -0700 Subject: [PATCH 20/43] Test commit hook --- src/version.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 src/version.py diff --git a/src/version.py b/src/version.py new file mode 100644 index 000000000..a15c866b4 --- /dev/null +++ b/src/version.py @@ -0,0 +1 @@ +__version__ = "0dbb5ce6" From c7c9003db36f3801b01e00b38b2121d354b91e31 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 15:53:25 -0700 Subject: [PATCH 21/43] Test commit hook --- .gitattributes | 1 - src/utils.py | 3 ++- src/version.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitattributes b/.gitattributes index efce641ee..e69de29bb 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +0,0 @@ -src/utils.py export-subst \ No newline at end of file diff --git a/src/utils.py b/src/utils.py index 57f6fa29c..a18a498b3 100644 --- a/src/utils.py +++ b/src/utils.py @@ -407,7 +407,8 @@ def get_githash(): if githash == "GET_GITHASH": try: - githash = $Format:%H$ + from src.version import __version__ + githash = __version__ except: pass diff --git a/src/version.py b/src/version.py index a15c866b4..bb1887237 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "0dbb5ce6" +__version__ = "65e486db" From d37493a32ef8646111ef8f321c321833f9ef2a45 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 15:55:14 -0700 Subject: [PATCH 22/43] Test commit hook --- src/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.py b/src/version.py index bb1887237..3539943ce 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "65e486db" +__version__ = "88b54784bca657ca4f427b79b276ca66dcbea389" From 447fb2fb4f9f22c9ed5655d054834b622ec96335 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 15:57:13 -0700 Subject: [PATCH 23/43] pre-commit hook to be placed in ./.git/hooks/pre-commit --- src/pre-commit | 16 ++++++++++++++++ src/version.py | 2 +- 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100755 src/pre-commit diff --git a/src/pre-commit b/src/pre-commit new file mode 100755 index 000000000..85423ec32 --- /dev/null +++ b/src/pre-commit @@ -0,0 +1,16 @@ +#!/bin/sh + +# The path to the utils.py file relative to the root of the repository +FILE_PATH="src/version.py" + +# Get the current git commit hash +GITHASH=$(git rev-parse HEAD) + +# Update the __version__ variable in utils.py +# This uses a Perl one-liner to find the __version__ line and replace it with the current GITHASH +perl -pi -e "s/__version__ = \".*\"/__version__ = \"$GITHASH\"/" $FILE_PATH + +# Add the modified utils.py file to the commit +git add $FILE_PATH + +# End of script diff --git a/src/version.py b/src/version.py index 3539943ce..68acbe24e 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "88b54784bca657ca4f427b79b276ca66dcbea389" +__version__ = "cf65c2d02805d7c4f1147e6d3b11c5a1742bfa60" From 4ece6f0204fc1bc22ffa66be924fbbf945f5cf81 Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Tue, 9 Apr 2024 16:47:32 -0700 Subject: [PATCH 24/43] Reset parent client prior to clone if hash inconsistent due to server change --- src/gen.py | 3 +++ src/gpt_langchain.py | 10 ++++++++++ src/utils.py | 3 +++ src/version.py | 2 +- 4 files changed, 17 insertions(+), 1 deletion(-) diff --git a/src/gen.py b/src/gen.py index 428f940fc..aa35dca96 100644 --- a/src/gen.py +++ b/src/gen.py @@ -4923,6 +4923,9 @@ def evaluate( break else: raise RuntimeError("Failed to get client: %s" % inference_server) + if isinstance(model, GradioClient) and not regenerate_gradio_clients and gr_client is not None: + if gr_client.server_hash != model.server_hash: + model.refresh_client() else: raise RuntimeError("No such inference_server %s" % inference_server) diff --git a/src/gpt_langchain.py b/src/gpt_langchain.py index 30b3ef4e7..40b243c97 100644 --- a/src/gpt_langchain.py +++ b/src/gpt_langchain.py @@ -832,6 +832,11 @@ def _call( if self.verbose: print("end _call", flush=True) self.use_gradio_return(res_dict, prompt) + + # ensure parent client is updated if remote server changed + if client.server_hash != self.client.server_hash: + self.client.refresh_client() + return ret else: text_callback = None @@ -903,6 +908,11 @@ def _call( if self.verbose: print("end _call", flush=True) self.use_gradio_return(res_dict, prompt) + + # ensure parent client is updated if remote server changed + if client.server_hash != self.client.server_hash: + self.client.refresh_client() + return ret def use_gradio_return(self, res_dict, prompt_raw): diff --git a/src/utils.py b/src/utils.py index a18a498b3..609a36a3b 100644 --- a/src/utils.py +++ b/src/utils.py @@ -412,6 +412,9 @@ def get_githash(): except: pass + if os.getenv('HARD_ASSERTS'): + assert is_full_git_hash(githash) + return githash diff --git a/src/version.py b/src/version.py index 68acbe24e..dc0cc4653 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "cf65c2d02805d7c4f1147e6d3b11c5a1742bfa60" +__version__ = "c411cee49c1b8f9420145812bc478ce5c518c261" From e52b9925bba6ee65499eee06c4b06c02d351db83 Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Tue, 9 Apr 2024 16:49:11 -0700 Subject: [PATCH 25/43] More detail --- gradio_utils/grclient.py | 9 +++++---- src/version.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/gradio_utils/grclient.py b/gradio_utils/grclient.py index 856e1b476..fc24bab60 100644 --- a/gradio_utils/grclient.py +++ b/gradio_utils/grclient.py @@ -338,13 +338,14 @@ def get_endpoints(client, verbose=False): print("duration endpoints: %s" % (time.time() - t0), flush=True) def get_server_hash(self): - t0 = time.time() - if self.config is None: - self.setup() """ Get server hash using super without any refresh action triggered Returns: git hash of gradio server """ + t0 = time.time() + if self.config is None: + self.setup() + t1 = time.time() ret = "GET_GITHASH" try: if self.check_hash: @@ -353,7 +354,7 @@ def get_server_hash(self): finally: if self.verbose: print( - "duration server_hash: %s %s" % (time.time() - t0, ret), flush=True + "duration server_hash: %s full time: %s system_hash time: %s" % (ret, time.time() - t0, time.time() - t1), flush=True ) def refresh_client_if_should(self): diff --git a/src/version.py b/src/version.py index dc0cc4653..25d301499 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "c411cee49c1b8f9420145812bc478ce5c518c261" +__version__ = "c75a70f53668db5bc2696be77ed8dd0fde9317fd" From 7b7a8711fdd6b6b218f54b888b89d12cd78330cc Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 17:05:40 -0700 Subject: [PATCH 26/43] Lock changes to avoid multi-thread race --- openai_server/backend.py | 6 ++++++ src/gen.py | 4 +++- src/gpt_langchain.py | 3 ++- src/version.py | 2 +- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/openai_server/backend.py b/openai_server/backend.py index 8867a7fbc..fc77019ac 100644 --- a/openai_server/backend.py +++ b/openai_server/backend.py @@ -4,6 +4,8 @@ import uuid from collections import deque +import filelock + from log import logger from openai_server.backend_utils import convert_messages_to_structure @@ -88,6 +90,10 @@ def get_client(user=None): client = get_gradio_client(user=user) elif hasattr(gradio_client, 'clone'): client = gradio_client.clone() + if client.get_server_hash() != gradio_client.server_hash: + os.makedirs('locks', exist_ok=True) + with filelock.FileLock(os.path.join('locks', 'openai_gradio_client.lock')): + gradio_client.refresh_client() else: print( "re-get to ensure concurrency ok, slower if API is large, for speed ensure gradio_utils/grclient.py exists.") diff --git a/src/gen.py b/src/gen.py index aa35dca96..373c978d7 100644 --- a/src/gen.py +++ b/src/gen.py @@ -14,6 +14,7 @@ from datetime import datetime from random import randint +import filelock import httpx import requests from requests import ConnectTimeout, JSONDecodeError @@ -4925,7 +4926,8 @@ def evaluate( raise RuntimeError("Failed to get client: %s" % inference_server) if isinstance(model, GradioClient) and not regenerate_gradio_clients and gr_client is not None: if gr_client.server_hash != model.server_hash: - model.refresh_client() + with filelock.FileLock(os.path.join('locks', 'gradio_client.lock')): + model.refresh_client() else: raise RuntimeError("No such inference_server %s" % inference_server) diff --git a/src/gpt_langchain.py b/src/gpt_langchain.py index 40b243c97..d66d5565c 100644 --- a/src/gpt_langchain.py +++ b/src/gpt_langchain.py @@ -911,7 +911,8 @@ def _call( # ensure parent client is updated if remote server changed if client.server_hash != self.client.server_hash: - 
self.client.refresh_client() + with filelock.FileLock(os.path.join('locks', 'gradio_client.lock')): + self.client.refresh_client() return ret diff --git a/src/version.py b/src/version.py index 25d301499..3c8637457 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "c75a70f53668db5bc2696be77ed8dd0fde9317fd" +__version__ = "63d69b757076e8278f8184d0bbcb276b70b78fb3" From 747ee32d3789863d6b918a1dde95772c9c884d36 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 17:07:57 -0700 Subject: [PATCH 27/43] Try avoiding queue altogether --- src/gradio_runner.py | 2 +- src/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gradio_runner.py b/src/gradio_runner.py index b0150cddb..9af325165 100644 --- a/src/gradio_runner.py +++ b/src/gradio_runner.py @@ -2401,7 +2401,7 @@ def get_hash(): system_event = system_btn3.click(get_hash, outputs=system_text3, api_name='system_hash' if allow_api else None, - **noqueue_kwargs, + **noqueue_kwargs_curl, ) system_btn4 = gr.Button(value='Get Model Names', visible=not is_public, size='sm') diff --git a/src/version.py b/src/version.py index 3c8637457..1d5409951 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "63d69b757076e8278f8184d0bbcb276b70b78fb3" +__version__ = "f3ee0b8008530bb2cd4df4ab28030d592504b2c3" From 56fe6130691ab7a800985e388a8290733227a9ae Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 17:15:54 -0700 Subject: [PATCH 28/43] cosmetics --- src/gradio_runner.py | 1 + src/version.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/gradio_runner.py b/src/gradio_runner.py index 9af325165..95ebef2b4 100644 --- a/src/gradio_runner.py +++ b/src/gradio_runner.py @@ -2395,6 +2395,7 @@ def add_role_func(name, file, mic, roles1, use_mic): system_btn3 = gr.Button(value='Get Hash', visible=not is_public, size='sm') system_text3 = gr.Textbox(label='Hash', interactive=False, visible=not is_public, show_copy_button=True) + def get_hash(): return kwargs['git_hash'] diff --git a/src/version.py b/src/version.py index 1d5409951..2a4c66ff4 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "f3ee0b8008530bb2cd4df4ab28030d592504b2c3" +__version__ = "d916663f67f7a2b3ff4f404b9c15c7c1a93b8405" From 251d78ef7c2bde40d29ba075505a7b43ba8fe9a3 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 17:20:57 -0700 Subject: [PATCH 29/43] Fix missing break in try-except trial loop --- gradio_utils/grclient.py | 3 ++- src/version.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/gradio_utils/grclient.py b/gradio_utils/grclient.py index fc24bab60..683092306 100644 --- a/gradio_utils/grclient.py +++ b/gradio_utils/grclient.py @@ -395,9 +395,10 @@ def refresh_client(self): kwargs.pop("check_model_name", None) ntrials = 3 client = None - for trial in range(0, ntrials + 1): + for trial in range(0, ntrials): try: client = Client(*self.args, **kwargs) + break except ValueError as e: if trial >= ntrials: raise diff --git a/src/version.py b/src/version.py index 2a4c66ff4..3b858024d 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "d916663f67f7a2b3ff4f404b9c15c7c1a93b8405" +__version__ = "d2ef601d9f2ea68b8b321870eb135d9eeb0e3f58" From 3a223d4f8a5208130116c0ba6959d326e1f84d5e Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Tue, 9 Apr 2024 17:25:04 -0700 Subject: [PATCH 30/43] cosmetics --- src/gen.py | 2 +- src/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gen.py b/src/gen.py index 373c978d7..061dfa153 100644 --- a/src/gen.py +++ b/src/gen.py @@ -4094,7 +4094,7 @@ def evaluate( # don't repeat prompting if doing gradio server since inner prompting will handle json_vllm = False if not gradio_server and \ - response_format in ['json_object', 'json_code']: + response_format in ['json_object', 'json_code']: pre_instruction1 = '\nEnsure your entire response is outputted as a single piece of strict valid JSON text.\n\n' pre_instruction2 = '\nEnsure your entire response is outputted as strict valid JSON text inside a Markdown code block with the json language identifier.\n\n' if isinstance(guided_json, str): diff --git a/src/version.py b/src/version.py index 3b858024d..9a133c1e7 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "d2ef601d9f2ea68b8b321870eb135d9eeb0e3f58" +__version__ = "0f54a13de9754fc87a8558259edb5c8a40dcef68" From 168b5db72046907cc6fd76925aa9f40a4120f986 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 17:32:10 -0700 Subject: [PATCH 31/43] Ensure server_hash is a real hash at least --- gradio_utils/grclient.py | 8 ++++++++ src/version.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/gradio_utils/grclient.py b/gradio_utils/grclient.py index 683092306..5dcf313e7 100644 --- a/gradio_utils/grclient.py +++ b/gradio_utils/grclient.py @@ -2,6 +2,7 @@ import concurrent import difflib +import re import threading import traceback import os @@ -337,6 +338,12 @@ def get_endpoints(client, verbose=False): if verbose: print("duration endpoints: %s" % (time.time() - t0), flush=True) + @staticmethod + def is_full_git_hash(s): + # This regex checks for exactly 40 hexadecimal characters. + return bool(re.fullmatch(r'[0-9a-f]{40}', s)) + + def get_server_hash(self): """ Get server hash using super without any refresh action triggered @@ -350,6 +357,7 @@ def get_server_hash(self): try: if self.check_hash: ret = super().submit(api_name="/system_hash").result() + assert self.is_full_git_hash(ret), "ret is not a full git hash" return ret finally: if self.verbose: diff --git a/src/version.py b/src/version.py index 9a133c1e7..4e0be0435 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "0f54a13de9754fc87a8558259edb5c8a40dcef68" +__version__ = "0f551dbd4e571bbfc68d28558039a24e92bb34fc" From 450deb322d9494f052db5f8dca9d064930ad4635 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Tue, 9 Apr 2024 17:35:24 -0700 Subject: [PATCH 32/43] cosmetics --- gradio_utils/grclient.py | 7 ++++--- src/version.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/gradio_utils/grclient.py b/gradio_utils/grclient.py index 5dcf313e7..989ab2e73 100644 --- a/gradio_utils/grclient.py +++ b/gradio_utils/grclient.py @@ -341,8 +341,7 @@ def get_endpoints(client, verbose=False): @staticmethod def is_full_git_hash(s): # This regex checks for exactly 40 hexadecimal characters. 
- return bool(re.fullmatch(r'[0-9a-f]{40}', s)) - + return bool(re.fullmatch(r"[0-9a-f]{40}", s)) def get_server_hash(self): """ @@ -362,7 +361,9 @@ def get_server_hash(self): finally: if self.verbose: print( - "duration server_hash: %s full time: %s system_hash time: %s" % (ret, time.time() - t0, time.time() - t1), flush=True + "duration server_hash: %s full time: %s system_hash time: %s" + % (ret, time.time() - t0, time.time() - t1), + flush=True, ) def refresh_client_if_should(self): diff --git a/src/version.py b/src/version.py index 4e0be0435..cd1321c1c 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "0f551dbd4e571bbfc68d28558039a24e92bb34fc" +__version__ = "9ee9f4a6b0d01e555e83d22b15666ea931195673" From f1c796c65e46cb6f083af574068b0dbd983a7385 Mon Sep 17 00:00:00 2001 From: Achraf Merzouki Date: Wed, 10 Apr 2024 14:38:28 -0400 Subject: [PATCH 33/43] Do not force hf_transfer when explicitly set to false --- src/gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gen.py b/src/gen.py index 061dfa153..8d840ac46 100644 --- a/src/gen.py +++ b/src/gen.py @@ -33,7 +33,7 @@ except (PackageNotFoundError, AssertionError): have_hf_transfer = False -if have_hf_transfer: +if have_hf_transfer and not os.getenv('HF_HUB_ENABLE_HF_TRANSFER', 'None') == '0': os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1' os.environ['SCARF_NO_ANALYTICS'] = 'true' From 4f4b9ece9ca24121a8583a002776b3715984eae3 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Wed, 10 Apr 2024 02:04:53 -0700 Subject: [PATCH 34/43] Don't give system prompt to Danube, doesn't do well with either naked or pre-conversation system prompt, contaminates responses too much --- src/gen.py | 3 +++ src/prompter.py | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/gen.py b/src/gen.py index 8d840ac46..975d12067 100644 --- a/src/gen.py +++ b/src/gen.py @@ -3971,6 +3971,9 @@ def evaluate( # prefer use input from API over model state prompt_type = prompt_type or chosen_model_state['prompt_type'] prompt_dict = prompt_dict or chosen_model_state['prompt_dict'] + if prompt_type == unknown_prompt_type and chosen_model_state['prompt_type'] not in [None, '', unknown_prompt_type]: + prompt_type = chosen_model_state['prompt_type'] + prompt_dict = chosen_model_state['prompt_dict'] if base_model is None and not no_llm_ok: raise AssertionError(no_model_msg) diff --git a/src/prompter.py b/src/prompter.py index ab6fd9503..944177e0e 100644 --- a/src/prompter.py +++ b/src/prompter.py @@ -575,12 +575,12 @@ def get_prompt(prompt_type, prompt_dict, context, reduced, making_context, retur chat_turn_sep = eos elif prompt_type in [PromptType.danube.value, str(PromptType.danube.value), PromptType.danube.name]: - can_handle_system_prompt = True # so not part of pre-conversation + can_handle_system_prompt = False # so uses pre-conversation prompt_tokens = "<|prompt|>" answer_tokens = "<|answer|>" if system_prompt in [None, 'None', 'auto']: - system_prompt = "I am H2O-Danube, a conversational chat assistant developed by H2O.ai." - promptA = promptB = system_prompt if not reduced else '' + system_prompt = "" + promptA = promptB = '' PreInstruct = prompt_tokens PreInput = None PreResponse = answer_tokens From 91a3d48300f27f48a9d6e3de7137a0c84fdbc7f9 Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Wed, 10 Apr 2024 10:54:26 -0700 Subject: [PATCH 35/43] Update JSON faq --- docs/FAQ.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index ceb48cea3..7502b7ab4 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -2,12 +2,13 @@ ### JSON mode and other Guided Generations for vLLM >= 0.4.0 -- [x] Can pass in `response_format='json_object'` at CLI or API or UI to get json. Works for most models even if don't support json mode directly, except smaller models like 1.8B Danube (many mistakes) or Google Gemma (one character mistakes). +- [x] Can pass in `response_format=json_object` at CLI or API or UI to get json with best effort for each model type. +- [x] Can pass in `response_format=json_code` at CLI or API or UI to get json via code block extraction and special prompting. Works for most models even if don't support json mode directly, except smaller models like 1.8B Danube (many mistakes) or Google Gemma (one character mistakes). - [x] Can pass `guided_json` to specify the schema that should be a spec form with type and properties. The actual json spec is inside properties. See [vLLM guide](https://github.com/vllm-project/vllm/blob/c64cf38673780544087af5ad5d3baf879a29220b/tests/entrypoints/test_openai_server.py#L28-L73). -- [x] If pass guided_json for vLLM >=0.4.0 instances, then strictly follows format including keys, types, etc. +- [x] If pass `guided_json` for vLLM >=0.4.0 and Anthropic Claude-3 instances (soon Google, OpenAI, MistralAI), then strictly follows format including keys, types, etc. - [x] Can pass separately guided_regex, guided_choice, guided_grammar for similar control. These only work for vLLM >= 0.4.0. -- [x] Handle old vLLM and other models that do not have json mode by using code blocks. Only small models like Danube and Google Gemma have issues. -- [x] Handle mistral and openai directly for json mode +- [x] Handle old vLLM and other models that do not have json mode by using `json_code` mode effectively. +- [x] When making JSON without guided_json schema, handle MistralAI and OpenAI directly using their JSON mode. h2oGPT in general uses guided_json like defined below to tell LLM the schema as part of prompt, unless vLLM >= 0.4.0 when this is provided directly to vLLM. Schemas like `guided_json` are not required for JSON mode, but to follow some schema it is required, and only vLLM >= 0.4.0 will strictly follow the schema due to guided generation using outlines package. From c37afe0397f92606e4ea600a0a1ba8fd57920fab Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Wed, 10 Apr 2024 12:55:10 -0700 Subject: [PATCH 36/43] Do not force hf_transfer when explicitly set to false --- src/gen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gen.py b/src/gen.py index 975d12067..6ce158471 100644 --- a/src/gen.py +++ b/src/gen.py @@ -33,7 +33,7 @@ except (PackageNotFoundError, AssertionError): have_hf_transfer = False -if have_hf_transfer and not os.getenv('HF_HUB_ENABLE_HF_TRANSFER', 'None') == '0': +if have_hf_transfer and os.getenv('HF_HUB_ENABLE_HF_TRANSFER', 'None') != '0': os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1' os.environ['SCARF_NO_ANALYTICS'] = 'true' From 3cd479e66027caf290b486b18a6d645686b5e295 Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Mon, 15 Apr 2024 18:57:53 -0700 Subject: [PATCH 37/43] Fix test --- src/version.py | 2 +- tests/test_client_calls.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/version.py b/src/version.py index b24bc59ff..fb52ab8b5 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "824df94ddc650586754c1a05e51527e48c4e64db" +__version__ = "d16056de07e7eb7ac6f228090db2d5121300ffb3" diff --git a/tests/test_client_calls.py b/tests/test_client_calls.py index d708f4e5e..1baad31db 100644 --- a/tests/test_client_calls.py +++ b/tests/test_client_calls.py @@ -5125,7 +5125,7 @@ def test_guided_json(langchain_action, langchain_mode, response_format, base_mod prompt_query=prompt if not use_instruction else '', prompt_summary=prompt if not use_instruction else '', visible_models=base_model, - text_context_list=[] if langchain_action == LangChainAction.QUERY else [ + text_context_list=[] if langchain_action == LangChainAction.QUERY.value else [ 'Henry is a good AI scientist.'], stream_output=False, langchain_mode=langchain_mode, From a3dd61210bc3c06cdb3778521a7e33ca7965ac69 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Thu, 18 Apr 2024 23:56:53 -0700 Subject: [PATCH 38/43] Fix constraint file --- reqs_optional/reqs_constraints.txt | 10 ++-------- src/version.py | 2 +- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/reqs_optional/reqs_constraints.txt b/reqs_optional/reqs_constraints.txt index 502a15a3e..207ec2206 100644 --- a/reqs_optional/reqs_constraints.txt +++ b/reqs_optional/reqs_constraints.txt @@ -1,13 +1,7 @@ # ensure doesn't drift, e.g. Issue #1348 torch==2.2.1 -<<<<<<< HEAD -gradio==3.50.2 -gradio_client==0.6.1 -#gradio==4.26.0 -#gradio_client==0.15.1 -transformers==4.39.2 -======= +# gradio==3.50.2 +# gradio_client==0.6.1 gradio==4.26.0 gradio_client==0.15.1 transformers==4.40.0 ->>>>>>> main diff --git a/src/version.py b/src/version.py index b0e877e0a..da2070867 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "5ce9b5f18f13a69d028643fa54a2a9b10c3a8705" +__version__ = "5144ac2612e5da28d4b0749fa68049b7ba87e096" From 1dc91bd6ced0439af2805b8ac776d3fc509f7b68 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Thu, 18 Apr 2024 23:57:10 -0700 Subject: [PATCH 39/43] Fix constraint file 2 --- reqs_optional/reqs_constraints.txt | 8 ++++---- src/version.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/reqs_optional/reqs_constraints.txt b/reqs_optional/reqs_constraints.txt index 207ec2206..2869f0413 100644 --- a/reqs_optional/reqs_constraints.txt +++ b/reqs_optional/reqs_constraints.txt @@ -1,7 +1,7 @@ # ensure doesn't drift, e.g. Issue #1348 torch==2.2.1 -# gradio==3.50.2 -# gradio_client==0.6.1 -gradio==4.26.0 -gradio_client==0.15.1 +gradio==3.50.2 +gradio_client==0.6.1 +# gradio==4.26.0 +# gradio_client==0.15.1 transformers==4.40.0 diff --git a/src/version.py b/src/version.py index da2070867..3b990986b 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "5144ac2612e5da28d4b0749fa68049b7ba87e096" +__version__ = "a3dd61210bc3c06cdb3778521a7e33ca7965ac69" From 7fa5d4f998a3cb1dd76b98a3bfffac4e5593e597 Mon Sep 17 00:00:00 2001 From: "Jonathan C. 
McKinney" Date: Mon, 29 Apr 2024 10:43:26 -0700 Subject: [PATCH 40/43] More for https://github.com/h2oai/h2ogpt/issues/1587 --- docs/FAQ.md | 6 ++++++ docs/README_offline.md | 20 ++++++++++---------- src/gen.py | 3 +++ src/version.py | 2 +- 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index 522834443..ceea1c552 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -58,6 +58,12 @@ python generate.py --base_model=llama --model_path_llama=https://huggingface.co/ ``` and one should at least pass `max_seq_len` as well. This ensures accurate prompting using the Meta chat template. Note the download link just comes from picking the model in the model card's files section and clicking the up arrow then when the download file link is provided you can right click and copy that link. HF keep changing how they present the download file, so adapt as required. +To use offline, then do: +```bash +python generate.py --base_model=llama --model_path_llama=Meta-Llama-3-8B-Instruct.Q5_K_M.gguf --tokenizer_base_model=meta-llama/Meta-Llama-3-8B-Instruct --max_seq_len=8192 --gradio_offline_level=2 --share=False --add_disk_models_to_ui=False +``` +which assumes the model was downloaded to default location of `llamacpp_path`. This works for offline if previously used the earlier command that got the tokenizer. + ### Mixtral AWQ In our testing, most AWQ Mixtral builds are bad, e.g. `TheBloke/dolphin-2.7-mixtral-8x7b-AWQ` and `TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ`, generating repeats with RAG or no output at all. We only found one that [works well](https://huggingface.co/casperhansen/mixtral-instruct-awq). The vLLM options to run are: diff --git a/docs/README_offline.md b/docs/README_offline.md index f5389cad3..d870cbb55 100644 --- a/docs/README_offline.md +++ b/docs/README_offline.md @@ -10,14 +10,14 @@ To run offline, either do smart or manual way. E.g. ```bash # online do: -python generate.py --base_model=TheBloke/zephyr-7B-beta-GGUF --prompt_type=zephyr --max_seq_len=4096 +python generate.py --base_model=TheBloke/zephyr-7B-beta-GGUF --prompt_type=zephyr --max_seq_len=4096 --add_disk_models_to_ui=False # Then use h2oGPT as might normally for any tasks. # Once offline do: -TRANSFORMERS_OFFLINE=1 python generate.py --base_model=zephyr-7b-beta.Q5_K_M.gguf --prompt_type=zephyr --gradio_offline_level=2 --share=False +TRANSFORMERS_OFFLINE=1 python generate.py --base_model=zephyr-7b-beta.Q5_K_M.gguf --prompt_type=zephyr --gradio_offline_level=2 --share=False --add_disk_models_to_ui=False # or: -TRANSFORMERS_OFFLINE=1 python generate.py --base_model=llama --model_path_llama=zephyr-7b-beta.Q5_K_M.gguf --prompt_type=zephyr --gradio_offline_level=2 --share=False +TRANSFORMERS_OFFLINE=1 python generate.py --base_model=llama --model_path_llama=zephyr-7b-beta.Q5_K_M.gguf --prompt_type=zephyr --gradio_offline_level=2 --share=False --add_disk_models_to_ui=False # or if choosing in UI do (be sure to choose correct prompt_type too): -TRANSFORMERS_OFFLINE=1 python generate.py --gradio_offline_level=2 --share=False +TRANSFORMERS_OFFLINE=1 python generate.py --gradio_offline_level=2 --share=False --add_disk_models_to_ui=False ``` * Manual Download @@ -29,11 +29,11 @@ TRANSFORMERS_OFFLINE=1 python generate.py --gradio_offline_level=2 --share=False wget https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF/resolve/main/zephyr-7b-beta.Q5_K_M.gguf?download=true -O llamacpp_path/zephyr-7b-beta.Q5_K_M.gguf # Then use normally for any tasks one expects to do offline. 
# Once offline do: -TRANSFORMERS_OFFLINE=1 python generate.py --base_model=zephyr-7b-beta.Q5_K_M.gguf --prompt_type=zephyr --gradio_offline_level=2 --share=False +TRANSFORMERS_OFFLINE=1 python generate.py --base_model=zephyr-7b-beta.Q5_K_M.gguf --prompt_type=zephyr --gradio_offline_level=2 --share=False --add_disk_models_to_ui=False # or: -TRANSFORMERS_OFFLINE=1 python generate.py --base_model=llama --model_path_llama=zephyr-7b-beta.Q5_K_M.gguf --prompt_type=zephyr --gradio_offline_level=2 --share=False +TRANSFORMERS_OFFLINE=1 python generate.py --base_model=llama --model_path_llama=zephyr-7b-beta.Q5_K_M.gguf --prompt_type=zephyr --gradio_offline_level=2 --share=False --add_disk_models_to_ui=False # or if choosing in UI do (be sure to choose correct prompt_type too): -TRANSFORMERS_OFFLINE=1 python generate.py --gradio_offline_level=2 --share=False +TRANSFORMERS_OFFLINE=1 python generate.py --gradio_offline_level=2 --share=False --add_disk_models_to_ui=False ``` NOTE: If set `--prepare_offline_level=2` for first online call, h2oGPT will get standard models for offline use, but that may be more than you require. You can tune the code `../src/prepare_offline.py` to get only the models you require. @@ -55,7 +55,7 @@ If you can run on same (or better) system that will be like that in offline mode * `~/.cache/selenium/` * `~/nltk_data/` ```bash -python generate.py --score_model=None --gradio_size=small --model_lock="[{'base_model': 'h2oai/h2ogpt-4096-llama2-7b-chat'}]" --save_dir=save_fastup_chat --prepare_offline_level=2 +python generate.py --score_model=None --gradio_size=small --model_lock="[{'base_model': 'h2oai/h2ogpt-4096-llama2-7b-chat'}]" --save_dir=save_fastup_chat --prepare_offline_level=2 --add_disk_models_to_ui=False # below are already in docker python -m nltk.downloader all playwright install --with-deps @@ -162,7 +162,7 @@ Otherwise, vLLM will try to contact Hugging Face servers. You can also do same for h2oGPT, but take note that if you pass absolute path for base model, you have to specify the `--prompt_type`. ```bash -python generate.py --inference_server="vllm:0.0.0.0:5000" --base_model='$HOME/.cache/huggingface/hub/models--meta-llama--Llama-2-13b-chat-hf/snapshots/c2f3ec81aac798ae26dcc57799a994dfbf521496' --score_model=None --langchain_mode='UserData' --user_path=user_path --use_auth_token=True --max_seq_len=4096 --max_max_new_tokens=2048 --concurrency_count=64 --batch_size=16 --prompt_type=llama2 +python generate.py --inference_server="vllm:0.0.0.0:5000" --base_model='$HOME/.cache/huggingface/hub/models--meta-llama--Llama-2-13b-chat-hf/snapshots/c2f3ec81aac798ae26dcc57799a994dfbf521496' --score_model=None --langchain_mode='UserData' --user_path=user_path --use_auth_token=True --max_seq_len=4096 --max_max_new_tokens=2048 --concurrency_count=64 --batch_size=16 --prompt_type=llama2 --add_disk_models_to_ui=False ``` ### Disable access or port @@ -187,6 +187,6 @@ This is automatically done if using `linux_install.sh` or `linux_install_full.sh To avoid h2oGPT monitoring which elements are clicked in UI, set the ENV `H2OGPT_ENABLE_HEAP_ANALYTICS=False` or pass ```bash -python generate.py --enable-heap-analytics=False +python generate.py --enable-heap-analytics=False ... ``` Note that no data or user inputs are included, only raw svelte UI element IDs and nothing from the user inputs or data. 
diff --git a/src/gen.py b/src/gen.py index bc58bc58c..b2a92ad58 100644 --- a/src/gen.py +++ b/src/gen.py @@ -6506,6 +6506,9 @@ def get_on_disk_models(llamacpp_path, use_auth_token, trust_remote_code): text_hf_models.append(x) except Exception as e: print("No loading model %s because %s" % (x, str(e))) + if 'Checkout your internet connection' in str(e): + # do not continue if no internet + break print("End auto-detect HF cache text generation models", flush=True) print("Begin auto-detect llama.cpp models", flush=True) diff --git a/src/version.py b/src/version.py index 9b9e57a16..e8e8a0ccd 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "efca16c684c60af2aae3a64d491a3d5521d35ee5" +__version__ = "22c259fc47f31988d51ee01aeb5ecc8937561d63" From bc2a1c3fbbe95b8075b3a2c4fc19ceaf169c7590 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Mon, 20 May 2024 09:36:18 -0700 Subject: [PATCH 41/43] Fix --- reqs_optional/reqs_constraints.txt | 6 ------ src/version.py | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/reqs_optional/reqs_constraints.txt b/reqs_optional/reqs_constraints.txt index 7298ef09c..53b9efbcb 100644 --- a/reqs_optional/reqs_constraints.txt +++ b/reqs_optional/reqs_constraints.txt @@ -1,13 +1,7 @@ # ensure doesn't drift, e.g. Issue #1348 torch==2.2.1 -<<<<<<< HEAD gradio==3.50.2 gradio_client==0.6.1 # gradio==4.26.0 # gradio_client==0.15.1 -transformers==4.40.0 -======= -gradio==4.26.0 -gradio_client==0.15.1 transformers==4.41.0 ->>>>>>> main diff --git a/src/version.py b/src/version.py index 7c5a8dac5..034bd3eb9 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "fdbbe49b227f9e7bada9d2b9eef99a11b6639da3" +__version__ = "3f8c3d42873e39854de4df934ae24e3d5b8179af" From e283159a08e7132fefcb8a7059effdfbf567eb85 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Mon, 27 May 2024 19:39:12 -0700 Subject: [PATCH 42/43] Resolve old --- docs/FAQ.md | 9 --------- reqs_optional/requirements_optional_langchain.txt | 6 +----- src/version.py | 2 +- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index 3ea1277b7..eefdc74c6 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -107,21 +107,12 @@ and you should at least pass `max_seq_len` as well. This ensures accurate promp To use offline, then do: ```bash -<<<<<<< HEAD -python generate.py --base_model=llama --model_path_llama=Meta-Llama-3-8B-Instruct.Q5_K_M.gguf --tokenizer_base_model=meta-llama/Meta-Llama-3-8B-Instruct --max_seq_len=8192 --gradio_offline_level=2 --share=False --add_disk_models_to_ui=False -``` -which assumes the model was downloaded to default location of `llamacpp_path`. This works for offline if previously used the earlier command that got the tokenizer. - -======= TRANSFORMERS_OFFLINE=1 python generate.py --base_model=llama --model_path_llama=Meta-Llama-3-8B-Instruct.Q5_K_M.gguf --tokenizer_base_model=meta-llama/Meta-Llama-3-8B-Instruct --max_seq_len=8192 --gradio_offline_level=2 --share=False --add_disk_models_to_ui=False ``` which assumes the model was downloaded to default location of `llamacpp_path`. This works for offline if previously used the earlier command that got the tokenizer. Note the chat template is defined by the model card's [tokenizer_config.json](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct/blob/main/tokenizer_config.json#L2053). -<<<<<<< HEAD ->>>>>>> main -======= Also, `--base_model` accepts a few forms of passing urls, TheBloke, etc. for GGUF, but not others. 
For more general GGUF locations, you should specify the file or url download link explicitly. E.g. for Phi: ```bash python generate.py --tokenizer_base_model=microsoft/Phi-3-mini-4k-instruct --base_model=llama --llama_cpp_model=https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf --max_seq_len=4096 diff --git a/reqs_optional/requirements_optional_langchain.txt b/reqs_optional/requirements_optional_langchain.txt index 77c757bf7..6316fd918 100644 --- a/reqs_optional/requirements_optional_langchain.txt +++ b/reqs_optional/requirements_optional_langchain.txt @@ -92,11 +92,7 @@ weaviate-client==3.26.2 # vllm==0.2.2 # only gradio>=4 -<<<<<<< HEAD -#gradio_pdf>=0.0.4 -======= -gradio_pdf>=0.0.7 ->>>>>>> main +#gradio_pdf>=0.0.7 gradio_tools>=0.0.9 diff --git a/src/version.py b/src/version.py index 06a74e808..27e75a545 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "357108ac651562325f2c903b164b2a4f46fe7067" +__version__ = "1805f106cb90e5833613b2da6f4024488c0c3eb4" From 5fe7db602bae18c110c937f014f3d237aa1eea05 Mon Sep 17 00:00:00 2001 From: "Jonathan C. McKinney" Date: Mon, 27 May 2024 19:39:53 -0700 Subject: [PATCH 43/43] Resolve old --- docs/FAQ.md | 1 - src/version.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/FAQ.md b/docs/FAQ.md index eefdc74c6..cb769619d 100644 --- a/docs/FAQ.md +++ b/docs/FAQ.md @@ -118,7 +118,6 @@ Also, `--base_model` accepts a few forms of passing urls, TheBloke, etc. for GGU python generate.py --tokenizer_base_model=microsoft/Phi-3-mini-4k-instruct --base_model=llama --llama_cpp_model=https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf --max_seq_len=4096 ``` ->>>>>>> main ### Mixtral AWQ In our testing, most AWQ Mixtral builds are bad, e.g. `TheBloke/dolphin-2.7-mixtral-8x7b-AWQ` and `TheBloke/Mixtral-8x7B-Instruct-v0.1-AWQ`, generating repeats with RAG or no output at all. We only found one that [works well](https://huggingface.co/casperhansen/mixtral-instruct-awq). The vLLM options to run are: diff --git a/src/version.py b/src/version.py index 27e75a545..b4ec77d0c 100644 --- a/src/version.py +++ b/src/version.py @@ -1 +1 @@ -__version__ = "1805f106cb90e5833613b2da6f4024488c0c3eb4" +__version__ = "e283159a08e7132fefcb8a7059effdfbf567eb85"