From f51877fdc9ad5fe157de94f33d5dd05969e3a573 Mon Sep 17 00:00:00 2001 From: Adam Czyzewski Date: Tue, 9 Jul 2024 15:23:08 +0200 Subject: [PATCH] Add PaliGemma integration with Mesop. --- PaliGemma/Integrate_PaliGemma_with_Mesop.ipynb | 1 + README.md | 1 + 2 files changed, 2 insertions(+) create mode 100644 PaliGemma/Integrate_PaliGemma_with_Mesop.ipynb diff --git a/PaliGemma/Integrate_PaliGemma_with_Mesop.ipynb b/PaliGemma/Integrate_PaliGemma_with_Mesop.ipynb new file mode 100644 index 0000000..6b380d2 --- /dev/null +++ b/PaliGemma/Integrate_PaliGemma_with_Mesop.ipynb @@ -0,0 +1 @@ +{"cells":[{"cell_type":"markdown","metadata":{"id":"Tce3stUlHN0L"},"source":["##### Copyright 2024 Google LLC."]},{"cell_type":"code","execution_count":1,"metadata":{"cellView":"form","executionInfo":{"elapsed":2,"status":"ok","timestamp":1720489863172,"user":{"displayName":"Wei Wei","userId":"14738664724909855261"},"user_tz":-480},"id":"tuOe1ymfHZPu","tags":[]},"outputs":[],"source":["# @title Licensed under the Apache License, Version 2.0 (the \"License\");\n","# you may not use this file except in compliance with the License.\n","# You may obtain a copy of the License at\n","#\n","# https://www.apache.org/licenses/LICENSE-2.0\n","#\n","# Unless required by applicable law or agreed to in writing, software\n","# distributed under the License is distributed on an \"AS IS\" BASIS,\n","# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n","# See the License for the specific language governing permissions and\n","# limitations under the License."]},{"cell_type":"markdown","metadata":{"id":"dfsDR_omdNea"},"source":["# Integrating PaliGemma with Mesop\n","This notebook demonstrates how to use [PaliGemma](https://ai.google.dev/gemma/docs/paligemma) models with [Mesop](https://google.github.io/mesop/) to create a simple GUI application.\n","\n"," \n","
\n"," Run in Google Colab\n","
"]},{"cell_type":"markdown","metadata":{"id":"FaqZItBdeokU","tags":[]},"source":["## Setup\n","\n","### Select the Colab runtime\n","To complete this tutorial, you'll need to have a Colab runtime with sufficient resources to run the Gemma model. In this case, you can use a T4 GPU:\n","\n","1. In the upper-right of the Colab window, select **▾ (Additional connection options)**.\n","2. Select **Change runtime type**.\n","3. Under **Hardware accelerator**, select **T4 GPU**.\n","\n","### Gemma setup\n","\n","To complete this tutorial, you'll first need to complete the setup instructions at [Gemma setup](https://ai.google.dev/gemma/docs/setup). The Gemma setup instructions show you how to do the following:\n","\n","* Get access to Gemma on kaggle.com.\n","* Select a Colab runtime with sufficient resources to run\n"," the Gemma 2B model.\n","* Generate and configure a Kaggle username and an API key as Colab secrets.\n","\n","After you've completed the Gemma setup, move on to the next section, where you'll set environment variables for your Colab environment.\n"]},{"cell_type":"markdown","metadata":{"id":"CY2kGtsyYpHF"},"source":["### Configure your credentials\n","\n","Add your your Kaggle credentials to the Colab Secrets manager to securely store it.\n","\n","1. Open your Google Colab notebook and click on the 🔑 Secrets tab in the left panel. \"The\n","2. Create new secrets: `KAGGLE_USERNAME` and `KAGGLE_KEY`\n","3. Copy/paste your username into `KAGGLE_USERNAME`\n","3. Copy/paste your key into `KAGGLE_KEY`\n","4. Toggle the buttons on the left to allow notebook access to the secrets.\n"]},{"cell_type":"code","execution_count":2,"metadata":{"executionInfo":{"elapsed":7582,"status":"ok","timestamp":1720489870752,"user":{"displayName":"Wei Wei","userId":"14738664724909855261"},"user_tz":-480},"id":"A9sUQ4WrP-Yr","tags":[]},"outputs":[],"source":["import os\n","from google.colab import userdata\n","\n","# Note: `userdata.get` is a Colab API. 
If you're not using Colab, set the env\n","# vars as appropriate for your system.\n","os.environ[\"KAGGLE_USERNAME\"] = userdata.get(\"KAGGLE_USERNAME\")\n","os.environ[\"KAGGLE_KEY\"] = userdata.get(\"KAGGLE_KEY\")"]},{"cell_type":"markdown","metadata":{"id":"iwjo5_Uucxkw"},"source":["### Install dependencies\n","Run the cell below to install all the required dependencies."]},{"cell_type":"code","execution_count":3,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":63739,"status":"ok","timestamp":1720489934489,"user":{"displayName":"Wei Wei","userId":"14738664724909855261"},"user_tz":-480},"id":"r_nXPEsF7UWQ","outputId":"aaad234e-b907-47ae-e995-b0f1c2757456","tags":[]},"outputs":[{"name":"stdout","output_type":"stream","text":["\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m571.8/571.8 kB\u001b[0m \u001b[31m37.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m347.7/347.7 kB\u001b[0m \u001b[31m26.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.2/5.2 MB\u001b[0m \u001b[31m46.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m590.6/590.6 MB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.3/5.3 MB\u001b[0m \u001b[31m69.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.2/2.2 MB\u001b[0m \u001b[31m73.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m69.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n","tf-keras 2.15.1 requires tensorflow<2.16,>=2.15, but you have tensorflow 2.16.2 which is incompatible.\u001b[0m\u001b[31m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.9/77.9 kB\u001b[0m \u001b[31m1.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.2/43.2 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Building wheel for ml_collections (setup.py) ... \u001b[?25l\u001b[?25hdone\n"]}],"source":["!pip install -q -U keras keras-nlp\n","!pip install -q overrides ml_collections \"einops~=0.7\" sentencepiece"]},{"cell_type":"markdown","metadata":{"id":"pOAEiJmnBE0D"},"source":["## Exploring prompting capabilities"]},{"cell_type":"markdown","metadata":{"id":"HdcJ0WgI_tb7"},"source":["### PaliGemma"]},{"cell_type":"markdown","metadata":{"id":"DfMHtnStiIh-"},"source":["PaliGemma is a lightweight open vision-language model (VLM) inspired by PaLI-3, and based on open components like the SigLIP vision model and the Gemma language model. 
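\n","\n","As a quick preview of the prompt syntax listed below, here is a minimal, hypothetical usage sketch (the only name taken from this notebook is the `pali_gemma_3b_mix_224` preset, which is the checkpoint loaded in the next section; the all-zeros image is purely a placeholder):\n","\n","```python\n","import keras_nlp\n","import numpy as np\n","\n","# Load the pretrained checkpoint (same preset as the next section; the\n","# weights are a several-GB download from Kaggle).\n","paligemma = keras_nlp.models.PaliGemmaCausalLM.from_preset(\"pali_gemma_3b_mix_224\")\n","\n","# Build a placeholder image and pair it with a task prompt from the syntax below.\n","image = np.zeros((224, 224, 3), dtype=\"uint8\")  # placeholder 224x224 RGB image\n","print(paligemma.generate(inputs={\"images\": image, \"prompts\": \"caption en\\n\"}))\n","```\n","\n","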
PaliGemma takes both images and text as inputs and can answer questions about images with detail and context. This means it can perform deeper analysis of images and provide useful insights, such as captioning images and short videos, detecting objects, and reading text embedded within images.\n","\n","Prompting:\n","\n","* `cap {lang}\\n`: Very raw short caption (from WebLI-alt)\n","* `caption {lang}\\n`: Nice, COCO-like short captions\n","* `describe {lang}\\n`: Somewhat longer, more descriptive captions\n","* `ocr`: Optical character recognition\n","* `answer en {question}\\n`: Question answering about the image contents\n","* `question {lang} {answer}\\n`: Question generation for a given answer\n","* `detect {object} ; {object}\\n`: Count objects in a scene and return the bounding boxes for the objects\n","* `segment {object}\\n`: Do image segmentation of the object in the scene"]},{"cell_type":"code","execution_count":4,"metadata":{"executionInfo":{"elapsed":3499,"status":"ok","timestamp":1720489937975,"user":{"displayName":"Wei Wei","userId":"14738664724909855261"},"user_tz":-480},"id":"twvbOLey_3tW"},"outputs":[],"source":["import os\n","import sys\n","import keras\n","import keras_nlp\n","\n","keras.config.set_floatx(\"bfloat16\")"]},{"cell_type":"code","execution_count":5,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":96001,"status":"ok","timestamp":1720490033973,"user":{"displayName":"Wei Wei","userId":"14738664724909855261"},"user_tz":-480},"id":"IuQlLU09_qb3","outputId":"d9d15715-fbb0-4c6a-800a-bb4f0f366811"},"outputs":[{"name":"stderr","output_type":"stream","text":["Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/model.safetensors...\n","Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/model.safetensors.index.json...\n","Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/metadata.json...\n","100%|██████████| 143/143 [00:00<00:00, 112kB/s]\n","Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/task.json...\n","Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/config.json...\n","100%|██████████| 861/861 [00:00<00:00, 1.19MB/s]\n","Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/model.safetensors...\n","Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/model.safetensors.index.json...\n","Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/model.weights.h5...\n","100%|██████████| 5.45G/5.45G [01:15<00:00, 77.7MB/s]\n","Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/model.safetensors...\n","Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/model.safetensors.index.json...\n","Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/preprocessor.json...\n","Downloading from https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/tokenizer.json...\n","100%|██████████| 410/410 [00:00<00:00, 471kB/s]\n","Downloading from 
https://www.kaggle.com/api/v1/models/keras/paligemma/keras/pali_gemma_3b_mix_224/1/download/assets/tokenizer/vocabulary.spm...\n","100%|██████████| 4.07M/4.07M [00:00<00:00, 15.8MB/s]\n"]}],"source":["# Load PaliGemma\n","paligemma = keras_nlp.models.PaliGemmaCausalLM.from_preset(\"pali_gemma_3b_mix_224\")"]},{"cell_type":"code","execution_count":6,"metadata":{"executionInfo":{"elapsed":1153,"status":"ok","timestamp":1720490035122,"user":{"displayName":"Wei Wei","userId":"14738664724909855261"},"user_tz":-480},"id":"ewNkh5lE-UAt"},"outputs":[],"source":["if not os.path.exists(\"big_vision_repo\"):\n","  !git clone --quiet --branch=main --depth=1 \\\n","    https://github.com/google-research/big_vision big_vision_repo\n","\n","# Append big_vision code to the Python import path.\n","if \"big_vision_repo\" not in sys.path:\n","  sys.path.append(\"big_vision_repo\")"]},{"cell_type":"code","execution_count":7,"metadata":{"executionInfo":{"elapsed":15,"status":"ok","timestamp":1720490035122,"user":{"displayName":"Wei Wei","userId":"14738664724909855261"},"user_tz":-480},"id":"ubRH-t77AJIx"},"outputs":[],"source":["import io\n","import re\n","import PIL\n","import requests\n","import numpy as np\n","from PIL import Image\n","import matplotlib.pyplot as plt\n","import matplotlib.patches as patches\n","import big_vision.evaluators.proj.paligemma.transfers.segmentation as segeval\n","\n","# Helpers:\n","\n","\n","def crop_and_resize(image, target_size):\n","  \"\"\"Center-crops the given image to a square and resizes it to the given shape.\"\"\"\n","  width, height = image.size\n","  source_size = min(image.size)\n","  left = width // 2 - source_size // 2\n","  top = height // 2 - source_size // 2\n","  right, bottom = left + source_size, top + source_size\n","  return image.resize(target_size, box=(left, top, right, bottom))\n","\n","\n","def read_image(url, target_size=(224, 224)):\n","  \"\"\"Loads an image from a URL and crops/resizes it to the target size.\"\"\"\n","  headers = {\"User-Agent\": \"My User Agent 1.0\"}\n","  contents = io.BytesIO(requests.get(url, headers=headers, stream=True).content)\n","  image = Image.open(contents)\n","  image = crop_and_resize(image, target_size)\n","  image = np.array(image)\n","\n","  # Remove the alpha channel if necessary.\n","  if image.shape[2] == 4:\n","    image = image[:, :, :3]\n","  return image\n","\n","\n","def parse_bbox_and_labels(detokenized_output: str):\n","  \"\"\"Parses model output to extract bounding boxes and labels.\"\"\"\n","  # Detection output encodes each box as four <loc0000>-style tokens\n","  # (y0, x0, y1, x1) followed by the object label.\n","  matches = re.finditer(\n","      \"<loc(?P<y0>\\\d\\\d\\\d\\\d)><loc(?P<x0>\\\d\\\d\\\d\\\d)><loc(?P<y1>\\\d\\\d\\\d\\\d)><loc(?P<x1>\\\d\\\d\\\d\\\d)>\"\n","      \" (?P<label>