In [None]:
! pip install datasets evaluate gradio pillow transformers torch --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h

### Text Generation!

In [None]:
import torch
from transformers import pipeline
from PIL import Image
import gradio as gr

# Name the checkpoint and revision explicitly rather than relying on the task
# default: transformers warns that using a pipeline without a model name and
# revision is not recommended. These are the values the default resolved to.
generator = pipeline(
    "text-generation",
    model="openai-community/gpt2",
    revision="607a30d",
)
generator("C++ is ")

No model was supplied, defaulted to openai-community/gpt2 and revision 607a30d (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'C++ is \xa0the only language that supports Python and JavaScript syntax.\nHere we see that a simple string is an iterator over a list of strings. In Python, the iterator is called a tuple and the strings are wrapped in the iterator. In JavaScript, it\'s called a string and the strings are wrapped into the iterator. It doesn\'t matter if the iterator has a single or double index.\nIn this example, a string is wrapped in a tuple. In Python, it has a value, a tuple, and an iterator.\nHere\'s the function that takes a string and a string of its form:\ndef __iter__(self, value): return String.wrap(value)\nAs you can see, the list of strings is not immutable. You can use the iterator for any function or class.\nThis pattern is actually quite complex. The iterator is the only way to create a string. It\'s very simple. It\'s called a tuple. It\'s also called a string.\nIf you look at the code, it looks like this:\ndef __iter__(self, value):... if self.__iter__() { return "

#### Do more with pipeline - explore yourself!

In [None]:
from transformers import pipeline

# Build the text-generation pipeline with the checkpoint named explicitly.
generator = pipeline(task="text-generation", model="gpt2")

# Generation options are passed at call time: allow at most 5 new tokens per
# continuation and request 2 separate continuations of the prompt.
generator("C++ is", max_new_tokens=5, num_return_sequences=2)

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'C++ is a tool for the compiler'},
 {'generated_text': 'C++ is still in beta, and'}]

**How to experiment.**


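There are two places where you can pass options: when creating the [pipeline](https://github.com/huggingface/transformers/blob/v4.56.2/src/transformers/pipelines/__init__.py#L637) (for example, `model`), and when calling it to generate the actual output (text, image, etc.), for example `max_new_tokens` or `num_return_sequences`.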


Exercise: find the list of tasks that `pipeline` supports, then experiment with each of them yourself.

### Images and pipeline

In [None]:
from transformers import pipeline

# Image classification with a ViT checkpoint.
image_classifier = pipeline(
    task="image-classification", model="google/vit-base-patch16-224"
)

# The classifier is called with an image URL here.
result = image_classifier(
    "https://media.istockphoto.com/id/1443562748/photo/cute-ginger-cat.jpg?s=1024x1024&w=is&k=20&c=QaEkKC7lFEBrzzPftMRBVuOZq4FNOnUjOV1VqTmpMfY="
)

# End the cell on the bare expression (as the other cells do) so the notebook
# renders the list of {'label', 'score'} dicts as the cell's output value.
result

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cpu


[{'label': 'Egyptian cat', 'score': 0.7064002752304077}, {'label': 'tabby, tabby cat', 'score': 0.126403346657753}, {'label': 'tiger cat', 'score': 0.11530809849500656}, {'label': 'Persian cat', 'score': 0.005241439677774906}, {'label': 'lynx, catamount', 'score': 0.003390116384252906}]


### Multimodal example

In [None]:
# Visual question answering: the pipeline is given an image and a question.
vqa_pipeline = pipeline(
    task="visual-question-answering",
    model="Salesforce/blip-vqa-capfilt-large",
)

# NOTE: "baby_goat.jpg" is a relative path, so the file has to be present in
# the kernel's working directory before this cell is run.
question = "Is there an elephant?"
image = Image.open("baby_goat.jpg")

# top_k=1 asks for only the top answer.
vqa_pipeline(image, question, top_k=1)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Device set to use cpu


[{'answer': 'no'}]

In [None]:
def answer_question(image, question):
    """Answer a free-text question about an image using ``vqa_pipeline``.

    Args:
        image: The image to ask about, or ``None`` if no image was supplied.
        question: The question text. May be ``None``, empty, or whitespace
            only, all of which are treated as "no question".

    Returns:
        str: The ``"answer"`` field of the first pipeline result, or a
        message asking for both inputs when either one is missing.
    """
    # Check for falsiness before calling .strip(): a missing question may
    # arrive as None rather than "", and None.strip() raises AttributeError.
    if image is None or not question or not question.strip():
        return "Please provide an image and a question."
    outputs = vqa_pipeline(image, question, top_k=1)
    return outputs[0]["answer"]


# Build the Gradio UI. Components are created inside nested context managers,
# so the order of the statements below determines the page layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🖼️ Visual Question Answering with BLIP")
    with gr.Row():
        # First column: the two inputs and the submit button.
        with gr.Column():
            # type="pil": the callback receives the upload as a PIL image.
            image_input = gr.Image(type="pil", label="Upload an Image")
            question_input = gr.Textbox(label="Enter your question")
            submit_btn = gr.Button("Get Answer")
        # Second column: the answer text box.
        with gr.Column():
            output_text = gr.Textbox(label="Answer")

    # Clicking the button calls answer_question(image, question) and writes
    # the returned string into the answer box.
    submit_btn.click(
        fn=answer_question,
        inputs=[image_input, question_input],
        outputs=output_text
    )

# Launch the app. With debug=True this cell keeps running so that errors and
# logs are shown in the cell output; interrupt the kernel to stop the server.
demo.launch(debug=True)


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://c5ac80b6366bc98728.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://c5ac80b6366bc98728.gradio.live


