## Pretrained VQA 
Vision-and-Language Transformer (ViLT), fine-tuned on VQAv2

Pretrained Model: https://huggingface.co/dandelin/vilt-b32-finetuned-vqa

In [1]:
import os
from functools import lru_cache
from io import BytesIO

import pandas as pd
import requests
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

## Code example

In [2]:
# Code Example

# prepare image + question
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly instead of handing an HTTP error page to PIL
image = Image.open(BytesIO(response.content))
text = "How many cats are there?"

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

# prepare inputs
encoding = processor(image, text, return_tensors="pt")

# forward pass (inference only, so gradient tracking is switched off)
with torch.no_grad():
    outputs = model(**encoding)
logits = outputs.logits
probabilities = logits.softmax(dim=-1)

# get the predicted class and its confidence
predicted_class = logits.argmax(-1).item()
confidence = probabilities[0, predicted_class].item()

print("Predicted answer:", model.config.id2label[predicted_class])
print("Confidence:", confidence)

Predicted answer: 2
Confidence: 0.9443891048431396


In [58]:
# Code Example

# prepare image + question
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly instead of handing an HTTP error page to PIL
image = Image.open(BytesIO(response.content))
text = "Are there more than two cats?"

# `processor` and `model` are loaded once by the first code example cell;
# reloading the same checkpoint for every question is redundant.

# prepare inputs
encoding = processor(image, text, return_tensors="pt")

# forward pass (inference only, so gradient tracking is switched off)
with torch.no_grad():
    outputs = model(**encoding)
logits = outputs.logits
probabilities = logits.softmax(dim=-1)

# get the predicted class and its confidence
predicted_class = logits.argmax(-1).item()
confidence = probabilities[0, predicted_class].item()

print("Predicted answer:", model.config.id2label[predicted_class])
print("Confidence:", confidence)

Predicted answer: no
Confidence: 0.9986134767532349


In [59]:
# Code Example

# prepare image + question
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, timeout=30)
response.raise_for_status()  # fail loudly instead of handing an HTTP error page to PIL
image = Image.open(BytesIO(response.content))
text = "How many dogs are there?"

# `processor` and `model` are loaded once by the first code example cell;
# reloading the same checkpoint for every question is redundant.

# prepare inputs
encoding = processor(image, text, return_tensors="pt")

# forward pass (inference only, so gradient tracking is switched off)
with torch.no_grad():
    outputs = model(**encoding)
logits = outputs.logits
probabilities = logits.softmax(dim=-1)

# get the predicted class and its confidence
predicted_class = logits.argmax(-1).item()
confidence = probabilities[0, predicted_class].item()

print("Predicted answer:", model.config.id2label[predicted_class])
print("Confidence:", confidence)

Predicted answer: 0
Confidence: 0.9978933930397034


![image.png](attachment:image.png)
_____________
## Experiment

In [3]:
# slugging
def build_path(category:str, number_str:str, name:str):
    """Return the relative image path ``images/<category>/<number_str>_<slug>.jpg``.

    Parameters
    ----------
    category : str
        Sub-folder below ``images``; it is lower-cased.
    number_str : str
        File-name prefix, used as given (e.g. ``"07"``).
    name : str
        Landmark name; it is lower-cased and slugged (see the table below).

    Returns
    -------
    str
        Path joined with ``os.path.join``, so the separator follows the host OS.
    """
    # single-character substitutions applied to the lower-cased name
    slug_table = str.maketrans({
        " ": "_",
        "ä": "ae",
        "ö": "oe",
        "ü": "ue",
        "'": "_",
        "-": "_",
        ".": "",
    })
    slug = name.lower().translate(slug_table)
    file_name = "_".join((number_str, slug)) + ".jpg"
    return os.path.join("images", category.lower(), file_name)

In [4]:
# test: mixed-case input containing an apostrophe and an umlaut
print(
    build_path("SWITZERLand", "19", "Jet d'EAU"),
    build_path("SWITZERLand", "07", "KAPELLBRÜCKE"),
    sep="\n",
)

images\switzerland\19_jet_d_eau.jpg
images\switzerland\07_kapellbruecke.jpg


In [5]:
VILT_CHECKPOINT = "dandelin/vilt-b32-finetuned-vqa"


@lru_cache(maxsize=1)
def _load_vilt():
    """Load the ViLT processor and model once; later calls reuse the cached pair."""
    processor = ViltProcessor.from_pretrained(VILT_CHECKPOINT)
    model = ViltForQuestionAnswering.from_pretrained(VILT_CHECKPOINT)
    return processor, model


def vilt_vqa(url_or_path:str, question:str, online:bool):
    """Answer `question` about a single image with the fine-tuned ViLT VQA model.

    Parameters
    ----------
    url_or_path : str
        HTTP(S) URL when `online` is true, otherwise a local file path.
    question : str
        Natural-language question about the image.
    online : bool
        True to download the image, False to read it from disk.

    Returns
    -------
    tuple
        ``(answer, confidence)``: the highest-scoring label from
        ``model.config.id2label`` and its softmax probability rounded to 4 decimals.

    Raises
    ------
    requests.HTTPError
        If the download returns an error status (online mode only).
    """
    if online:
        response = requests.get(url_or_path, timeout=30)
        response.raise_for_status()  # do not hand an HTTP error page to PIL
        image_source = BytesIO(response.content)
    else:
        image_source = url_or_path

    # convert() forces the pixel data to load, so the file can be closed right after;
    # RGB keeps greyscale / palette / alpha images compatible with the processor
    with Image.open(image_source) as raw_image:
        image = raw_image.convert("RGB")

    # cached: reloading the checkpoint for every image dominated the runtime
    processor, model = _load_vilt()

    # prepare inputs
    encoding = processor(image, question, return_tensors="pt")

    # forward pass (inference only, so gradient tracking is switched off)
    with torch.no_grad():
        outputs = model(**encoding)
    logits = outputs.logits
    probabilities = logits.softmax(dim=-1)

    # get the predicted answer and its confidence
    predicted_class = logits.argmax(-1).item()
    answer = model.config.id2label[predicted_class]
    confidence = round(probabilities[0, predicted_class].item(), 4)
    return answer, confidence

In [6]:
# test: run one local image through the full pipeline.
# build_path lower-cases its inputs, so the mixed-case arguments resolve to the normal file name.
test_path = build_path("SWITZERLand", "02", "BundesHAUS")
test_answer = vilt_vqa(test_path, "Is there text on the building?", False)  # online=False: read from disk
# vilt_vqa returns an (answer, confidence) tuple; it is the last expression so the cell displays it
test_answer

('yes', 0.4986)

In [8]:
# Landmark names per region. The position in each list (starting at 1) is the
# two-digit prefix of the image file name, so the order must match the files on disk.
switzerland_list = ["Matterhorn", "Bundeshaus", "Chateau de Chillon", "Creux du Van", "Bern Old City",
                    "Jungfraujoch", "Kapellbrücke", "Ascona", "Grindelwald", "Chateau de Gruyeres",
                    "Aletsch Glacier", "Lauterbrunnen", "Rheinfall", "Zermatt", "Lion Monument",
                    "Schilthorn", "Lenzburg Castle", "Grossmünster","Jet d'Eau Lake Geneva", "Zytglogge"]

international_list = ["Eiffel Tower", "Uluru", "Louvre", "Lady Liberty", "Colosseum", "Niagara Falls",
                      "Disney World", "Forbidden City", "Golden Gate Bridge", "Times Square", "Notre Dame",
                      "Sagrada Familia", "Neuschwanstein Castle", "Sydney Opera House", "Great Wall",
                      "Florence Cathedral", "Mount Rushmore", "Brooklyn Bridge", "Grand Canyon", "Las Vegas Strip"]

grisons_list = ["Arosa", "Bergün", "Lai da Palpuogna", "samedan", "S-chanf", "Landwasser Viaduct",
                "Brusio Spiral Viaduct", "Chur", "Lake Silvaplana", "St. Moritz", "Lake Cauma",
                "White Turf", "Guarda", "Davos Schatzalp", "Ruinaulta", "Maienfeld",
                "Muottas Muragl", "Oberengadin", "San Gian", "Corviglia Piz Nair"]


def _build_region_paths(category:str, names:list):
    """Map every landmark name to its image path, numbering the files from 01 in list order."""
    return {
        name: build_path(category, f"{position:02d}", name)
        for position, name in enumerate(names, start=1)
    }


switzerland_paths = _build_region_paths("switzerland", switzerland_list)
international_paths = _build_region_paths("international", international_list)
grisons_paths = _build_region_paths("grisons", grisons_list)

In [9]:
# test: display the generated name -> path mapping to check the slugs and numbering
switzerland_paths

{'Matterhorn': 'images\\switzerland\\01_matterhorn.jpg',
 'Bundeshaus': 'images\\switzerland\\02_bundeshaus.jpg',
 'Chateau de Chillon': 'images\\switzerland\\03_chateau_de_chillon.jpg',
 'Creux du Van': 'images\\switzerland\\04_creux_du_van.jpg',
 'Bern Old City': 'images\\switzerland\\05_bern_old_city.jpg',
 'Jungfraujoch': 'images\\switzerland\\06_jungfraujoch.jpg',
 'Kapellbrücke': 'images\\switzerland\\07_kapellbruecke.jpg',
 'Ascona': 'images\\switzerland\\08_ascona.jpg',
 'Grindelwald': 'images\\switzerland\\09_grindelwald.jpg',
 'Chateau de Gruyeres': 'images\\switzerland\\10_chateau_de_gruyeres.jpg',
 'Aletsch Glacier': 'images\\switzerland\\11_aletsch_glacier.jpg',
 'Lauterbrunnen': 'images\\switzerland\\12_lauterbrunnen.jpg',
 'Rheinfall': 'images\\switzerland\\13_rheinfall.jpg',
 'Zermatt': 'images\\switzerland\\14_zermatt.jpg',
 'Lion Monument': 'images\\switzerland\\15_lion_monument.jpg',
 'Schilthorn': 'images\\switzerland\\16_schilthorn.jpg',
 'Lenzburg Castle': 'images

In [10]:
def ask_question_about_region(region_list:list, region_paths:dict, question:str):
    """Ask `question` about every landmark of one region.

    `region_list` gives the landmark names in the order to process them and
    `region_paths` maps each of those names to a local image path.

    Returns a list of ``(name, answer, confidence)`` tuples in `region_list` order.
    """
    def _ask(location):
        # online=False: the images are read from disk
        answer, confidence = vilt_vqa(region_paths[location], question, False)
        return location, answer, confidence

    return [_ask(location) for location in region_list]

In [11]:
def ask_question_about_all(question:str):
    """Ask `question` about all three regions, printing progress after each one.

    Returns the answer lists as a
    ``(switzerland_answers, international_answers, grisons_answers)`` tuple.
    """
    print(f"Predicting answers for the question '{question}'")

    # (landmark names, name -> path mapping, message printed once the region is done)
    region_runs = (
        (switzerland_list, switzerland_paths, "Switzerland completed"),
        (international_list, international_paths, "International completed"),
        (grisons_list, grisons_paths, "Grisons completed\n------------------------------------------------"),
    )

    collected = []
    for names, paths, progress_message in region_runs:
        collected.append(ask_question_about_region(names, paths, question))
        print(progress_message)

    switzerland_answers, international_answers, grisons_answers = collected
    return switzerland_answers, international_answers, grisons_answers

In [12]:
def build_df(question:str, switzerland_answers:list, international_answers:list, grisons_answers:list, write_csv:bool):
    """Combine the per-region answers for one question into a single DataFrame.

    Parameters
    ----------
    question : str
        The question that was asked; stored in the "Question" column and used
        to derive the CSV file name.
    switzerland_answers, international_answers, grisons_answers : list
        Lists of ``(location, answer, confidence)`` tuples, one list per region.
    write_csv : bool
        When true, the combined frame is also written to
        ``predictions/<question slug>.csv``.

    Returns
    -------
    pandas.DataFrame
        Columns "Question", "Location", "Answer", "Confidence", "Region";
        rows ordered Switzerland, International, Grisons with a fresh index.
    """
    answers_by_region = (
        ("Switzerland", switzerland_answers),
        ("International", international_answers),
        ("Grisons", grisons_answers),
    )

    frames = []
    for region, answers in answers_by_region:
        frame = pd.DataFrame(answers, columns=["Location", "Answer", "Confidence"])
        frame["Region"] = region
        frame.insert(0, "Question", question)
        frames.append(frame)

    df_combined = pd.concat(frames, ignore_index=True)

    # write csv if desired
    if write_csv:
        file_name = question.lower().replace(" ", "_").replace("?", "") + ".csv"
        # create the output folder on first use instead of failing on a fresh checkout
        os.makedirs("predictions", exist_ok=True)
        df_combined.to_csv(os.path.join("predictions", file_name), index=False, encoding='utf-8')

    return df_combined

In [120]:
question = "What is this?"
switzerland_answers, international_answers, grisons_answers = ask_question_about_all(question)
df = build_df(question, switzerland_answers, international_answers, grisons_answers, True)

# slice the combined frame once per region
switzerland_rows = df.loc[df['Region'].eq('Switzerland')]
international_rows = df.loc[df['Region'].eq('International')]
grisons_rows = df.loc[df['Region'].eq('Grisons')]

# mean confidence per region and over all rows
average_confidence_switzerland = switzerland_rows['Confidence'].mean()
average_confidence_international = international_rows['Confidence'].mean()
average_confidence_grisons = grisons_rows['Confidence'].mean()
average_confidence = df['Confidence'].mean()

# the tabs after each label align the numbers by hand
print(
    f"Average Confidence Switzerland:\t\t{average_confidence_switzerland:.4f}",
    f"Average Confidence International:\t{average_confidence_international:.4f}",
    f"Average Confidence Grisons:\t\t{average_confidence_grisons:.4f}",
    f"Average Confidence Total:\t\t{average_confidence:.4f}",
    "------------------------------------------------",
    sep="\n",
)
df

Predicting answers for the question 'What is this?'
Switzerland completed
International completed
Grisons completed
------------------------------------------------
Average Confidence Switzerland:		0.6026
Average Confidence International:	0.6071
Average Confidence Grisons:		0.5825
Average Confidence Total:		0.5974
------------------------------------------------


Unnamed: 0,Question,Location,Answer,Confidence,Region
0,What is this?,Matterhorn,mountain,0.9562,Switzerland
1,What is this?,Bundeshaus,building,0.6159,Switzerland
2,What is this?,Chateau de Chillon,castle,0.9067,Switzerland
3,What is this?,Creux du Van,bridge,0.9412,Switzerland
4,What is this?,Bern Old City,bridge,0.147,Switzerland
5,What is this?,Jungfraujoch,mountain,0.8022,Switzerland
6,What is this?,Kapellbrücke,bridge,0.6213,Switzerland
7,What is this?,Ascona,beach,0.5799,Switzerland
8,What is this?,Grindelwald,ski resort,0.5198,Switzerland
9,What is this?,Chateau de Gruyeres,castle,0.8702,Switzerland


In [14]:
# Questions asked about every image, one per line.
# NOTE(review): "What is the name of this" has no trailing "?" unlike the others —
# confirm whether that is intentional before re-running the experiment.
question_list = [
    "What is this?",
    "What is this called?",
    "What is the name of this",
    "What country is this?",
    "Where is this?",
    "Which region is this in?",
    "What can people do here?",
    "What is this famous for?",
    "What is special here?",
    "What is the best time to visit here?",
    "Is this expensive?",
    "Is this in nature?",
    "Is this in a city?",
    "What sport can I do here?",
    "Is this a tourist attraction?",
    "What are activities here?",
]
# collects one result DataFrame per question
df_list = []

In [123]:
# start from an empty list so that re-running this cell does not append duplicates
df_list = []

for current_question in question_list:
    switzerland_answers, international_answers, grisons_answers = ask_question_about_all(current_question)
    df = build_df(current_question, switzerland_answers, international_answers, grisons_answers, write_csv=True)
    df_list.append(df)

    # mean confidence per region and over all rows
    switzerland_rows = df[df['Region'] == 'Switzerland']
    average_confidence_switzerland = switzerland_rows['Confidence'].mean()

    international_rows = df[df['Region'] == 'International']
    average_confidence_international = international_rows['Confidence'].mean()

    grisons_rows = df[df['Region'] == 'Grisons']
    average_confidence_grisons = grisons_rows['Confidence'].mean()

    average_confidence = df['Confidence'].mean()

    # the tabs after each label align the numbers by hand
    print(f"Average Confidence Switzerland:\t\t{average_confidence_switzerland:.4f}")
    print(f"Average Confidence International:\t{average_confidence_international:.4f}")
    print(f"Average Confidence Grisons:\t\t{average_confidence_grisons:.4f}")
    print(f"Average Confidence Total:\t\t{average_confidence:.4f}")
    print("------------------------------------------------")

Predicting answers for the question 'What is this?'
Switzerland completed
International completed
Grisons completed
------------------------------------------------
Average Confidence Switzerland:		0.6026
Average Confidence International:	0.6071
Average Confidence Grisons:		0.5825
Average Confidence Total:		0.5974
------------------------------------------------
Predicting answers for the question 'What is this called?'
Switzerland completed
International completed
Grisons completed
------------------------------------------------
Average Confidence Switzerland:		0.5977
Average Confidence International:	0.5884
Average Confidence Grisons:		0.5229
Average Confidence Total:		0.5697
------------------------------------------------
Predicting answers for the question 'What is the name of this'
Switzerland completed
International completed
Grisons completed
------------------------------------------------
Average Confidence Switzerland:		0.4416
Average Confidence International:	0.3518
Averag

In [None]:
# looking at results: the loop appends one DataFrame per entry of question_list, in order,
# so index 0 should be the frame for the first question (assumes the loop cell ran exactly once)
df_list[0]

In [16]:
# one-off question outside question_list; the text is kept in a single variable
# so the prediction call and the stored "Question" column cannot drift apart
language_question = "What language do people speak here?"
switzerland_answers, international_answers, grisons_answers = ask_question_about_all(language_question)
df = build_df(
    language_question,
    switzerland_answers,
    international_answers,
    grisons_answers,
    write_csv=True,
)
df

Predicting answers for the question 'What language do people speak here?'
Switzerland completed
International completed
Grisons completed
------------------------------------------------


Unnamed: 0,Question,Location,Answer,Confidence,Region
0,What language do people speak here?,Matterhorn,english,0.3019,Switzerland
1,What language do people speak here?,Bundeshaus,english,0.7522,Switzerland
2,What language do people speak here?,Chateau de Chillon,spanish,0.3803,Switzerland
3,What language do people speak here?,Creux du Van,spanish,0.2742,Switzerland
4,What language do people speak here?,Bern Old City,chinese,0.3457,Switzerland
5,What language do people speak here?,Jungfraujoch,english,0.2761,Switzerland
6,What language do people speak here?,Kapellbrücke,french,0.2807,Switzerland
7,What language do people speak here?,Ascona,spanish,0.4287,Switzerland
8,What language do people speak here?,Grindelwald,french,0.3174,Switzerland
9,What language do people speak here?,Chateau de Gruyeres,spanish,0.2644,Switzerland


____________________
## Testing 1: Advanced question structure
**Theory**: To achieve better results, two questions can be chained.

During testing, answers were sometimes better when the question was more specific.

Generic Question: Picture - Eiffel Tower; Question - What is the name of this?; Answer - tower

Specific Question: Picture - Eiffel Tower; Question - What is the name of this tower?; Answer: eiffel tower

**Goal**: Ask a generic first question ("What is this?") to extract the type of object shown, e.g. "tower".

Then formulate a second question that integrates the answer to the first one ("What is the name of this tower?") to obtain the specific name, e.g. "eiffel tower".

In [151]:
# complex question structure
question_1 = "What is this?"
switzerland_answers_1, international_answers_1, grisons_answers_1 = ask_question_about_all(question_1)


def _ask_follow_up(first_answers:list, region_paths:dict):
    """Ask "What is the name of this <first answer>?" for every landmark of one region.

    `first_answers` holds the ``(location, answer, confidence)`` tuples of the first
    question and `region_paths` maps each location to its image path. The follow-up
    question and the image path are printed for every landmark.

    Returns a list of ``(location, first_answer, second_answer, second_confidence)`` tuples.
    """
    follow_ups = []
    for location, first_answer, _first_confidence in first_answers:
        question_2 = f"What is the name of this {first_answer}?"
        print(f"Question 2: {question_2}")

        # the path dictionaries already hold the slugged, numbered file names
        image_path = region_paths[location]
        print(image_path)

        answer_2, confidence_2 = vilt_vqa(image_path, question_2, False)
        follow_ups.append((location, first_answer, answer_2, confidence_2))
    return follow_ups


switzerland_answers_2 = _ask_follow_up(switzerland_answers_1, switzerland_paths)
print("Switzerland 2 completed")

international_answers_2 = _ask_follow_up(international_answers_1, international_paths)
print("International 2 completed")

grisons_answers_2 = _ask_follow_up(grisons_answers_1, grisons_paths)
print("Grisons 2 completed")


Predicting answers for the question 'What is this?'
Switzerland completed
International completed
Grisons completed
------------------------------------------------
Question 2: What is the name of this mountain?
images\switzerland\01_matterhorn.jpg
Question 2: What is the name of this building?
images\switzerland\02_bundeshaus.jpg
Question 2: What is the name of this castle?
images\switzerland\03_chateau_de_chillon.jpg
Question 2: What is the name of this bridge?
images\switzerland\04_creux_du_van.jpg
Question 2: What is the name of this bridge?
images\switzerland\05_bern_old_city.jpg
Question 2: What is the name of this mountain?
images\switzerland\06_jungfraujoch.jpg
Question 2: What is the name of this bridge?
images\switzerland\07_kapellbruecke.jpg
Question 2: What is the name of this beach?
images\switzerland\08_ascona.jpg
Question 2: What is the name of this ski resort?
images\switzerland\09_grindelwald.jpg
Question 2: What is the name of this castle?
images\switzerland\10_chatea

In [153]:
switzerland_answers_2

[('Matterhorn', 'mountain', 'unknown', 0.1681),
 ('Bundeshaus', 'building', 'white house', 0.324),
 ('Chateau de Chillon', 'castle', 'castle', 0.2481),
 ('Creux du Van', 'bridge', 'bridge', 0.1234),
 ('Bern Old City', 'bridge', 'golden gate', 0.1429),
 ('Jungfraujoch', 'mountain', 'alps', 0.2876),
 ('Kapellbrücke', 'bridge', "don't know", 0.1718),
 ('Ascona', 'beach', 'unknown', 0.2161),
 ('Grindelwald', 'ski resort', 'unknown', 0.2085),
 ('Chateau de Gruyeres', 'castle', 'castle', 0.3097),
 ('Aletsch Glacier', 'mountains', 'alps', 0.6846),
 ('Lauterbrunnen', 'mountains', 'alps', 0.5327),
 ('Rheinfall', 'bridge', 'train', 0.3492),
 ('Zermatt', 'ski resort', 'unknown', 0.1693),
 ('Lion Monument', 'oven', 'pizza', 0.1285),
 ('Schilthorn', 'mountain', 'alps', 0.4925),
 ('Lenzburg Castle', 'house', 'house', 0.143),
 ('Grossmünster', 'clock tower', 'big ben', 0.9954),
 ("Jet d'Eau Lake Geneva", 'water', 'ocean', 0.163),
 ('Zytglogge', 'clock tower', 'big ben', 0.5711)]

In [154]:
international_answers_2

[('Eiffel Tower', 'tower', 'eiffel tower', 0.2466),
 ('Uluru', 'mountain', 'rocky', 0.1166),
 ('Louvre', 'building', 'big ben', 0.2527),
 ('Lady Liberty', 'statue', 'washington', 0.1546),
 ('Colosseum', 'building', 'church', 0.0849),
 ('Niagara Falls', 'water', 'ocean', 0.1413),
 ('Disney World', 'clock tower', 'big ben', 0.9809),
 ('Forbidden City', 'china', 'china', 0.3814),
 ('Golden Gate Bridge', 'bridge', 'golden gate', 0.8134),
 ('Times Square', 'city', 'new york', 0.8351),
 ('Notre Dame', 'church', 'church', 0.4182),
 ('Sagrada Familia', 'building', 'church', 0.9403),
 ('Neuschwanstein Castle', 'castle', 'castle', 0.7632),
 ('Sydney Opera House', 'boat', 'boat', 0.0556),
 ('Great Wall', 'mountain', 'unknown', 0.2253),
 ('Florence Cathedral', 'church', 'church', 0.5883),
 ('Mount Rushmore', 'statue', 'unknown', 0.0812),
 ('Brooklyn Bridge', 'bridge', 'golden gate', 0.6032),
 ('Grand Canyon', 'mountains', 'unknown', 0.1753),
 ('Las Vegas Strip', 'city', 'new york', 0.7776)]

In [155]:
grisons_answers_2

[('Arosa', 'lake', 'lake', 0.2391),
 ('Bergün', 'street', 'main', 0.6778),
 ('Lai da Palpuogna', 'lake', 'unknown', 0.2017),
 ('samedan', 'mountain', 'alps', 0.201),
 ('S-chanf', 'bridge', "don't know", 0.1241),
 ('Landwasser Viaduct', 'bridge', 'train', 0.2434),
 ('Brusio Spiral Viaduct', 'train', 'thomas', 0.3048),
 ('Chur', 'city', 'london', 0.2499),
 ('Lake Silvaplana', 'kites', 'kites', 0.2004),
 ('St. Moritz', 'city', 'london', 0.2665),
 ('Lake Cauma', 'snow', 'snow', 0.3717),
 ('White Turf', 'horses', 'horses', 0.1757),
 ('Guarda', 'kite', 'bird', 0.1412),
 ('Davos Schatzalp', 'train', 'unknown', 0.1434),
 ('Ruinaulta', 'mountains', 'alps', 0.2913),
 ('Maienfeld', 'trees', 'pine', 0.3008),
 ('Muottas Muragl', 'train', 'red', 0.1747),
 ('Oberengadin', 'mountains', 'alps', 0.4051),
 ('San Gian', 'clock tower', 'big ben', 0.6896),
 ('Corviglia Piz Nair', 'ski lift', 'unknown', 0.2302)]

___________
## Testing 2: Plausibility tests with basic images

In [106]:
# os.path.join keeps the paths portable; with hard-coded backslashes
# os.path.basename cannot split off the file name outside Windows
test_paths = [
    os.path.join("images", "test", file_name)
    for file_name in ("01_chair.jpg", "02_lamp.jpg", "03_apple.jpg")
]
test_questions = ["What country is this?", "What language do people speak here?", "What can people do here?", "Is this a tourist attraction?", "Is this in Switzerland?", "What is the name of this?", "Can I eat this?", "What is the material of this?"]

# every question is asked about every test image; one separator line per question
for test_question in test_questions:
    for test_image_path in test_paths:
        test_answer = vilt_vqa(test_image_path, test_question, False)
        print(f"Image: {os.path.basename(test_image_path)}, Question: {test_question}, Answer: {test_answer}")
    print("---------------------------------------------------")


Image: 01_chair.jpg, Question: What country is this?, Answer: ('usa', 0.3684)
Image: 02_lamp.jpg, Question: What country is this?, Answer: ('japan', 0.4154)
Image: 03_apple.jpg, Question: What country is this?, Answer: ('usa', 0.3415)
---------------------------------------------------
Image: 01_chair.jpg, Question: What language do people speak here?, Answer: ('english', 0.8774)
Image: 02_lamp.jpg, Question: What language do people speak here?, Answer: ('chinese', 0.3513)
Image: 03_apple.jpg, Question: What language do people speak here?, Answer: ('english', 0.6884)
---------------------------------------------------
Image: 01_chair.jpg, Question: What can people do here?, Answer: ('eat', 0.2086)
Image: 02_lamp.jpg, Question: What can people do here?, Answer: ('pee', 0.0966)
Image: 03_apple.jpg, Question: What can people do here?, Answer: ('eat', 0.7822)
---------------------------------------------------
Image: 01_chair.jpg, Question: Is this a tourist attraction?, Answer: ('no', 0.9

___________________________________
## Testing 3: Trying out specific questions / pictures

In [156]:
# prepare image + question
image_path = os.path.join("images", "international", "01_eiffel_tower.jpg")
text = "What is the name of this?"

# vilt_vqa wraps image loading, preprocessing and the forward pass,
# so the pipeline does not have to be repeated in every cell
predicted_answer, predicted_confidence = vilt_vqa(image_path, text, False)
print("Predicted answer:", predicted_answer)

Predicted answer: tower


In [152]:
# prepare image + question
image_path = os.path.join("images", "international", "01_eiffel_tower.jpg")
text = "What is the name of this tower?"

# vilt_vqa wraps image loading, preprocessing and the forward pass,
# so the pipeline does not have to be repeated in every cell
predicted_answer, predicted_confidence = vilt_vqa(image_path, text, False)
print("Predicted answer:", predicted_answer)

Predicted answer: eiffel tower


In [29]:
# prepare image + question
# NOTE(review): this cell reads "eiffelturm.jpg" while the cells above use
# "01_eiffel_tower.jpg" — confirm that the file still exists under this name
image_path = os.path.join("images", "international", "eiffelturm.jpg")
text = "Where is this?"

# vilt_vqa wraps image loading, preprocessing and the forward pass,
# so the pipeline does not have to be repeated in every cell
predicted_answer, predicted_confidence = vilt_vqa(image_path, text, False)
print("Predicted answer:", predicted_answer)

Predicted answer: park


In [30]:
# prepare image + question
# NOTE(review): this cell reads "eiffelturm.jpg" while earlier cells use
# "01_eiffel_tower.jpg" — confirm that the file still exists under this name
image_path = os.path.join("images", "international", "eiffelturm.jpg")
text = "What city is this?"

# vilt_vqa wraps image loading, preprocessing and the forward pass,
# so the pipeline does not have to be repeated in every cell
predicted_answer, predicted_confidence = vilt_vqa(image_path, text, False)
print("Predicted answer:", predicted_answer)

Predicted answer: paris


In [33]:
# prepare image + question
# NOTE(review): this cell reads "eiffelturm.jpg" while earlier cells use
# "01_eiffel_tower.jpg" — confirm that the file still exists under this name
image_path = os.path.join("images", "international", "eiffelturm.jpg")
text = "Which country is this?"

# vilt_vqa wraps image loading, preprocessing and the forward pass,
# so the pipeline does not have to be repeated in every cell
predicted_answer, predicted_confidence = vilt_vqa(image_path, text, False)
print("Predicted answer:", predicted_answer)

Predicted answer: usa
