In [1]:
from transformers import pipeline
from transformers import AutoProcessor, SpeechT5Processor, SpeechT5ForTextToSpeech
from transformers import SpeechT5HifiGan
from datasets import load_dataset
import soundfile as sf
from IPython.display import Audio
import torch
from IPython.display import Image
from nltk.translate.bleu_score import sentence_bleu
import openai
import requests 
import os
import evaluate
import warnings

In [2]:
# Sample images from the COCO dataset, referenced by their Flickr-hosted URLs.
# Each URL is captioned, rephrased and converted to speech in the cells below.
URL1 = 'http://farm7.staticflickr.com/6169/6242196154_b0bccaf643_z.jpg'
URL2 = 'http://farm6.staticflickr.com/5105/5611042487_8dda44ec45_z.jpg'
URL3 = 'http://farm3.staticflickr.com/2692/4442800444_c9208a7fa2_z.jpg'

In [3]:
# First captioner: the nlpconnect ViT-GPT2 image-captioning checkpoint.
vit_gpt2 = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [4]:
# Second captioner: the Salesforce BLIP base image-captioning checkpoint.
blip_base = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
)

In [5]:
# Preview sample image 1 (rendered at 400x300) before captioning it.
Image(url=URL1 , width=400, height=300)

In [6]:
# Caption image 1 with ViT-GPT2 and show the generated text as the cell output.
# NOTE(review): the following cell repeats this exact call to store the caption;
# storing once and displaying the variable would save an inference pass.
vit_gpt2(URL1, max_new_tokens=100)[0]['generated_text']

'motorcycles parked in a parking lot '

In [7]:
# Keep the ViT-GPT2 caption of image 1 so it can be merged with BLIP's caption.
vit_gpt2_output_1 = vit_gpt2(
    URL1,
    max_new_tokens=100,
)[0]["generated_text"]

In [8]:
# Caption image 1 with BLIP and show the generated text as the cell output.
# NOTE(review): the following cell repeats this exact call to store the caption;
# storing once and displaying the variable would save an inference pass.
blip_base(URL1, max_new_tokens=100)[0]['generated_text']

'a group of motorcycles parked in a parking lot'

In [9]:
# Keep the BLIP caption of image 1 so it can be merged with ViT-GPT2's caption.
blip_base_output_1 = blip_base(
    URL1,
    max_new_tokens=100,
)[0]["generated_text"]

In [10]:
# Take the OpenAI key from the OPENAI_API_KEY environment variable rather than
# pasting it into the notebook, so the secret is never saved or shared with the
# file. An empty string is kept as the fallback; warn early so the missing key
# is noticed here instead of at the first API call.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")
if not openai.api_key:
    warnings.warn(
        "OPENAI_API_KEY is not set; requests made by chat_with_gpt() "
        "will not be authenticated."
    )

In [11]:
# Shared transcript ("User: ..." / "AI: ..." lines) that chat_with_gpt() reads
# to build each prompt and appends to after each exchange. Re-run this cell to
# start a fresh conversation.
conversation_history = list()

In [12]:
def chat_with_gpt(message, *, engine="text-davinci-003", temperature=0.7, max_tokens=2049):
    """Send ``message`` to the OpenAI completion endpoint and return the reply.

    The prompt is every line already in the module-level
    ``conversation_history`` followed by the new user turn, joined with
    newlines, so earlier exchanges are sent along with each request. Clear
    ``conversation_history`` to start an unrelated conversation.

    Args:
        message: Text of the user's turn.
        engine: Engine name passed to ``openai.Completion.create``.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on generated tokens passed to the API.

    Returns:
        The reply text with surrounding whitespace and any leading
        "Answer:" / "AI:" labels removed.

    Raises:
        Whatever ``openai.Completion.create`` raises. In that case
        ``conversation_history`` is left exactly as it was before the call.
    """
    user_turn = "User: " + message

    # Build the prompt from a temporary list so that a failed request does not
    # leave an unanswered user turn behind in the shared history.
    # NOTE(review): the history grows with every call while max_tokens stays
    # fixed; confirm the combined size fits the engine's limits for long sessions.
    response = openai.Completion.create(
        engine=engine,
        prompt="\n".join(conversation_history + [user_turn]),
        temperature=temperature,
        max_tokens=max_tokens,
    )

    reply = response.choices[0].text.strip()

    # Labels may be stacked in any order (e.g. "AI: Answer: ..."), so keep
    # peeling until none is left. Each pass shortens the reply, so this ends.
    role_labels = ("Answer:", "AI:")
    while reply.startswith(role_labels):
        for label in role_labels:
            if reply.startswith(label):
                reply = reply[len(label):].strip()

    # Record the exchange only once both halves are known.
    conversation_history.extend([user_turn, "AI: " + reply])

    return reply

In [13]:
# Show the stored ViT-GPT2 caption of image 1 before it is merged into the prompt.
vit_gpt2_output_1

'motorcycles parked in a parking lot '

In [14]:
# Show the stored BLIP caption of image 1 before it is merged into the prompt.
blip_base_output_1

'a group of motorcycles parked in a parking lot'

In [15]:
# Merge both captions of image 1 into a single request for the language model.
# Each caption is stripped and joined with explicit spaces so the prompt reads
# as clean text whether or not a captioner emits trailing whitespace
# (plain concatenation produced "...impaired:motorcycles ...").
user_message = (
    "Act as a human, be as elaborative as possible, combined and refined the text output "
    "that a human would say to a visually impaired: "
    f"{vit_gpt2_output_1.strip()} and {blip_base_output_1.strip()}"
)
experiment1 = chat_with_gpt(user_message)
print(experiment1)

There are several motorcycles parked in a parking lot. There is a large group of them, lined up side by side in neat rows.


In [16]:
# Text-to-speech stage: load the SpeechT5 processor (turns text into model
# inputs) and the SpeechT5 text-to-speech model from the same checkpoint.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

In [18]:
# Encode the description of image 1 as PyTorch tensors ("pt") for the TTS model.
inputs = processor(text=experiment1, return_tensors="pt")

In [19]:
# Load the CMU ARCTIC x-vector dataset (validation split) that provides the
# speaker embeddings used to choose the synthetic voice.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors",
    split="validation",
)
# Entry 7306 is the voice used for every clip in this notebook; a leading axis
# is added so the tensor has shape (1, 512).
speaker_embeddings = torch.unsqueeze(
    torch.tensor(embeddings_dataset[7306]["xvector"]),
    dim=0,
)

In [20]:
# Sanity check: the speaker embedding should have shape (1, 512).
speaker_embeddings.shape

torch.Size([1, 512])

In [21]:
# Load the HiFi-GAN vocoder that is handed to generate_speech() below.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

In [22]:
# Synthesize the spoken description of image 1 in the selected voice.
speech = model.generate_speech(
    inputs["input_ids"],
    speaker_embeddings=speaker_embeddings,
    vocoder=vocoder,
)

In [23]:
# Inspect the size of the generated tensor (a single 1-D axis in the recorded run).
speech.shape

torch.Size([121856])

In [24]:
# Play the generated speech for image 1 inline, at a 16000 Hz sample rate.
Audio(speech, rate=16000)

In [25]:
# Preview sample image 2 (rendered at 400x300) before captioning it.
Image(url=URL2 , width=400, height=300)

In [26]:
# Keep the ViT-GPT2 caption of image 2 so it can be merged with BLIP's caption.
vit_gpt2_output_2 = vit_gpt2(
    URL2,
    max_new_tokens=100,
)[0]["generated_text"]

In [28]:
# Keep the BLIP caption of image 2 so it can be merged with ViT-GPT2's caption.
blip_base_output_2 = blip_base(
    URL2,
    max_new_tokens=100,
)[0]["generated_text"]

In [29]:
# Merge both captions of image 2 into a single request for the language model.
# Each caption is stripped and joined with explicit spaces so the prompt reads
# as clean text whether or not a captioner emits trailing whitespace
# (plain concatenation ran the instruction straight into the first caption).
user_message = (
    "Act as a human, be as elaborative as possible, combined and refined the text output "
    "that a human would say to a visually impaired: "
    f"{vit_gpt2_output_2.strip()} and {blip_base_output_2.strip()}"
)
experiment2 = chat_with_gpt(user_message)
print(experiment2)

I can see a stop sign at an intersection with a street sign. The stop sign is positioned in front of a majestic mountain with a beautiful landscape in the background.


In [30]:
# Encode the description of image 2 as PyTorch tensors ("pt") for the TTS model.
inputs2 = processor(text=experiment2, return_tensors="pt")

In [32]:
# Synthesize the spoken description of image 2 in the same voice as before.
speech2 = model.generate_speech(
    inputs2["input_ids"],
    speaker_embeddings=speaker_embeddings,
    vocoder=vocoder,
)

In [33]:
# Play the generated speech for image 2 inline, at a 16000 Hz sample rate.
Audio(speech2, rate=16000)

In [34]:
# Preview sample image 3 (rendered at 400x300) before captioning it.
Image(url=URL3 , width=400, height=300)

In [35]:
# Keep the ViT-GPT2 caption of image 3 so it can be merged with BLIP's caption.
vit_gpt2_output_3 = vit_gpt2(
    URL3,
    max_new_tokens=100,
)[0]["generated_text"]

In [36]:
# Keep the BLIP caption of image 3 so it can be merged with ViT-GPT2's caption.
blip_base_output_3 = blip_base(
    URL3,
    max_new_tokens=100,
)[0]["generated_text"]

In [37]:
# Merge both captions of image 3 into a single request for the language model.
# Each caption is stripped and joined with explicit spaces so the prompt reads
# as clean text whether or not a captioner emits trailing whitespace
# (plain concatenation ran the instruction straight into the first caption).
user_message = (
    "Act as a human, be as elaborative as possible, combined and refined the text output "
    "that a human would say to a visually impaired: "
    f"{vit_gpt2_output_3.strip()} and {blip_base_output_3.strip()}"
)
experiment3 = chat_with_gpt(user_message)
print(experiment3)

I see a man on a skateboard performing a trick. He is gliding up a ramp and doing a complicated maneuver in mid-air. It looks like he's enjoying himself and having a great time.


In [38]:
# Encode the description of image 3 as PyTorch tensors ("pt") for the TTS model.
inputs3 = processor(text=experiment3, return_tensors="pt")

In [39]:
# Synthesize the spoken description of image 3 in the same voice as before.
speech3 = model.generate_speech(
    inputs3["input_ids"],
    speaker_embeddings=speaker_embeddings,
    vocoder=vocoder,
)

In [40]:
# Play the generated speech for image 3 inline, at a 16000 Hz sample rate.
Audio(speech3, rate=16000)