In [1]:
# Environment setup: install the Python packages the notebook imports.
# transformers is pinned to 4.28.1; Whisper is installed straight from its GitHub repository.
!pip install torch torchvision
!pip install transformers==4.28.1
!pip install pillow
!pip install git+https://github.com/openai/whisper.git
!pip install ffmpeg-python
# Clone the Salesforce BLIP repository and install it in editable mode from inside its folder,
# then return to the previous working directory.
# NOTE(review): the code visible in this notebook loads BLIP through `transformers`;
# confirm whether the cloned repository is still needed.
!git clone https://github.com/salesforce/BLIP.git
%cd BLIP
!pip install -e .
%cd ..


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [2]:
import json
import os

import cv2
import ffmpeg
import torch
import whisper
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Prefer the GPU when PyTorch reports one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# BLIP setup: the processor and the captioning model come from the same checkpoint;
# the model is moved to the selected device.
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
model = model.to(device)

# Whisper setup: the "base" model is loaded; pass another model name here if you prefer.
whisper_model = whisper.load_model("base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

100%|███████████████████████████████████████| 139M/139M [00:01<00:00, 78.0MiB/s]


In [3]:
def extract_frames(video_path, interval_sec=1.5, min_frames=2):
    """Sample frames from a video at a fixed time interval.

    Timestamps start at 0 and advance by ``interval_sec`` while they stay
    strictly below the video duration (frame count divided by FPS). When
    that yields fewer than ``min_frames`` timestamps, two timestamps are
    used instead: the start of the video and 0.5 s before its end (clamped
    at 0).

    Args:
        video_path: Path of the video, passed to ``cv2.VideoCapture``.
        interval_sec: Seconds between two sampled frames; must be positive.
        min_frames: Minimum number of timestamps below which the two-point
            fallback is used.

    Returns:
        A list of ``PIL.Image`` objects in RGB order, one per timestamp that
        could be read. The list is empty when the video cannot be opened or
        reports no usable FPS.

    Raises:
        ValueError: If ``interval_sec`` is not positive (the sampling loop
            would otherwise never terminate).
    """
    if interval_sec <= 0:
        raise ValueError(f"interval_sec must be positive, got {interval_sec!r}")

    vidcap = cv2.VideoCapture(video_path)
    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # An unreadable file reports 0 FPS; bail out instead of dividing by zero.
        if not vidcap.isOpened() or not fps > 0:
            return []

        total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
        duration_sec = total_frames / fps

        # Multiply instead of accumulating so float rounding errors do not
        # build up over long videos.
        timestamps = []
        step = 0
        while step * interval_sec < duration_sec:
            timestamps.append(step * interval_sec)
            step += 1

        if len(timestamps) < min_frames:
            timestamps = [0, max(0, duration_sec - 0.5)]

        frames = []
        for t in timestamps:
            vidcap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
            success, frame = vidcap.read()
            if success:
                # OpenCV decodes to BGR; PIL expects RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame_rgb))
        return frames
    finally:
        # Always free the capture handle, even if decoding raised.
        vidcap.release()


def describe_frame(frame: Image.Image):
    """Return the caption the BLIP model generates for a single frame.

    The frame is run through the module-level ``processor``, the resulting
    tensors are moved to ``device``, and the first generated sequence is
    decoded with special tokens stripped.
    """
    model_inputs = processor(frame, return_tensors="pt")
    model_inputs = model_inputs.to(device)
    generated_ids = model.generate(**model_inputs)
    return processor.decode(generated_ids[0], skip_special_tokens=True)



In [4]:
def extract_audio_from_video(video_path, audio_path="temp_audio.wav"):
    """Write the audio of ``video_path`` to ``audio_path`` as a WAV file.

    The ffmpeg output is requested as 16-bit PCM, one channel, 16000 Hz, and
    an existing file at ``audio_path`` is overwritten. ffmpeg runs with
    ``quiet=True``.

    Returns:
        ``audio_path``, unchanged.
    """
    stream = ffmpeg.input(video_path)
    stream = stream.output(
        audio_path,
        format='wav',
        acodec='pcm_s16le',
        ac=1,
        ar='16000',
    )
    stream = stream.overwrite_output()
    stream.run(quiet=True)
    return audio_path

def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level Whisper model.

    Returns:
        The ``"text"`` entry of the result returned by
        ``whisper_model.transcribe``.
    """
    return whisper_model.transcribe(audio_path)["text"]


In [5]:
def _analyze_clip(video_folder, video_file, interval_sec):
    """Caption the sampled frames of one clip and transcribe its audio."""
    path = os.path.join(video_folder, video_file)

    # Frame extraction and captioning
    descrizioni = []
    for i, frame in enumerate(extract_frames(path, interval_sec=interval_sec)):
        desc = describe_frame(frame)
        descrizioni.append(f"Frame {i+1}: {desc}")
        print(f"  Frame {i+1}: {desc}")

    # Audio extraction and transcription
    audio_path = extract_audio_from_video(path)
    try:
        transcript = transcribe_audio(audio_path)
    finally:
        # The WAV file is only an intermediate artefact; do not leave it behind.
        if os.path.exists(audio_path):
            os.remove(audio_path)
    print(f" Trascrizione audio:\n{transcript}\n")

    return {
        "nome_clip": video_file,
        "descrizioni_frame": descrizioni,
        "trascrizione_audio": transcript
    }


def _build_prompt_data(dati):
    """Assemble the prompt dictionary (instructions, questions, answer format, clip data)."""
    return {
        # Every fragment ends with a space so the implicitly concatenated
        # sentences do not run into each other.
        "istruzioni": (
            "Riceverai la descrizione visiva (caption dei frame, considera che potrebbero non essere accurati al 100% , se vedi un frame sospetto rispetto agli altri potrebbe essere un outlier) e la trascrizione audio di più clip video. "
            "Il tuo compito è rispondere alle 10 domande per **ogni** clip in modo preciso e puntuale, basandoti solo sui dati ricevuti. "
            "Se le informazioni non sono presenti o non sono chiare, specifica 'Informazione non disponibile'. "
            "Una volta che avrai risposto alle 10 domande per ogni clip, devi rispondere un'ultima volta alle 10 domande, questa volta considerando tutte le clip nel loro insieme come una unica. "
            "Puoi sfruttare le risposte che hai gia dato sulle singole clip per dare le risposte sul totale, formatta le risposte sul totale come per le precendenti, speficiando della sezione Clip: Totale Video."
        ),
        "domande": [
            "1. Cosa succede in questa clip?",
            "2. Che modello di fucile viene mostrato o recensito?",
            "3. Viene descritto qualche dettaglio tecnico del fucile? Se sì, quali?",
            "4. Viene mostrato il funzionamento pratico del fucile? Ad esempio, il caricamento, lo sparo o il meccanismo interno?",
            "5. Ci sono commenti o opinioni sulle prestazioni del fucile? Se sì, quali?",
            "6. Sono menzionati o mostrati accessori, modifiche o personalizzazioni del fucile?",
            "7. Vengono mostrati test di tiro o prove pratiche? Se sì, su quali bersagli e con quali risultati?",
            "8. Viene fatto un confronto con altri modelli di fucile o armi simili?",
            "9. Ci sono indicazioni sull'utilizzo previsto del fucile? Ad esempio, caccia, tiro sportivo, softair, difesa, collezionismo?",
            "10. L'utente parla di pregi e difetti? Se sì, quali vengono evidenziati?"
        ],
        "formattazione_risposte": (
            "Per ogni clip rispondi alle 10 domande in questo formato:\n"
            "Clip: [nome_clip]\n"
            "Domanda 1: ...\n"
            "Domanda 2: ...\n"
            "...\n"
            "Domanda 10: ...\n"
        ),
        "dati": dati
    }


def analyze_videos_and_generate_prompt(video_folder, output_json_path, interval_sec=1.5):
    """Analyse every clip in a folder and save an LLM prompt as JSON.

    Files in ``video_folder`` whose name ends with ``.mp4``, ``.avi`` or
    ``.mov`` (case-insensitive) are processed in sorted order. For each clip
    the sampled frames are captioned and the audio track is transcribed;
    progress is printed along the way. The collected data is written,
    together with the fixed instructions, questions and answer format, to
    ``output_json_path`` as UTF-8 JSON.

    Args:
        video_folder: Directory containing the clips.
        output_json_path: Destination of the JSON file. Missing parent
            directories are created; a bare file name (no directory part)
            is written to the current working directory.
        interval_sec: Seconds between sampled frames, forwarded to
            ``extract_frames``.

    Returns:
        None.
    """
    videos = sorted(
        f for f in os.listdir(video_folder)
        if f.lower().endswith((".mp4", ".avi", ".mov"))
    )

    dati = []
    for video_file in videos:
        print(f"\nAnalizzo video: {video_file}")
        dati.append(_analyze_clip(video_folder, video_file, interval_sec))

    prompt_data = _build_prompt_data(dati)

    # os.path.dirname() is "" for a bare file name and os.makedirs("") raises,
    # so only create the directory when there is one.
    output_dir = os.path.dirname(output_json_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(prompt_data, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Prompt finale salvato in: {output_json_path}")






In [None]:
import json
from google.colab import drive
# Mount Google Drive at /content/drive (forcing a remount) so the paths below resolve.
drive.mount('/content/drive', force_remount=True)

# Input folder with the clips to analyse and destination of the generated prompt JSON.
video_folder = "/content/drive/MyDrive/clip_output_univtg/clip_video_3"  # set this to your own folder
output_json_path = "/content/drive/MyDrive/Prompt_univtg/Prompt_video_3.json"
analyze_videos_and_generate_prompt(video_folder, output_json_path)




Mounted at /content/drive

Analizzo video: clip_00.mp4




  Frame 1: a man holding a rifle and looking at it
  Frame 2: a man holding a rifle and looking at the camera
  Frame 3: a man holding a rifle and looking at it
  Frame 4: a man in camouflage jacket holding a rifle
  Frame 5: a man in a camouflage jacket holding a rifle
  Frame 6: a man in camouflage clothing holding a gun
  Frame 7: a man with a gun in his hand
  Frame 8: a man in a black jacket holding a rifle
  Frame 9: a man in a black jacket holding a rifle
  Frame 10: a man in a black jacket holding a rifle
  Frame 11: a man with a gun in his hand
  Frame 12: a man in a black jacket holding a rifle
  Frame 13: a man in a black jacket holding a rifle
  Frame 14: a man in sunglasses holding a rifle




 Trascrizione audio:
 $1,200, $1,500. You don't need something quite that expensive. But you don't really want to spend $200 or $250. You'd like to go somewhere above that in quality, kind of mid-rank. These are pretty good choices. These are in the $400 and $500 brackets. We have them in stock. Now always check with us for a price and availability.


Analizzo video: clip_01.mp4




  Frame 1: a man holding a rifle on a table
  Frame 2: a man holding a rifle on a table
  Frame 3: a man holding a rifle with a gun in his hand
  Frame 4: a man holding a rifle with a gun in his hand
  Frame 5: a man holding a rifle on a table
  Frame 6: a man holding a rifle with a gun in his hand
  Frame 7: a man holding a rifle on a table
  Frame 8: a man in a black jacket holding a rifle
  Frame 9: a man in a black jacket holding a gun
  Frame 10: a man in a black jacket holding a rifle
  Frame 11: a man in sunglasses holding a rifle
  Frame 12: a man holding a rifle with a gun in his hand
  Frame 13: a man holding a rifle on a table
  Frame 14: a man holding a rifle with a gun in his hand




 Trascrizione audio:
 gives you is you have a really nice large ejection port here. So it's easy if you're single load and to toss around in and when you eject it out it's got plenty of space. Two and three quarter inch shells got plenty of room to move in there. Come right out. So it really contributes to the gun being reliable with those short rounds. So again 18


Analizzo video: clip_02.mp4




  Frame 1: a man holding a rifle on a table
  Frame 2: a man holding a rifle on a table
  Frame 3: a man holding a rifle on a table
  Frame 4: a man holding a rifle on a table
  Frame 5: a man holding a rifle on a table
  Frame 6: a man holding a rifle on a table
  Frame 7: a man holding a rifle on a table
  Frame 8: a man in a shirt is holding a rifle
  Frame 9: a man holding a rifle and pointing it at the camera
  Frame 10: a man holding a rifle with a rifle in his hand
  Frame 11: a man holding a rifle and looking at it
  Frame 12: a man holding a rifle and looking at it
  Frame 13: a man holding a rifle and looking at it
  Frame 14: a man holding a rifle and looking at it




 Trascrizione audio:
 So there are no stock options here. This is what this is. You get this stock configuration, and that's pretty much what you have. You don't really have any way of changing it. So the other one is the supernova. So here is the supernova. Looks a little bit different. Same essential function. Got the same magazine cut off, got the same basic.


Analizzo video: clip_03.mp4




  Frame 1: a man holding a rifle on a table
  Frame 2: a man holding a rifle on a table
  Frame 3: a man holding a rifle on a table
  Frame 4: a man holding a rifle on a table
  Frame 5: a man holding a rifle on a table
  Frame 6: a man holding a rifle on a table
  Frame 7: a man holding a rifle on a table
  Frame 8: a man holding a rifle on a table
  Frame 9: a man holding a rifle on a table
  Frame 10: a man holding a shotgun with a gun in his hand
  Frame 11: a man holding a shotgun with a gun in his hand
  Frame 12: a man holding a shotgun with a gun in his hand
  Frame 13: a man holding a shotgun with a gun in his hand
  Frame 14: a man holding a shotgun on a table




 Trascrizione audio:
 So you can easily clear the chamber without jamming the gun up. You can easily switch from one type of shell to another without jamming the gun up. As soon as you close the pump, it re-enables the tube. So it's a very simple system. Nothing extra you have to do. Controls on these are very simple, very standard. You've got a safety...


Analizzo video: clip_04.mp4




  Frame 1: a man holding a rifle and looking at it
  Frame 2: a man holding a rifle and looking at it
  Frame 3: a man holding a rifle and looking at it
  Frame 4: a man holding a rifle with a gun in his hand
  Frame 5: a man in a shirt is holding a rifle
  Frame 6: a man in a military uniform is holding a rifle
  Frame 7: a man in a uniform sits at a table with a rifle
  Frame 8: a man in a uniform is sitting at a table with a rifle
  Frame 9: a man in a uniform sits at a table with a rifle
  Frame 10: a man in a uniform sits at a table with a rifle
  Frame 11: a man in a uniform is holding a rifle
  Frame 12: a man in a green shirt sits at a table with a rifle
  Frame 13: a man in a uniform sits at a table with a rifle
  Frame 14: a man in a uniform sits at a table with a rifle




 Trascrizione audio:
 really well and they're getting a little bit more than those $200 guns but without going up to that you know $12 or $1500 range. So really worth taking a look at if that's something of your interest. That's all we have for this week. We really appreciate you watching. Thank you very much.


Analizzo video: clip_05.mp4




  Frame 1: a man in sunglasses holding a rifle
  Frame 2: a man in sunglasses holding a rifle
  Frame 3: a man holding a rifle in his hand
  Frame 4: a man holding a rifle in his hand
  Frame 5: a man in sunglasses holding a rifle
  Frame 6: a man holding a rifle and looking at the camera
  Frame 7: a man holding a rifle on a table
  Frame 8: a man holding a rifle on a table
  Frame 9: a man holding a rifle on a table
  Frame 10: a man holding a rifle on a table
  Frame 11: a man holding a rifle with a gun in his hand
  Frame 12: a man holding a rifle on a table
  Frame 13: a man holding a rifle on a table
  Frame 14: a man holding a rifle on a table




 Trascrizione audio:
 actions on them. Something else that they do on these, you've got sling swivel spot mount here on the back. We've also got one up here on the front and the cap, the magazine cap, has a small little extension right on the front of it. The purpose of that is you can take that magazine cap off, push the action pins out to take the trigger guard out when you're taking it.


Analizzo video: clip_06.mp4




  Frame 1: a man holding a rifle on a table
  Frame 2: a man holding a rifle and pointing it at the camera
  Frame 3: a man holding a rifle on a table
  Frame 4: a man holding a rifle on a table
  Frame 5: a man holding a rifle on a table
  Frame 6: a man holding a rifle on a table
  Frame 7: a man holding a rifle with a rifle in his hand
  Frame 8: a man holding a rifle with a gun in his hand
  Frame 9: a man holding a rifle on a table
  Frame 10: a man holding a rifle on a table
  Frame 11: a man holding a rifle on a table
  Frame 12: a man holding a rifle on a table
  Frame 13: a man holding a rifle on a table
  Frame 14: a man holding a rifle on a table




 Trascrizione audio:
 that's kind of nice because you know on some guns you got to reach way out here to find the pump handle. On this one you can actually hold it back here so it keeps you in close where you're tight. And where you're strong so that's nice and it works really well from that position. Another feature that these guns have that is a little unusual for pumps, you see it on semi-autos with some frequency but not some...


✅ Prompt finale salvato in: /content/drive/MyDrive/Prompt_univtg/Prompt_video_3.json
