# Init
Requires GPU to use quantization

In [1]:
!pip install -q -U transformers==4.37.2
!pip install -q bitsandbytes==0.41.3 accelerate==0.25.0
!pip install datamart-profiler
!pip install bitsandbytes
!pip install accelerate

Collecting datamart-profiler
  Downloading datamart_profiler-0.11-py3-none-any.whl (31 kB)
Collecting opentelemetry-api (from datamart-profiler)
  Downloading opentelemetry_api-1.24.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting datamart-geo<0.4,>=0.2.3 (from datamart-profiler)
  Downloading datamart_geo-0.3.1-py3-none-any.whl (7.8 kB)
Collecting deprecated>=1.2.6 (from opentelemetry-api->datamart-profiler)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting importlib-metadata<=7.0,>=6.0 (from opentelemetry-api->datamart-profiler)
  Downloading importlib_metadata-7.0.0-py3-none-any.whl (23 kB)
Installing collected packages: importlib-metadata, deprecated, opentelemetry-api, datamart-geo, datamart-profiler
  Attempting uninstall: importlib-metadata
    Found existing installation: importlib_metadata 7.1.0
    Uninstalling importlib_metadata-7.1.0:
      Suc

In [2]:
!pip install pillow



#Video to Frames

In [3]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

Disk version. Saves frames to a folder for human examination/clustering, etc. We can open the images later, if necessary.

In [28]:
import cv2
import math

video_path = "/collie.mp4"
output_path = "/media" #ATTENTION! The contents of this folder will be removed before each run.

def addzeros(i, length):
  ans = str(i)
  while(len(ans)<len(str(length))):
    ans = "0"+ ans
  return ans

import os
import glob

files = glob.glob(output_path+"/*")
for f in files:
    os.remove(f)

num_frames = 10 #desired number of frames

vid_obj = cv2.VideoCapture(video_path)

length = int(vid_obj.get(cv2.CAP_PROP_FRAME_COUNT))
print(length)

frames = []

i = 0
result = True

while(i<length):
    vid_obj.set(cv2.CAP_PROP_POS_FRAMES,i)
    result, image = vid_obj.read()
    frames.append(image)
    output_fullpath = output_path + "/frame" + addzeros(i,length) + ".jpg"
    cv2.imwrite(output_fullpath,image)
    i+=math.ceil(length/num_frames)


208


RAM version. A slightly faster version for direct evaluation with BLIP. No files are stored.

In [29]:
import cv2
import math


vid_obj = cv2.VideoCapture(video_path)

length = int(vid_obj.get(cv2.CAP_PROP_FRAME_COUNT))
print(length)

frames = []

i = 0
result = True

while(i<length):
    vid_obj.set(cv2.CAP_PROP_POS_FRAMES,i)
    result, image = vid_obj.read()
    frames.append(image)
    i+=math.ceil(length/num_frames)

208


# Blip

In [30]:
import os
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# list to store files
res = []

descriptions = []
for raw_image in frames:
    inputs = processor(raw_image, return_tensors="pt")

    out = model.generate(**inputs)
    descriptions.append(processor.decode(out[0], skip_special_tokens=True))

print(descriptions)

['a dog with a purple tooth', 'a dog with its mouth open and its tongue out', 'a dog with its tongue out and its mouth open', 'a dog with a purple tooth', 'a dog with its tongue out and its mouth open', 'a dog with a black face and a white background', 'a dog with a blue collar and a white background', 'a dog getting his teeth brushed by a dentist', 'a dog getting his hair washed with a glove', 'a dog with its head in the air']


#Prompt

In [37]:
prompt = "You are given the following descriptions of frames from the same video: "

for item in descriptions:
  prompt+=item+'; '

prompt+= '.'
prompt+= " Provide a possible desciption of the video, do not mention what was provide to you. ANSWER:"

prompt

'You are given the following descriptions of frames from the same video: a dog with a purple tooth; a dog with its mouth open and its tongue out; a dog with its tongue out and its mouth open; a dog with a purple tooth; a dog with its tongue out and its mouth open; a dog with a black face and a white background; a dog with a blue collar and a white background; a dog getting his teeth brushed by a dentist; a dog getting his hair washed with a glove; a dog with its head in the air; . Provide a possible desciption of the video, do not mention what was provide to you. ANSWER:'

#LLaMA

In [8]:
hf_key =  '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'

!pip install huggingface_hub

from huggingface_hub import notebook_login

notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
from torch import cuda

model_id = 'meta-llama/Llama-2-7b-chat-hf'
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'

print(device)

cuda:0


In [10]:
from torch import bfloat16
import transformers

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library

bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,  # 4-bit quantization
    bnb_4bit_quant_type='nf4',  # Normalized float 4
    bnb_4bit_use_double_quant=True,  # Second quantization after the first
    bnb_4bit_compute_dtype=bfloat16  # Computation type
)

In [11]:
# Llama 2 Tokenizer
tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)

# Llama 2 Model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    quantization_config=bnb_config,
    device_map='auto',
)
model.eval()

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Lla

In [12]:
# Our text generator
generator = transformers.pipeline(
    model=model, tokenizer=tokenizer,
    task='text-generation',
    temperature=0.1,
    max_new_tokens=500,
    repetition_penalty=1.1
)

In [13]:
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [38]:
print(prompt)

You are given the following descriptions of frames from the same video: a dog with a purple tooth; a dog with its mouth open and its tongue out; a dog with its tongue out and its mouth open; a dog with a purple tooth; a dog with its tongue out and its mouth open; a dog with a black face and a white background; a dog with a blue collar and a white background; a dog getting his teeth brushed by a dentist; a dog getting his hair washed with a glove; a dog with its head in the air; . Provide a possible desciption of the video, do not mention what was provide to you. ANSWER:


In [39]:
res = generator(prompt)
print(res[0]["generated_text"])

You are given the following descriptions of frames from the same video: a dog with a purple tooth; a dog with its mouth open and its tongue out; a dog with its tongue out and its mouth open; a dog with a purple tooth; a dog with its tongue out and its mouth open; a dog with a black face and a white background; a dog with a blue collar and a white background; a dog getting his teeth brushed by a dentist; a dog getting his hair washed with a glove; a dog with its head in the air; . Provide a possible desciption of the video, do not mention what was provide to you. ANSWER: The video is likely showing various shots of different dogs, possibly in different settings or situations. Some of the dogs may be shown with their mouths open or tongues out, while others may have distinctive markings on their coats. There may also be shots of dogs receiving grooming or medical attention, such as having their teeth brushed or getting their hair washed. Overall, the video appears to show a variety of do

In [40]:
answer = res[0]["generated_text"].replace(prompt,"")
print(answer)

 The video is likely showing various shots of different dogs, possibly in different settings or situations. Some of the dogs may be shown with their mouths open or tongues out, while others may have distinctive markings on their coats. There may also be shots of dogs receiving grooming or medical attention, such as having their teeth brushed or getting their hair washed. Overall, the video appears to show a variety of dogs in different scenarios.
