In [9]:
import os
def get_image_files(folder_path):
    image_extensions = ['.jpeg','.jpg','.png']

    image_files = []
    for file in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, file)):
            if any(file.lower().endswith(ext) for ext in image_extensions):
                image_files.append(file)

    return image_files

activities=['Activity1','Activity2','Activity3','Activity4','Activity5','Activity6','Activity7','Activity8','Activity9','Activity10','Activity11']
subjects=['Subject15','Subject16','Subject17']
trials=['Trial1','Trial2','Trial3']



In [2]:
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images

def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values

# If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
path = 'OpenGVLab/InternVL2_5-8B'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

  from .autonotebook import tqdm as notebook_tqdm
InternLM2ForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


FlashAttention2 is not installed.


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 11.34it/s]


In [3]:
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(path, trust_remote_code=True, use_fast=False)

In [4]:
# set the max number of tiles in `max_num`
#img_path='/home/abid.abderrazek/llama/evaluation/images/S003C001P002R001A015_rgb_1.jpeg'
#pixel_values = load_image(img_path, max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=50, do_sample=True)

# # pure-text conversation (纯文本对话)
# question = 'Hello, how are you?'
# response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
# print(f' {question}\nAssistant: {response}')


In [None]:
# # single-image single-round conversation (单图单轮对话)
# question = '<image>\n what is the person in the image doing. respond in one sentence'
# response = model.chat(tokenizer, pixel_values, question, generation_config)
# print(extract_A_code(img_path))
# print(f'User: {question}\nAssistant: {response}')

In [7]:
import os
def get_image_files(folder_path):
    image_extensions = ['.jpeg','.jpg']

    image_files = []
    for file in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, file)):
            if any(file.lower().endswith(ext) for ext in image_extensions):
                image_files.append(file)

    return image_files


def get_video_files(folder_path):
    image_extensions = ['.mp4']

    files = []
    for file in os.listdir(folder_path):
        if os.path.isfile(os.path.join(folder_path, file)):
            if any(file.lower().endswith(ext) for ext in image_extensions):
                files.append(file)

    return files

def get_class(element):
    return element.split('_p',1)[0] 
    
#video_files = get_video_files('/home/abid.abderrazek/InternVL/mp4')
#image_files=get_image_files('/home/abid.abderrazek/InternVL/toyota_2_keyframes')

In [7]:
file_set = set(image_files)

# Filter files that have both _0 and _1 versions
filtered_files = [
    file for file in image_files 
    if file.endswith('_0.jpeg') and file.replace('_0.jpeg', '_1.jpeg') in file_set
]

# Include both _0 and _1 versions in the final list
result = []
for file in filtered_files:
    result.append(file)
    result.append(file.replace('_0.jpeg', '_1.jpeg'))

# Remove duplicates and sort the list
result = sorted(set(result))
One_key_frame=[f for f in image_files if f not in result] 


len(result),len(image_files),len(One_key_frame)

NameError: name 'image_files' is not defined

In [None]:
not_done=get_image_files('/home/abid.abderrazek/InternVL/extracted_frames')
vids=get_video_files('/home/abid.abderrazek/InternVL/mp4')
len(result),len(not_done),len(vids)

(22168, 10062, 16115)

In [8]:
classes=set([get_class(s) for s in result])
classes

NameError: name 'result' is not defined

In [9]:
activity_dict = {
    'Cook_Cleandishes': ['cook', 'clean', 'dishes', 'cooking', 'washing'],
    'Cook_Cleanup': ['cook', 'clean', 'up', 'cleaning', 'cooking'],
    'Cook_Cut': ['cook', 'cut', 'cooking', 'chop', 'cutting'],
    'Cook_Stir': ['cook', 'stir', 'cooking', 'mix', 'stirring'],
    'Cook_Usestove': ['cook', 'use', 'stove', 'cooking', 'heat'],
    'Cutbread': ['cut', 'bread', 'slice', 'cutting'],
    'Drink_Frombottle': ['drink', 'bottle', 'drinking', 'water'],
    'Drink_Fromcan': ['drink', 'can', 'drinking', 'soda'],
    'Drink_Fromcup': ['drink', 'cup', 'drinking', 'water'],
    'Drink_Fromglass': ['drink', 'glass', 'drinking', 'water', 'juice'],
    'Eat_Attable': ['eat', 'table', 'eating', 'meal'],
    'Eat_Snack': ['eat', 'snack', 'eating', 'bite'],
    'Enter': ['enter', 'entering', 'come', 'arrival'],
    'Getup': ['get', 'up', 'stand', 'rise', 'standing'],
    'Laydown': ['lay', 'down', 'rest', 'sleep', 'lying'],
    'Leave': ['leave', 'exit', 'go', 'depart'],
    'Makecoffee_Pourgrains': ['make', 'coffee', 'pour', 'grains'],
    'Makecoffee_Pourwater': ['make', 'coffee', 'pour', 'water'],
    'Maketea_Boilwater': ['make', 'tea', 'boil', 'water'],
    'Maketea_Insertteabag': ['make', 'tea', 'insert', 'teabag'],
    'Pour_Frombottle': ['pour', 'bottle', 'pouring', 'liquid'],
    'Pour_Fromcan': ['pour', 'can', 'pouring', 'soda'],
    'Pour_Fromkettle': ['pour', 'kettle', 'tea', 'pouring'],
    'Readbook': ['read', 'book', 'reading', 'story'],
    'Sitdown': ['sit', 'down', 'sitting', 'rest'],
    'Takepills': ['take', 'pills', 'medicine', 'tablet'],
    'Uselaptop': ['use', 'laptop', 'computer', 'typing'],
    'Usetablet': ['use', 'tablet', 'screen', 'device'],
    'Usetelephone': ['use', 'telephone', 'call', 'talk'],
    'Walk': ['walk', 'walking', 'step', 'move'],
    'WatchTV': ['watch', 'TV', 'television', 'viewing', 'screen']
}


In [10]:
result=not_done

In [11]:
result=['extracted_frames/'+s for s in result]
internVL_2f_caps=[]

In [8]:


print(f"pixel_values1 shape: {pixel_values1.shape}")
print(f"pixel_values2 shape: {pixel_values2.shape}")
print(f"pixel_values3 shape: {pixel_values3.shape}")

NameError: name 'pixel_values1' is not defined

### Generating for Thanh Data

In [12]:
gt=[]
used=[]
for subject in subjects:
        for act,activity in zip(acts,activities):
            for trial in trials:
                path=f'{subject}/{activity}/{trial}/Camera1/'
                images=get_image_files(f'{subject}/{activity}/{trial}/Camera1/')
                gt.append(f'{act}')
                used.append(path+'/'+sorted(images)[0])
                used.append(path+'/'+sorted(images)[20])
                used.append(path+'/'+sorted(images)[40])
used
import json 
with open('used_images_for_thanh.json','w') as f:
    json.dump(used,f)

In [5]:
import csv
import json
from PIL import Image
import torch
def get_class(element):
    return element.split('_p',1)[0].split('rames/',1)[1]

csv_file_path = 'ThanhActivity.csv'
counter=0

with open(csv_file_path, mode='a', newline='') as csvfile:#problem happened 384
    # Create the CSV writer
    csv_writer = csv.writer(csvfile)

    # If the file is empty, write the header
    if csvfile.tell() == 0:
        csv_writer.writerow(['Image Name', 'Generated Text'])
    try:
        import pandas as pd
        data =pd.read_csv('ThanhActivity.csv')
        counter=len(data)
    except:
        print('empty csv')
    
    for subject in subjects:
        for activity in activities:
            for trial in trials:
                images=sorted(get_image_files(f'{subject}/{activity}/{trial}/Camera1'))
                print(len(images))
                
                
                pixel_values1 = load_image(f'{subject}/{activity}/{trial}/Camera1/{images[5]}', max_num=12).to(torch.bfloat16).cuda()
                pixel_values2 = load_image(f'{subject}/{activity}/{trial}/Camera1/{images[35]}', max_num=12).to(torch.bfloat16).cuda()

                pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
                num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

                question =f'''Image-1: <image>\nImage-2: <image>\nYou are a monitoring assistant for patients ,you will be provided with 2 images and 11 activities.
                
                try to understand the difference chronologically between the images and then,
                
                You have to match the images to one of the following activities

                Activities:
                falling forward
                falling on knees
                falling backwards
                falling sideways
                falling from knees
                walking
                standing
                sitting
                picking up
                jumping
                lying
                
                find which is the best option that descibes the images and respond only with the name of the activity """
    
                '''
                response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                            num_patches_list=num_patches_list,
                                            history=None, return_history=True)
                generated_text=response


                # Append the result to CSV
                csv_writer.writerow([f'{subject}/{activity}/{trial}/Camera1', generated_text])


                print(f"{counter}/  {subject}/{activity}/{trial}/Camera1--  Generated Text: ", generated_text)
                counter += 1

182
0/  Subject15/Activity1/Trial1/Camera1--  Generated Text:  falling backwards
177
1/  Subject15/Activity1/Trial2/Camera1--  Generated Text:  flying forward
179
2/  Subject15/Activity1/Trial3/Camera1--  Generated Text:  walking
180
3/  Subject15/Activity2/Trial1/Camera1--  Generated Text:  falling sideways
175
4/  Subject15/Activity2/Trial2/Camera1--  Generated Text:  flying
175
5/  Subject15/Activity2/Trial3/Camera1--  Generated Text:  falling backwards
184
6/  Subject15/Activity3/Trial1/Camera1--  Generated Text:  falling backwards
180
7/  Subject15/Activity3/Trial2/Camera1--  Generated Text:  falling sideways
187
8/  Subject15/Activity3/Trial3/Camera1--  Generated Text:  The images depict a person who began standing and then transitions to lying on their back on the floor. This corresponds to the activity of falling backwards.
184
9/  Subject15/Activity4/Trial1/Camera1--  Generated Text:  falling sideways
174
10/  Subject15/Activity4/Trial2/Camera1--  Generated Text:  falling forw

In [None]:
with open('gpt_preds_thanh.json','r')as f:
    data=json.load(f)
import pandas as pd 
acts=['falling forward',
                'falling on knees',
                'falling backwards',
                'falling sideways',
                'falling from knees',
                'walking',
                'standing',
                'sitting',
                'picking up',
                'jumping',
                'lying']
df=pd.read_csv('ThanhActivity.csv')
df['gpt']=gt
df['ground truth']=gt


Unnamed: 0,Image Name,Generated Text,gpt,ground truth
0,Subject15/Activity1/Trial1/Camera1,falling backwards,falling forward,falling forward
1,Subject15/Activity1/Trial2/Camera1,flying forward,falling forward,falling forward
2,Subject15/Activity1/Trial3/Camera1,walking,falling forward,falling forward
3,Subject15/Activity2/Trial1/Camera1,falling sideways,falling on knees,falling on knees
4,Subject15/Activity2/Trial2/Camera1,flying,falling on knees,falling on knees
...,...,...,...,...
94,Subject17/Activity10/Trial2/Camera1,falling sideways,jumping,jumping
95,Subject17/Activity10/Trial3/Camera1,falling sideways,jumping,jumping
96,Subject17/Activity11/Trial1/Camera1,lying,lying,lying
97,Subject17/Activity11/Trial2/Camera1,lying,lying,lying


In [21]:
df.columns=['Image Name','internVL8B','gpt','ground truth']

In [24]:
df.to_csv('Eval_thanh.csv')

### Gpt for thanh


In [None]:
import base64
import requests
import json
import os

def encode_image(image_path):
  with open(image_path, "rb") as image_file:
    return base64.b64encode(image_file.read()).decode('utf-8')


def gpt(images):
  api_key = "******************************************"
  base64_image1 = encode_image(images[0])
  base64_image2 = encode_image(images[1])
  base64_image3 = encode_image(images[2])
  headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {api_key}"
  }
  payload2 = {
    "model": "gpt-4o-mini",
    "messages": [
  {
    "role": "user",
    "content": [
    {
          "type": "text",
          "text": f"""
                You are a monitoring assistant for patients ,you will be provided with 3 images and 11 activities.
                
                try to understand the difference chronologically between the images and then,
                
                You have to match the images to one of the following activities

                Activities:
                falling forward
                falling on knees
                falling backwards
                falling sideways
                falling from knees
                walking
                standing
                sitting
                picking up
                jumping
                lying
                
                find which is the best option that descibes the images and respond only with the name of the activity """
    },
    {
    "type": "image_url",
    "image_url": {"url": f"data:image/jpeg;base64,{base64_image1}",}
    } ,    
        {
    "type": "image_url",
    "image_url": {"url": f"data:image/jpeg;base64,{base64_image2}",}
    } ,   
        {
    "type": "image_url",
    "image_url": {"url": f"data:image/jpeg;base64,{base64_image3}",}
    } ,   
   ]
  }     
                ],
    "max_tokens": 1500
            }
      
  response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload2)
  gpt_output=response.json()['choices'][0]['message']['content']
  return gpt_output

In [70]:
response=[]
for i in range(0,len(used)-2,3):
    resp=gpt(used[i:i+3])
    response.append(resp)
    print(resp)

falling forward
falling backwards
falling backwards
falling backwards
falling sideways
falling sideways
lying
falling backwards
falling sideways
falling sideways
falling backwards
falling sideways
falling backwards
lying
lying
walking
walking
walking
standing
standing
standing
sitting
sitting
sitting
picking up
picking up
picking up
walking
walking
jumping
lying
sitting
walking
lying
falling forward
lying
lying
falling forwards
lying
lying
falling backwards
falling backwards
lying
lying
lying
lying
lying
lying
walking
walking
walking
standing
standing
standing
sitting
sitting
sitting
picking up
picking up
picking up
standing
standing
standing
lying
lying
lying
lying
lying
lying
lying
lying
lying
lying
lying
sitting
falling backwards
lying
lying
lying
lying
falling backwards
walking
walking
walking
standing
standing
standing
sitting
sitting
sitting
picking up
picking up
picking up
walking
standing
falling forwards
lying
lying
lying


In [71]:
with open('gpt_preds_thanh.json','w')as f :
    json.dump(response,f)

### Generating for Toyota

In [12]:
import csv
import json
from PIL import Image
import torch
def get_class(element):
    return element.split('_p',1)[0].split('rames/',1)[1]

csv_file_path = 'intern_toyota_cations_rest.csv'
l = len(result)
counter=0

with open(csv_file_path, mode='a', newline='') as csvfile:#problem happened 384
    # Create the CSV writer
    csv_writer = csv.writer(csvfile)

    # If the file is empty, write the header
    if csvfile.tell() == 0:
        csv_writer.writerow(['Image Name', 'Generated Text'])
    try:
        import pandas as pd
        data =pd.read_csv('intern_toyota_cations_rest.csv')
        counter=len(data)
    except:
        print('empty csv')

    for i in range(counter*2, len(result), 2):
        image=result[i]
        pixel_values1 = load_image(result[i+1], max_num=12).to(torch.bfloat16).cuda()
        pixel_values2 = load_image(result[i], max_num=12).to(torch.bfloat16).cuda()
        pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
        num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

        question =f'''Image-1: <image>\nImage-2: <image>\nYou are a monitoring assistant for old patients , you have to focus and describe what is the person doing from the pair of images you are provided with .
        
        respond only with one short sentence  
        Example :
        the person is washing a cup in the kitchen, 
        the person is eating a meal on the couch while watching TV, 
        the person is taking their pills,
        the person is pooring a drink from a bottle,
        the person is entering the room

        this is a hint on the activity in the image to help you generate a better description
        activity: {get_class(result[i])}
        '''
        response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                    num_patches_list=num_patches_list,
                                    history=None, return_history=True)
        generated_text=response


        # Append the result to CSV
        csv_writer.writerow([result[i], generated_text])

        internVL_2f_caps.append(f'{result[i]}--{generated_text}')

        print(f"{2*counter}/{l}  {image}  Generated Text: ", generated_text)
        counter += 1

692/10062  extracted_frames/Maketea_Insertteabag_p15_r00_v14_c07_0.jpg  Generated Text:  The person is stirring something in a cup near the kitchen counter.
694/10062  extracted_frames/Getup_p03_r03_v06_c05_0.jpg  Generated Text:  The person is standing in a room, possibly adjusting or picking up an item from a surface.
696/10062  extracted_frames/Getup_p14_r05_v10_c01_1.jpg  Generated Text:  The person is getting up from a table in a dining area.
698/10062  extracted_frames/Getup_p09_r03_v05_c05_1.jpg  Generated Text:  The person is getting up from the couch.
700/10062  extracted_frames/Walk_p11_r10_v13_c03_0.jpg  Generated Text:  The person is walking through the kitchen.
702/10062  extracted_frames/Drink_Fromcan_p03_r01_v18_c06_1.jpg  Generated Text:  the person is standing in the kitchen, drinking from a can
704/10062  extracted_frames/Walk_p13_r07_v21_c06_0.jpg  Generated Text:  the person is taking a step while entering the living room
706/10062  extracted_frames/Walk_p16_r33_v12

In [14]:



intern_toyota=[]
import pandas as pd
data =pd.read_csv('intern_toyota_cations.csv')

for i in range(0,len(data)):
    generation=f'{data['Image Name'][i]}--{data['Generated Text'][i]}'
    intern_toyota.append(generation)

import json
with open('intern_toyota_cations.json','w')as f:
    json.dump(intern_toyota,f)


### generating for video

In [14]:
internVL_vcaptions=[]
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    if bound:
        start, end = bound[0], bound[1]
    else:
        start, end = -100000, 100000
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    frame_indices = np.array([
        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
        for idx in range(num_segments)
    ])
    return frame_indices

def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())

    pixel_values_list, num_patches_list = [], []
    transform = build_transform(input_size=input_size)
    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(tile) for tile in img]
        pixel_values = torch.stack(pixel_values)
        num_patches_list.append(pixel_values.shape[0])
        pixel_values_list.append(pixel_values)
    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list


internVL_vcaptions=[]
for video in video_files :

    video_path = '/home/abid.abderrazek/InternVL/mp4/'+video
    pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
    pixel_values = pixel_values.to(torch.bfloat16).cuda()
    video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
    question = video_prefix + '''You are a monitoring assistant for old patients , you have to focus and describe what is the person doing from the pair of images you are provided with .
    
    respond only with one short sentence  
    Example :
    the person is washing a cup in the kitchen, 
    the person is eating a meal on the couch while watching TV, 
    the person is taking their pills,
    the person is pooring a drink from a bottle,
    the person is entering the room
    the person is sitting down
    '''
    # Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
    response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                                num_patches_list=num_patches_list, history=None, return_history=True)
    
    internVL_vcaptions.append(video+'#'+response)

    print(f'User: {get_class(video)}\nAssistant: {response}')


User: Maketea_Boilwater
Assistant: The person is stirring something in a pot on the stove using a black ladle.
User: Pour_Frombottle
Assistant: The person appears to be engaged in an activity involving a smartphone.
User: Drink_Frombottle
Assistant: The person is taking a drink from a bottle.
User: Enter
Assistant: The person is walking into the room and looking at a device.
User: Makecoffee_Pourwater
Assistant: The person is pouring a drink from a white jug in the kitchen.
User: Leave
Assistant: The person is walking out of the room.
User: Sitdown
Assistant: the person is sitting down
User: Walk
Assistant: The person is tidying up the living room, organizing items on a side table next to the couch.
User: Drink_Fromcup
Assistant: the person is eating a meal on the couch while watching TV.
User: Readbook
Assistant: The person is reading a magazine while sitting on a couch.
User: Drink_Frombottle
Assistant: The person is adjusting a camera on the wall.
User: Sitdown
Assistant: The person

In [15]:
import json 
with open('internvcaps_toyota.json','w') as f :
    json.dump(internVL_vcaptions,f)

In [None]:
question = 'Describe this video in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

In [None]:



# single-image multi-round conversation (单图多轮对话)
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, combined images (多图多轮对话，拼接图像)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, separate images (多图多轮对话，独立图像)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list,
                               history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# batch inference, single image per sample (单图批处理)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
                             num_patches_list=num_patches_list,
                             questions=questions,
                             generation_config=generation_config)
for question, response in zip(questions, responses):
    print(f'User: {question}\nAssistant: {response}')

# video multi-round conversation (视频多轮对话)
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    if bound:
        start, end = bound[0], bound[1]
    else:
        start, end = -100000, 100000
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    frame_indices = np.array([
        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
        for idx in range(num_segments)
    ])
    return frame_indices

def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())

    pixel_values_list, num_patches_list = [], []
    transform = build_transform(input_size=input_size)
    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(tile) for tile in img]
        pixel_values = torch.stack(pixel_values)
        num_patches_list.append(pixel_values.shape[0])
        pixel_values_list.append(pixel_values)
    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list

video_path = './examples/red-panda.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Describe this video in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')