This notebook converts videos into short descriptive text using the Gemini API (`gemini-2.5-flash` model).

## CONFIGURATIONS

In [None]:
# Settings shared by all cells below; edit these values before running the notebook.
CONFIG = {
    "gemini_api_key": "YOUR_GEMINI_API_KEY",  # Placeholder; replace with a real Gemini API key
    "set_proxy_net": True,  # When True, "net_proxy" is exported as the HTTP/HTTPS proxy
    "net_proxy": "http://127.0.0.1:7890",  # Proxy URL applied to both http_proxy and https_proxy
    "video_file_path": "data\\videos",  # Directory scanned for .mp4 files (note: Windows-style separators)
    "rate_limit_delay": 2,  # Delay between API calls to avoid rate limits (presumably seconds -- confirm)
    "max_wait_time": 180, # Maximum time, in seconds, to wait for video to be processed
    "save_path": "data\\metadata\\video_descriptions_en_short.json"  # Output JSON file: {video file name: description}
}

## Base setting

In [None]:
import os
import google.generativeai as genai

# Route traffic through the configured proxy (works around network connectivity problems).
if CONFIG["set_proxy_net"]:
    os.environ["https_proxy"] = CONFIG["net_proxy"]
    os.environ["http_proxy"] = CONFIG["net_proxy"]

# Resolve the API key. The untouched template placeholder counts as "not set",
# and the environment variable named in the instructions below is honoured as a
# fallback so that following those instructions actually works.
_API_KEY_PLACEHOLDER = "YOUR_GEMINI_API_KEY"
api_key = CONFIG["gemini_api_key"]
if not api_key or api_key == _API_KEY_PLACEHOLDER:
    # Environment variable names are case-sensitive on POSIX, so try both spellings.
    api_key = os.environ.get("google_api_key") or os.environ.get("GOOGLE_API_KEY") or ""

if not api_key:
    print("Please set your Gemini API key first!")
    print("\nPlease follow these steps to set up API key:")
    print("1. Visit https://makersuite.google.com/app/apikey")
    print("2. Create a new API key")
    print("3. Set the API key in your environment variables as 'google_api_key'")

# `model` is always created so later cells can reference it; without a valid key
# the API calls themselves will fail when they are made.
genai.configure(api_key=api_key)
model = genai.GenerativeModel('gemini-2.5-flash')
if api_key:
    print("Gemini Video Description Generator (EN) initialized successfully")

## Define video2text function

In [None]:
import time

def video2text(video_path):
    """Upload a video to Gemini and return a short visual description of it.

    The file is uploaded, polled until Gemini reports it as ``ACTIVE``, and
    then passed to the module-level ``model`` together with a fixed prompt.
    The uploaded remote copy is deleted again before returning.

    Args:
        video_path: Path to a local video file.

    Returns:
        The stripped description text on success. If Gemini reports the
        upload as ``FAILED``, or it does not become ``ACTIVE`` within
        ``CONFIG["max_wait_time"]`` seconds, a human-readable error string
        is returned instead of raising.

    Raises:
        Whatever ``os.path.getsize`` or the Gemini client raises (missing
        file, network/API errors, a response without text).
    """
    video_name = os.path.basename(video_path)

    # File size in MB, shown only for progress output.
    file_size = os.path.getsize(video_path) / (1024 * 1024)
    print(f"Uploading video {video_name} ({file_size:.1f}MB)...")

    # Upload the video file to Gemini.
    video_file = genai.upload_file(path=video_path)
    print(f"Video uploaded successfully: {video_file.uri}")

    try:
        print(f"Waiting for Google to process video file {video_name}...")
        max_wait_time = CONFIG["max_wait_time"]  # seconds
        # Pause between status checks so the loop does not hammer the API.
        poll_interval = CONFIG.get("rate_limit_delay", 2)
        # Monotonic clock: elapsed time is unaffected by system clock changes.
        wait_start = time.monotonic()

        while True:
            state = genai.get_file(name=video_file.name).state.name
            if state == "ACTIVE":
                print(f"Video file {video_name} processing completed")
                break
            if state == "FAILED":
                print(f"Video file {video_name} processing failed")
                return f"Video file processing failed: {state}"
            if time.monotonic() - wait_start > max_wait_time:
                print(f"Video file processing timeout ({max_wait_time}s)")
                return f"Video file processing timeout: exceeded {max_wait_time}s"
            time.sleep(poll_interval)

        prompt = """Generate a concise, one-sentence description for this video, as if you were writing alt-text for an image. The description should be purely visual and capture the main scene and action in under 70 words."""

        # Ask Gemini to generate the description.
        print(f"Analyzing video {video_name}...")
        response = model.generate_content([prompt, video_file])

        description = response.text.strip()
        print(f"Description generation completed for video {video_name}")
        return description
    finally:
        # Best-effort cleanup: the remote copy is not needed once this call
        # ends, and leaving it behind consumes Files API storage. A cleanup
        # failure must not mask the real result or exception.
        try:
            genai.delete_file(video_file.name)
        except Exception as exc:
            print(f"Warning: could not delete uploaded file {video_file.name}: {exc}")


## Loop over all the videos

In [None]:
from tqdm import tqdm
import json
from IPython.display import clear_output

# Maps each video file name to the text returned by video2text().
descriptions_dict = {}

# Create the output directory up front so a missing folder is caught before
# any API work is done, not after every video has been processed.
save_dir = os.path.dirname(CONFIG["save_path"])
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

# Select the videos first so the progress bar total counts only real work.
# The extension match is case-insensitive (".MP4" is accepted too), and the
# names are sorted so runs are reproducible regardless of directory order.
video_names = sorted(
    name
    for name in os.listdir(CONFIG["video_file_path"])
    if name.lower().endswith(".mp4")
)

for file in tqdm(video_names, desc="Processing videos"):
    video_file = os.path.join(CONFIG["video_file_path"], file)
    description = video2text(video_file)
    descriptions_dict[file] = description
    # Keep the notebook output short: drop the previous video's log lines.
    clear_output(wait=True)

with open(CONFIG["save_path"], 'w', encoding='utf-8') as json_file:
    json.dump(descriptions_dict, json_file, ensure_ascii=False, indent=4)