### *HunyuanVideo-PromptRewrite* Playground

In [36]:
# Cell 1: Core functionality
import requests
from typing import Optional, Dict, Union, Generator, List
import json
from dataclasses import dataclass

@dataclass
class GenerationConfig:
    """Sampling and debugging options that `generate` copies into the request body."""
    # Sent as "max_tokens" in the request.
    max_tokens: int = 512
    # Sent as "temperature" in the request.
    temperature: float = 0.0
    # Sent as "top_p" in the request.
    top_p: float = 1.0
    # Sent as "repetition_penalty" in the request.
    repetition_penalty: float = 1.0
    # Sent as "repetition_context_size" in the request.
    repetition_context_size: int = 20
    # When True, `generate` prints the prompt, the request and the response.
    debug: bool = False

def generate(
    prompt: str, config: GenerationConfig, stream: bool = False
) -> Union[Optional[str], Generator[str, None, None]]:
    """
    Generate text with given prompt and config.
    Prompt can contain raw special tokens.

    The prompt is sent as a single ``user`` message to the local
    ``/v1/chat/completions`` endpoint together with the sampling options
    taken from ``config``.

    Args:
        prompt: Text placed in the ``content`` of the user message.
        config: Sampling options; ``config.debug`` enables diagnostic prints.
        stream: If True, return a generator of content fragments; if False,
            perform a blocking request and return the full text.

    Returns:
        ``stream=True``: a generator yielding each non-empty ``delta.content``
        fragment as it arrives.
        ``stream=False``: the stripped content of the first choice, or ``None``
        when the request fails or the response has no choices.

    Request errors are reported with ``print`` and are not re-raised.
    """
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": config.max_tokens,
        "temperature": config.temperature,
        "top_p": config.top_p,
        "repetition_penalty": config.repetition_penalty,
        "repetition_context_size": config.repetition_context_size,
        "stream": stream
    }

    if config.debug:
        print("\nFull prompt:")
        print(prompt)
        print("\nRequest:")
        print(json.dumps(payload, indent=2))

    # This function must not contain `yield`: a `yield` anywhere in the body
    # would turn it into a generator function, and the non-streaming call
    # would then hand back an unstarted generator instead of the text.
    if stream:
        return _stream_completion(payload)
    return _fetch_completion(payload, debug=config.debug)


def _stream_completion(payload: Dict) -> Generator[str, None, None]:
    """Yield content fragments from a streamed chat-completions response."""
    try:
        # `with` releases the connection when the stream ends, on `break`,
        # or when the consumer abandons the generator (e.g. interrupt).
        with requests.post(
            "http://127.0.0.1:8080/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=True
        ) as response:
            response.raise_for_status()

            for raw_line in response.iter_lines():
                if not raw_line:
                    continue
                line = raw_line.decode()
                if not line.startswith("data:"):
                    continue
                body = line[len("data:"):].strip()
                if body == "[DONE]":
                    break
                event = json.loads(body)
                choices = event.get("choices") or []
                if not choices:
                    continue
                content = (choices[0].get("delta") or {}).get("content")
                if content:
                    yield content
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")


def _fetch_completion(payload: Dict, debug: bool = False) -> Optional[str]:
    """Send a blocking chat-completions request and return the first choice's text."""
    try:
        response = requests.post(
            "http://127.0.0.1:8080/v1/chat/completions",
            headers={"Content-Type": "application/json"},
            json=payload,
            stream=False
        )
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        return None

    if debug:
        print("\nResponse:")
        print(json.dumps(result, indent=2))

    if 'choices' in result and result['choices']:
        return result["choices"][0]["message"]["content"].strip()
    return None

# Convenience wrapper: calls generate() and prints whatever it produces
def run_generation(prompt: str, config: GenerationConfig, stream: bool = False):
    """Call `generate` and print its output, chunk by chunk when streaming."""
    if not stream:
        print(generate(prompt, config, stream=False))
        return

    for piece in generate(prompt, config, stream=True):
        # No newline between chunks so the text reads as one continuous line.
        print(piece, end="", flush=True)
    print()  # terminate the streamed line

*Inference Run*

In [54]:
# Cell 2: Prompt Templates and Mode Selection

# The correct prompt templates

# "Normal mode" rewrite prompt. The instruction text sits under a `system`
# header and the string is closed with <|eos|>. `{input_text}` is a
# str.format() placeholder for the description to rewrite.
normal_template = """<|startoftext|><|start_header_id|>system<|end_header_id|>

Normal mode - Video Recaption Task:

You are a large language model specialized in rewriting video descriptions. Your task is to modify the input description.

0. Preserve ALL information, including style words and technical terms.

1. If the input is in Chinese, translate the entire description to English. 

2. If the input is just one or two words describing an object or person, provide a brief, simple description focusing on basic visual characteristics. Limit the description to 1-2 short sentences.

3. If the input does not include style, lighting, atmosphere, you can make reasonable associations.

4. Output ALL must be in English.

Given Input:
input: "{input_text}"<|eos|>"""

# "Master mode" rewrite prompt. `{input_text}` is a str.format() placeholder
# for the description to rewrite; the string is closed with <|eos|>.
# NOTE(review): this template opens with a `user` header while
# normal_template opens with a `system` header - confirm the difference is
# intentional.
master_template = """<|startoftext|><|start_header_id|>user<|end_header_id|>

Master mode - Video Recaption Task:

You are a large language model specialized in rewriting video descriptions. Your task is to modify the input description.

0. Preserve ALL information, including style words and technical terms.

1. If the input is in Chinese, translate the entire description to English. 

2. If the input is just one or two words describing an object or person, provide a brief, simple description focusing on basic visual characteristics. Limit the description to 1-2 short sentences.

3. If the input does not include style, lighting, atmosphere, you can make reasonable associations.

4. Output ALL must be in English.

Given Input:
input: "{input_text}"<|eos|>"""


# Image-description prompt: a system turn with the instruction, then an
# empty user turn, each closed with <|eos|>.
PROMPT_TEMPLATE_ENCODE = "".join([
    # system turn
    "<|start_header_id|>system<|end_header_id|>\n\n",
    "Describe the image by detailing the color, shape, size, texture, quantity, text, "
    "spatial relationships of the objects and background:",
    "<|eos|>",
    # empty user turn
    "<|start_header_id|>user<|end_header_id|>\n\n",
    "<|eos|>\n\n",
])
# Video-description prompt: a system turn listing five numbered aspects, then
# an empty user turn. The numbered items are joined with no separator, so the
# resulting text reads "...video.2. The color..." - this is kept as-is.
PROMPT_TEMPLATE_ENCODE_VIDEO = "".join([
    # system turn
    "<|start_header_id|>system<|end_header_id|>\n\n",
    "Describe the video by detailing the following aspects: ",
    "1. The main content and theme of the video.",
    "2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.",
    "3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.",
    "4. background environment, light, style and atmosphere.",
    "5. camera angles, movements, and transitions used in the video:",
    "<|eos|>\n",
    # empty user turn
    "<|start_header_id|>user<|end_header_id|>\n\n",
    "<|eos|>",
])

# Generation presets: a lower-temperature one and a higher-temperature one.
# Both enable debug printing and cap the output at 256 tokens.
normal_config = GenerationConfig(
    max_tokens=256,
    debug=True,
    temperature=0.2,
    top_p=0.9,
    repetition_penalty=1.0,
)

master_config = GenerationConfig(
    max_tokens=256,
    debug=True,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
)

In [None]:
# Example 1: fill normal_template with an input and stream the result
print("Using normal template:")
input_text = "shrek dancing"
prompt = normal_template.format(input_text=input_text)
run_generation(prompt, normal_config, stream=True)

In [None]:
# Send only the raw user input; generate() wraps it in a single user message.
# GenerationConfig has no `use_default_template` field and run_generation has
# no `system_prompt` parameter, so only options that exist are used here.
# TODO: passing a custom system prompt would require generate()/run_generation()
# to accept one.
template_config = GenerationConfig(debug=True)
run_generation(
    "shrek dancing",  # Just the user input
    template_config,
)

In [None]:
# Example with streaming
print("\nTesting streaming with master mode:")
# Build the prompt from master_template; this notebook defines no
# get_rewrite_prompt helper, so the template is formatted directly.
stream_prompt = master_template.format(input_text="drone shot cityscape")
run_generation(stream_prompt, master_config, stream=True)

In [None]:
# Example 2: Custom raw prompt with special tokens
# The f-prefix is required so PROMPT_TEMPLATE_ENCODE_VIDEO is interpolated;
# without it the literal text "{PROMPT_TEMPLATE_ENCODE_VIDEO}" is sent.
custom_prompt = f"""{PROMPT_TEMPLATE_ENCODE_VIDEO}
Describe a scene: shrek dancing
<|eot_id|>"""

print("\nUsing custom prompt:")
run_generation(custom_prompt, master_config, stream=True)

In [55]:
# Example 2: Custom raw prompt with special tokens
# Prefixes PROMPT_TEMPLATE_ENCODE_VIDEO to the master-mode instruction text
# (the same wording as in master_template) with a hard-coded input, then
# streams the result using master_config.
# The recorded run of this cell ends in KeyboardInterrupt, so its saved
# output is incomplete.
custom_prompt = f"""{PROMPT_TEMPLATE_ENCODE_VIDEO}
Master mode - Video Recaption Task:

You are a large language model specialized in rewriting video descriptions. Your task is to modify the input description.

0. Preserve ALL information, including style words and technical terms.

1. If the input is in Chinese, translate the entire description to English. 

2. If the input is just one or two words describing an object or person, provide a brief, simple description focusing on basic visual characteristics. Limit the description to 1-2 short sentences.

3. If the input does not include style, lighting, atmosphere, you can make reasonable associations.

4. Output ALL must be in English.

Given Input:
input: "shrek dancing, wide angle shot, directed by ridley scott"<|eos|>"""

print("\nUsing custom prompt:")
run_generation(custom_prompt, master_config, stream=True)


Using custom prompt:

Full prompt:
<|start_header_id|>system<|end_header_id|>

Describe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eos|>
<|start_header_id|>user<|end_header_id|>

<|eos|>
Master mode - Video Recaption Task:

You are a large language model specialized in rewriting video descriptions. Your task is to modify the input description.

0. Preserve ALL information, including style words and technical terms.

1. If the input is in Chinese, translate the entire description to English. 

2. If the input is just one or two words describing an object or person, provide a brief, simple description focusing on basic visual characteristi

KeyboardInterrupt: 

In [None]:
# Example 2: Custom raw prompt with special tokens
# Wraps PROMPT_TEMPLATE_ENCODE_VIDEO in an explicit system turn followed by a
# user turn, then streams the result using master_config.
# NOTE(review): PROMPT_TEMPLATE_ENCODE_VIDEO itself already starts with a
# system header and ends with an empty user turn, so the assembled prompt
# contains the system header twice and two user turns - confirm this is the
# intended layout.
custom_prompt = f"""<|startoftext|><|start_header_id|>system<|end_header_id|>
{PROMPT_TEMPLATE_ENCODE_VIDEO}
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Describe a scene: shrek dancing
<|eot_id|>"""

print("\nUsing custom prompt:")
run_generation(custom_prompt, master_config, stream=True)

In [None]:
# Use master prompt with normal config
# The prompt is built from master_template; this notebook defines no
# get_rewrite_prompt helper.
prompt = master_template.format(input_text="beach sunset")
run_generation(prompt, normal_config)

In [None]:
# master_template already begins with <|startoftext|> and ends with <|eos|>,
# so formatting it yields the token-wrapped prompt directly (this notebook
# defines no get_rewrite_prompt helper).
prompt_with_tokens = master_template.format(input_text="mountain view")

In [None]:
# Skeleton for a fully custom prompt: replace the two placeholder lines with
# your own system and user text, then run it with normal_config.
# NOTE(review): turns here end with <|eot_id|>, whereas normal_template and
# master_template end with <|eos|> - confirm which terminator the served
# model expects.
custom_prompt = """<|startoftext|><|start_header_id|>system<|end_header_id|>
Your custom system prompt here
<|eot_id|>
<|start_header_id|>user<|end_header_id|>
Your custom user prompt here
<|eot_id|>"""

run_generation(custom_prompt, normal_config)

In [None]:
# Prompts are built from the notebook's own templates; this notebook defines
# no get_rewrite_prompt helper.

# Normal mode
prompt = normal_template.format(input_text="sunset beach")
run_generation(prompt, normal_config)

# Master mode
prompt = master_template.format(input_text="city lights")
run_generation(prompt, master_config)

In [None]:
# Example of a hand-tuned config; fields not listed keep their defaults.
custom_config = GenerationConfig(debug=True, temperature=0.5, top_p=0.8)

