In [6]:
import base64
import os
import re

import ollama

In [7]:
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a Base64 string.

    The file is read in binary mode and the Base64 bytes are decoded as UTF-8,
    so the result is a plain ``str``.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

In [8]:
# image_path = r"C:\Users\LAKSHYA\OneDrive\Pictures\Camera Roll\WIN_20250614_02_46_47_Pro.jpg"
# image_base64 = encode_image(image_path)
# print(image_base64[:100]) 

In [9]:
# Shared store of Base64-encoded images: put_image() appends to it and the
# model-calling functions read from it.
image_list = []

In [10]:
def put_image():
    """Ask for an image path, Base64-encode the file and add it to ``image_list``.

    The user is re-prompted until they either give a path to a readable file
    or submit an empty line to skip adding an image.

    Returns:
        list: The module-level ``image_list`` (unchanged when the user skips).
    """
    global image_list

    # Loop rather than recurse so repeated typos cannot exhaust the call stack.
    while True:
        user_input_image = input("Enter image path or press enter to skip: ").strip()

        if not user_input_image:
            print("No image inserted")
            return image_list

        # Paths pasted from a file manager are often wrapped in quotes.
        image_path = os.path.normpath(user_input_image.strip("\"'"))

        # isfile() instead of exists(): a directory exists but cannot be opened
        # as an image, so reject it here instead of crashing in open().
        if not os.path.isfile(image_path):
            print("Image path not found! Try again or enter to leave blank")
            continue

        try:
            image_base64 = encode_image(image_path)
        except OSError as exc:
            # e.g. permission denied or file locked by another process.
            print(f"Could not read image: {exc}")
            continue

        image_list.append(image_base64)
        return image_list


In [11]:
# Running prompt text for the LLaVA description flow. It starts as the
# instruction text below; put_prompt() appends each user request to it.
prompt=  ("System prompt: (You are a compassionate and intelligent visual assistant designed to help people who are blind or visually impaired. "
    "Your job is to look at an image and describe it in a way that helps the user understand the scene clearly. "
    "Use simple, descriptive language and avoid technical terms. Describe what is happening in the image, people's body language, clothing, facial expressions, objects, and surroundings. "
    "Be vivid and precise, as if you are painting a picture with words. "
    "Also, take into account any personal instructions or questions provided by the user—such as describing a specific person, activity, or object. "
    "If the user includes a specific prompt, prioritize that in your description.)")


In [12]:
def put_prompt():
    """Read a non-empty request from the user and append it to the global ``prompt``.

    The user is re-prompted until they type something other than whitespace.
    The request is appended to ``prompt`` as a ``"\\nUser: ..."`` line.

    Returns:
        str: The full, updated ``prompt`` (instructions plus all turns so far).
    """
    global prompt

    # Loop rather than recurse so holding Enter cannot exhaust the call stack.
    while True:
        user_input = input("Put new prompt: ").strip()
        if user_input:
            break
        print("please enter a prompt")

    prompt += "\nUser: " + user_input
    return prompt


In [12]:
def image_description():
    """Run one describe-the-image round with LLaVA and stream the answer.

    Asks for an image (optional) and a request, sends the accumulated prompt
    together with every image in ``image_list`` to the model, and prints the
    response as it streams in.

    Returns:
        str: The complete model answer, or a skip notice when ``image_list``
        is empty.
    """
    global prompt

    put_image()
    if not image_list:
        return "No images available. Skipping..."

    # put_prompt() has already appended the user's turn to the global prompt
    # and returns the whole accumulated text.
    user_prompt = put_prompt()
    answer_parts = []

    for chunk in ollama.generate(
        model='llava:7b-v1.6',
        prompt=user_prompt,
        images=image_list,
        stream=True
    ):
        content = chunk.get("response", "")
        # Stream only the new text; a label on every chunk would interleave
        # the heading with the answer.
        print(content, end="", flush=True)
        answer_parts.append(content)
    print()

    full_answer = "".join(answer_parts)

    # Record only the assistant turn: the user turn is already in `prompt`,
    # and re-appending `user_prompt` (the entire prompt) would duplicate the
    # whole history on every round.
    prompt += "\nAssistant: " + full_answer
    return full_answer


In [15]:
def call_llava(iterations=5):
    """Run several interactive image-description rounds in a row.

    ``image_list`` is emptied once at the start; images added during the
    rounds accumulate and are all sent to the model in later rounds.

    Args:
        iterations: Number of rounds to run (defaults to 5).
    """
    image_list.clear()
    for round_number in range(1, iterations + 1):
        print(f"\n Iteration {round_number}")
        answer = image_description()
        print("\n\n Final Answer:", answer)
    


In [None]:
# Start the interactive description session (waits on input() for each round).
call_llava()

# Week 2 practice on the personal project: making the model faster and smarter by using tools.


In [32]:
# Shared chat history passed to ollama.chat(); each entry is a dict with at
# least "role" and "content" keys.
messages = []


In [20]:
# System message for the tool-calling chat flow. It tells the model to answer
# directly, or to reply with a "TOOL_CALL: analyze_image(...)" line (parsed by
# process_response) when an image is needed.
system_content = (
    "You are a helpful assistant for visually impaired users. "
    "You are capable of answering questions directly or calling a function to analyze an image if needed. "
    "There is a list of images available, indexed from 0. "
    "When a user asks a question, first determine whether any image in the list is needed to answer. "
    "If yes, reply in this structured format:\n\n"
    "TOOL_CALL: analyze_image(<image_index_or_range>, prompt='<description_request>')\n\n"
    "If image is not needed, just answer the user directly in plain natural language.\n"
    "Be clear and use descriptive but accessible language suitable for blind users."
)

In [4]:
# Register the tool-calling system prompt exactly once. Re-running this cell
# must not stack duplicate system messages in the shared history.
if not any(
    m.get("role") == "system" and m.get("content") == system_content
    for m in messages
):
    messages.insert(0, {"role": "system", "content": system_content})

In [30]:
def chat_loop():
    """Main chat interaction loop (single-turn version).

    Optionally loads one image, tells the model how many images are available,
    asks the user for a question, sends the shared ``messages`` history to the
    model and prints the (possibly tool-augmented) reply. Errors raised while
    calling the model or processing its reply are printed, not raised.
    """
    global image_list, messages

    print("\n" + "="*50)
    print("LLaVA Assistant for Visually Impaired Users")
    print("="*50 + "\n")

    # Step 1: Load images
    print("Step 1: Add images (optional)")
    put_image()
    image_count = len(image_list)
    if image_count:
        availability = (
            f"There are {image_count} images available (index 0-{image_count - 1})."
        )
    else:
        # Avoid the nonsensical "index 0--1" range when nothing was loaded.
        availability = "There are 0 images available, so no image can be analyzed."
    messages.append({"role": "system", "content": availability})

    # Step 2: Single chat interaction
    print("\nStep 2: Ask a question about the images")
    # put_prompt() returns the whole accumulated description prompt; only the
    # text after the final "\nUser: " marker is the question just typed.
    # input() is single-line, so that marker cannot occur inside the question.
    user_content = put_prompt().rsplit("\nUser: ", 1)[-1]
    messages.append({"role": "user", "content": user_content})

    # Get model response
    try:
        response = ollama.chat(
            model='llava:7b-v1.6',
            messages=messages
        )["message"]["content"]
        print("assistant: ",response)
        processed_response = process_response(response)
        print(f"\nASSISTANT: {processed_response}\n")

    except Exception as e:
        # Keep the interactive session alive, but surface what went wrong.
        print(f"Error occurred: {e}")

    print("\nSession ended. Goodbye!")

In [29]:
def process_response(response):
    """Process the model's response and handle tool calls.

    A reply that starts with ``TOOL_CALL:`` is parsed as
    ``analyze_image(<index or start:end>, prompt='<text>')``; the requested
    image(s) are analysed and the result is stored in ``messages``. Any other
    reply is stored as an assistant turn and returned unchanged.

    Args:
        response: Raw text returned by the chat model.

    Returns:
        str: The plain reply, the formatted analysis result, or an error
        message (error messages are also appended to ``messages``).
    """
    if not response.strip().startswith("TOOL_CALL:"):
        messages.append({"role": "assistant", "content": response})
        return response

    # Accept either quote style around the prompt; \2 requires the closing
    # quote to match the opening one.
    pattern = r'''TOOL_CALL:\s*analyze_image\((.*?)\s*,\s*prompt=(['"])(.*?)\2\)'''
    match = re.search(pattern, response, re.DOTALL)
    if not match:
        return _record_assistant_error("Error: Invalid TOOL_CALL format.")

    image_expr = match.group(1).strip()
    # Named differently from the module-level `prompt` to avoid shadowing it.
    analysis_request = match.group(3).strip()

    try:
        # Handle different index formats
        if ":" in image_expr:  # Range (e.g., "1:3"), end exclusive
            start, end = map(int, image_expr.split(":"))
            index_or_range = list(range(start, end))
            requested = index_or_range
        else:  # Single index
            index_or_range = int(image_expr)
            requested = [index_or_range]

        # Validate indices
        if not image_list:
            return _record_assistant_error(
                "Error: No images are available to analyze."
            )
        max_index = len(image_list) - 1
        if any(i < 0 or i > max_index for i in requested):
            return _record_assistant_error(
                f"Error: Image index out of range (0-{max_index})."
            )

        # Perform analysis
        result = analyze_image(index_or_range, analysis_request)
        print("funtion called")
        # Ollama's chat roles are system/user/assistant/tool, so the tool
        # output is stored under "tool".
        messages.append({
            "role": "tool",
            "name": "analyze_image",
            "content": result
        })

        # Return formatted result
        return f"\nIMAGE ANALYSIS RESULT:\n{result}"

    except Exception as e:
        # Covers malformed indices (ValueError from int()) and anything
        # unexpected raised while analysing.
        return _record_assistant_error(f"Error processing TOOL_CALL: {e}")


def _record_assistant_error(error_msg):
    """Append ``error_msg`` to ``messages`` as an assistant turn and return it."""
    messages.append({"role": "assistant", "content": error_msg})
    return error_msg

In [23]:
def analyze_image(index_or_range, prompt):
    """Analyze specific image(s) from ``image_list`` using LLaVA.

    Args:
        index_or_range: A single ``int`` index, or a list/tuple/range of
            indices into the module-level ``image_list``.
        prompt: The user's specific request, appended to a fixed description
            instruction.

    Returns:
        str: The model's description, or an explanatory message when the
        selection is invalid or empty, or the model call fails. This function
        reports problems through its return value and does not raise for them.
    """
    # Normalise the selection to a list of indices.
    if isinstance(index_or_range, int):
        indices = [index_or_range]
    elif isinstance(index_or_range, (list, tuple, range)):
        indices = list(index_or_range)
    else:
        return "Invalid image index/range specified."

    if not indices:
        return "No images available for analysis."

    try:
        images = [image_list[i] for i in indices]
    except (IndexError, TypeError):
        # Out-of-range or non-integer index: report it like the other
        # failure paths instead of raising to the caller.
        return "Invalid image index/range specified."

    full_prompt = (
        "Describe the image clearly for a visually impaired user. "
        "Be detailed about objects, people, colors, spatial relationships, "
        "and any important context. "
        f"User's specific request: {prompt}"
    )

    try:
        return "".join(
            chunk.get('response', "")
            for chunk in ollama.generate(
                model='llava:7b-v1.6',
                prompt=full_prompt,
                images=images,
                stream=True
            )
        )
    except Exception as e:
        return f"Error analyzing image: {e}"


In [None]:
# Start from an empty image store. The call parentheses matter: a bare
# `image_list.clear` only references the method and clears nothing.
image_list.clear()
for _ in range(5):
    chat_loop()


LLaVA Assistant for Visually Impaired Users

Step 1: Add images (optional)


Enter image path or press enter to skip:  C:\Users\LAKSHYA\OneDrive\Pictures\Camera Roll\WIN_20250614_02_46_47_Pro.jpg



Step 2: Ask a question about the images


Put new prompt:  descibe this image


assistant:   I'm sorry, but there are no images available for me to describe. Can you please provide the image or let me know which image you would like me to describe? 

ASSISTANT:  I'm sorry, but there are no images available for me to describe. Can you please provide the image or let me know which image you would like me to describe? 


Session ended. Goodbye!

LLaVA Assistant for Visually Impaired Users

Step 1: Add images (optional)
