In [199]:
import pandas as pd
import re
import requests
import time
import os
import json
from tqdm import tqdm
from groq import Groq
import base64
import pathlib

import warnings
warnings.filterwarnings("ignore")

from urllib3.exceptions import InsecureRequestWarning
warnings.filterwarnings('ignore', category=InsecureRequestWarning)

from dotenv import load_dotenv

In [200]:
# Base output directory for this notebook's llama-3.2 runs.
# Created up front; parents=True builds missing parent folders and
# exist_ok=True makes re-running this cell a no-op.
OUTPUT_DIR = pathlib.Path('Data/llama-3.2')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [201]:
def encode_image_to_base64(image_url, timeout=30):
    """Download an image and return its raw bytes as a base64 string.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled host.

    Returns:
        The response body encoded as base64 and decoded to ``str``.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            non-success HTTP status (raised by ``raise_for_status``).
    """
    response = requests.get(
        image_url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    response.raise_for_status()
    return base64.b64encode(response.content).decode('utf-8')

## Chain-of-Thought

In [202]:
# def calculate_image_features_title(image_url):
#     try:
#         # Get base64 encoded image
#         base64_image = encode_image_to_base64(image_url)

#         instruction = '''
# Reasoning Process for Title Generation:

# 1. Initial Observation
# - What is immediately visible in the screenshot?
# - What IDE/tool is being used or what kind of code is shown?
# - Are there any error messages or unusual indicators?

# 2. Problem Identification
# - What seems to be the main issue?
# - Which specific components are involved?
# - Is this a configuration, syntax, or runtime issue?

# 3. Title Formulation
# Based on the above analysis, construct a title that:
# - Clearly summarizes the main technical issue
# - Uses relevant technical keywords
# - Is concise and specific
# - Takes inspiration from your trained data on stack overflow
# - Would be easily searchable

# Output Format: 
# TITLE: <<Generated Title>>
# Make sure to start with exactly "TITLE:"
# '''

#         messages = [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": instruction
#                     },
#                     {
#                         "type": "image_url",
#                         "image_url": {
#                             "url": f"data:image/jpeg;base64,{base64_image}"
#                         }
#                     }
#                 ]
#             }
#         ]

#         # Initialize Groq client
#         groq_api_key = os.getenv('GROQ_API_KEY')
#         client = Groq(api_key=groq_api_key)

#         # Make API call
#         chat_completion = client.chat.completions.create(
#             messages=messages,
#             model="llama-3.2-90b-vision-preview",
#             temperature=0,
#             max_tokens=2048
#         )

#         # Extract title from response
#         generation_prompt = chat_completion.choices[0].message.content
#         title = ""
#         if "TITLE:" in generation_prompt:
#             title = generation_prompt.split("TITLE:")[1].strip()

#         print(title)
#         return title
    
#     except Exception as e:
#         print(e)
#         return ""

# def calculate_image_features_body(image_url):
#     try:
#         # Get base64 encoded image
#         base64_image = encode_image_to_base64(image_url)

#         instruction = '''
# Reasoning Process for Body Generation:

# 1. Initial Observation
# - What is immediately visible in the screenshot?
# - What IDE/tool is being used or what kind of code is shown?
# - Are there any error messages or unusual indicators?

# 2. Problem Identification
# - What seems to be the main issue?
# - Which specific components are involved?
# - Is this a configuration, syntax, or runtime issue?

# 3. Context Building
# - What background or programming information is needed to understand this issue?
# - Which framework/language versions are relevant?
# - What might have led to this situation?

# 4. Solution Attempts Analysis
# - What obvious solutions might have been tried?
# - What documentation might be relevant?
# - What troubleshooting steps would make sense?

# 5. Question Body Formulation
# Based on the above analysis, construct a detailed body that:
# - Clearly explains the context and problem
# - Includes all relevant technical details
# - Takes inspiration from your trained data on stack overflow
# - Shows research effort and attempted solutions
# - Is specific and answerable

# Output Format: 
# BODY: <<Generated Body>>
# Make sure to start with exactly "BODY:"
# '''

#         messages = [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": instruction
#                     },
#                     {
#                         "type": "image_url",
#                         "image_url": {
#                             "url": f"data:image/jpeg;base64,{base64_image}"
#                         }
#                     }
#                 ]
#             }
#         ]

#         # Initialize Groq client
#         groq_api_key = os.getenv('GROQ_API_KEY')
#         client = Groq(api_key=groq_api_key)

#         # Make API call
#         chat_completion = client.chat.completions.create(
#             messages=messages,
#             model="llama-3.2-90b-vision-preview",
#             temperature=0,
#             max_tokens=2048
#         )

#         # Extract body from response
#         generation_prompt = chat_completion.choices[0].message.content
#         body = ""
#         if "BODY:" in generation_prompt:
#             body = generation_prompt.split("BODY:")[1].strip()

#         print(body)
#         return body
    
#     except Exception as e:
#         print(e)
#         return ""

# def process_rows(dataframe):
#     total_rows_processed = 0
#     results = []  # List to store results
    
#     # Create log file for tracking progress
#     log_file = OUTPUT_DIR / 'processing_log.txt'
#     with open(log_file, 'w') as f:
#         f.write(f"Processing started at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
    
#     for i, row in tqdm(dataframe.iterrows(), total=dataframe.shape[0], desc=f'Processing'):
#         print("\n" + "="*50)
#         print(f"Processing Row ID: {row['Id']}")
        
#         # Log progress
#         with open(log_file, 'a') as f:
#             f.write(f"\nProcessing Row ID: {row['Id']} at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        
#         image_urls = [url.strip(",") for url in re.findall(r"'([^']*)'", str(row.get('ImageURLs')))]
#         title = row['Title'] 
#         body = row['Body']
        
#         # Store LLM responses for all images of this row
#         title_responses = []
#         body_responses = []
        
#         for image_url in image_urls:
#             print(f"\nProcessing image {len(title_responses) + 1}/{len(image_urls)}")
            
#             # Generate title
#             print("Generating title...")
#             title_text = calculate_image_features_title(image_url)
#             title_responses.append(title_text)
            
#             # Generate body
#             print("Generating body...")
#             body_text = calculate_image_features_body(image_url)
#             body_responses.append(body_text)
            
#             # Log responses
#             with open(log_file, 'a') as f:
#                 f.write(f"Processed image {len(title_responses)}/{len(image_urls)}\n")
#                 f.write(f"Title response: {title_text}\n")
#                 f.write(f"Body response: {body_text}\n")
        
#         # Combine all responses into strings
#         combined_title_response = " ||| ".join(title_responses)
#         combined_body_response = " ||| ".join(body_responses)
        
#         # Store the results
#         results.append({
#             'Id': row['Id'],
#             'Title': title,
#             'Body': body,
#             'ImageURLs': row['ImageURLs'],
#             'llm_title_response': combined_title_response,
#             'llm_body_response': combined_body_response
#         })
        
#         total_rows_processed += 1
#         print(f"Completed processing Row ID: {row['Id']}")
#         print("="*50 + "\n")
        
#         # Save intermediate results every 2 rows without waiting
#         if total_rows_processed % 2 == 0:
#             print("\nSaving intermediate results...")
            
#             # Save intermediate results
#             intermediate_df = pd.DataFrame(results)
#             intermediate_file = OUTPUT_DIR / f'intermediate_results_{time.strftime("%Y%m%d_%H%M%S")}.csv'
#             intermediate_df.to_csv(intermediate_file, index=False)
#             print(f"Saved intermediate results to {intermediate_file}")
            
#             # Log intermediate save
#             with open(log_file, 'a') as f:
#                 f.write(f"\nSaved intermediate results after {total_rows_processed} rows at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
    
#     # Create final DataFrame and save to CSV
#     final_df = pd.DataFrame(results)
#     final_file = OUTPUT_DIR / f'llm_responses_final_chain_of_thoughts.csv'
#     final_df.to_csv(final_file, index=False)
#     print(f"\nFinal results saved to {final_file}")
    
#     # Log completion
#     with open(log_file, 'a') as f:
#         f.write(f"\nProcessing completed at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
#         f.write(f"Total rows processed: {total_rows_processed}\n")
    
#     return final_df

# # Load dataset
# load_dotenv()
# dataset = pd.read_csv('Data/filtered_data_matched.csv')

# # Process the dataset
# results_df = process_rows(dataset)

## Few-Shot

In [None]:
import pathlib
import pandas as pd
import time
from tqdm import tqdm
import re
import os
from groq import Groq
from dotenv import load_dotenv
from PIL import Image
import io
import base64
import matplotlib.pyplot as plt
import io
import base64
from PIL import Image, ImageDraw, ImageFont
import matplotlib.image as mpimg
import requests

# Create output directory structure
OUTPUT_DIR = pathlib.Path('Data/llama-3.2/few-shot')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def resize_image(image, max_dimension):
    """Shrink an image so neither side exceeds ``max_dimension``.

    Palette ("P") images are first converted to RGBA when they carry a
    ``transparency`` entry in ``image.info``, otherwise to RGB. Images that
    already fit are returned without resizing (but still converted if they
    were palette images).

    Args:
        image: A PIL image (anything exposing ``size``, ``mode``, ``info``,
            ``convert`` and ``resize``).
        max_dimension: Largest allowed width or height in pixels.

    Returns:
        The converted and/or resized image, aspect ratio preserved.
    """
    width, height = image.size

    if image.mode == "P":
        target_mode = "RGBA" if "transparency" in image.info else "RGB"
        image = image.convert(target_mode)

    if width > max_dimension or height > max_dimension:
        # Multiply before dividing so exact ratios are not truncated by float
        # rounding, and clamp to 1 px: very elongated images would otherwise
        # yield a 0-pixel side, which resize() rejects.
        if width > height:
            new_width = max_dimension
            new_height = max(1, int(height * max_dimension / width))
        else:
            new_height = max_dimension
            new_width = max(1, int(width * max_dimension / height))
        image = image.resize((new_width, new_height), Image.LANCZOS)

    return image

def convert_to_png(image):
    """Serialize ``image`` as PNG and return the encoded bytes."""
    buffer = io.BytesIO()
    try:
        image.save(buffer, format="PNG")
        png_bytes = buffer.getvalue()
    finally:
        # Release the in-memory buffer whether or not encoding succeeded
        buffer.close()
    return png_bytes

def download_image(url, timeout=30):
    """Fetch an image over HTTP and return its bytes in a seekable buffer.

    Best-effort: any failure is printed and reported as ``None`` so callers
    can skip the image instead of aborting the whole run.

    Args:
        url: URL of the image to download.
        timeout: Seconds to wait for the server; prevents a stalled host from
            blocking the run indefinitely.

    Returns:
        ``io.BytesIO`` positioned at the start of the downloaded bytes, or
        ``None`` if the download failed.
    """
    try:
        # Same User-Agent header that encode_image_to_base64 sends, so both
        # download paths in this notebook behave alike.
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
        response.raise_for_status()
        return io.BytesIO(response.content)
    except Exception as e:
        print(f"Error downloading image from {url}: {e}")
        return None

def process_image(url, max_size):
    """Download an image and return it base64-encoded, normalised to PNG.

    A PNG that already fits within ``max_size`` on both sides is passed
    through untouched; anything else goes through ``resize_image`` and
    ``convert_to_png``. Returns ``None`` when the download or any processing
    step fails (the error is printed).
    """
    try:
        raw_stream = download_image(url)
        if not raw_stream:
            return None

        with Image.open(raw_stream) as image:
            width, height = image.size
            is_png = image.get_format_mimetype() == "image/png"
            fits = width <= max_size and height <= max_size

            if is_png and fits:
                # Reuse the downloaded bytes verbatim; rewind first because
                # Image.open has already read from the stream
                raw_stream.seek(0)
                payload = raw_stream.read()
            else:
                payload = convert_to_png(resize_image(image, max_size))

            return base64.b64encode(payload).decode('utf-8')
    except Exception as e:
        print(f"Error processing image: {e}")
        return None

def combine_images_vertical(images, max_size):
    """
    Combine multiple images into a vertical collage with labels

    Each image is fetched and normalised via ``process_image``; images that
    fail to download or decode are skipped. Every remaining image gets an
    "Image N" caption above it (numbered in collage order) and is centred
    horizontally on a white canvas.

    Args:
        images: List of image URLs
        max_size: Maximum dimension for any single image

    Returns:
        Base64 encoded string of the combined PNG image, or None when no
        image could be loaded
    """
    label_height = 30  # vertical space reserved above each image for its caption

    # Download / normalise each URL; failures come back falsy and are dropped
    encoded_images = []
    for img_url in images:
        encoded = process_image(img_url, max_size)
        if encoded:
            encoded_images.append(encoded)

    if not encoded_images:
        return None

    # Convert base64 strings back to PIL Images
    pil_images = []
    for img_data in encoded_images:
        try:
            pil_images.append(Image.open(io.BytesIO(base64.b64decode(img_data))))
        except Exception as e:
            print(f"Error converting base64 to image: {e}")
            continue

    if not pil_images:
        return None

    # Canvas is as wide as the widest image and tall enough for all images
    # plus one caption strip per image
    max_width = max(img.width for img in pil_images)
    total_height = sum(img.height for img in pil_images) + len(pil_images) * label_height

    collage = Image.new("RGB", (max_width, total_height), "white")
    draw = ImageDraw.Draw(collage)

    # Prefer Arial; fall back to Pillow's built-in font when it cannot be
    # loaded. Catch Exception rather than using a bare except so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        font = ImageFont.truetype("arial.ttf", 20)
    except Exception:
        font = ImageFont.load_default()

    # Paste images vertically with labels
    y_offset = 0
    for i, img in enumerate(pil_images, 1):
        draw.text((10, y_offset), f"Image {i}", fill="black", font=font)
        y_offset += label_height

        # Center the image horizontally
        x_offset = (max_width - img.width) // 2

        has_alpha = img.mode in ("RGBA", "LA") or (
            img.mode == "P" and "transparency" in img.info
        )
        if has_alpha:
            # Use the alpha channel as the paste mask so transparent regions
            # blend onto the white canvas instead of being pasted with
            # whatever colour values sit underneath the transparency.
            rgba = img.convert("RGBA")
            collage.paste(rgba, (x_offset, y_offset), mask=rgba)
        else:
            collage.paste(img, (x_offset, y_offset))
        y_offset += img.height

    # Convert to base64
    buffer = io.BytesIO()
    collage.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

def combine_and_display_images(images, max_size, save_path=None):
    """
    Build the vertical collage for ``images`` and optionally write it to disk

    Args:
        images: List of image URLs
        max_size: Maximum dimension for any single image
        save_path: Optional path to save the combined image

    Returns:
        The collage as a base64 string, or None if no collage could be built
    """
    combined_base64 = combine_images_vertical(images, max_size)
    if not combined_base64:
        return None

    # Decode back into a PIL image so it can be saved in whatever format the
    # target path implies
    collage = Image.open(io.BytesIO(base64.b64decode(combined_base64)))

    # Inline matplotlib preview is currently disabled; only the optional save
    # below happens.

    if save_path:
        collage.save(save_path)
        print(f"Combined image saved to {save_path}")

    return combined_base64

def encode_images_with_examples(target_image_url, display=True):
    """Stack the two few-shot example images above the target image.

    The collage is also written to OUTPUT_DIR under a timestamped file name.
    ``display`` is accepted but not used by this function.

    Returns:
        Base64 string of the combined image, or None if combining failed.
    """
    try:
        # Order matters: the prompts refer to the first two images as the
        # examples and the third as the target
        image_urls = [
            "https://i.sstatic.net/rUHWv1Ok.png",
            "https://i.sstatic.net/TGFPo9Jj.png",
            target_image_url,
        ]

        stamp = time.strftime("%Y%m%d_%H%M%S")
        save_path = str(OUTPUT_DIR / f'combined_image_{stamp}.png')
        return combine_and_display_images(image_urls, 1024, save_path)

    except Exception as e:
        print(f"Error combining images: {e}")
        return None
    

def calculate_image_features_title(combined_base64):
    """Generate a Stack Overflow style title for the third image of a collage.

    Args:
        combined_base64: Base64-encoded PNG collage in which the first two
            images are the few-shot examples and the third is the target.

    Returns:
        The text following the first "TITLE:" marker in the model reply
        (stripped), or "" when the marker is missing or the request fails.
    """
    try:
        instruction = '''
You are an expert software developer and Stack Overflow analyst. You are given the following images and stack overflow post titles as examples:

The first two images are examples with their corresponding titles:
1. "Trying to Stack 2 Columns into one Excel"
2. "is getenv_s not part of cstdlib?"

Now generate a similar title for the third image in the screenshot:
Follow the pattern:
1. Clear and concise title that summarizes the main issue
2. Take inspiration from the example titles given
3. Create it based on the third image and not the examples

Output Format: 
TITLE: <<Generated Title>>
Make sure to start with exactly "TITLE:" 
'''

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": instruction
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # The collage is always encoded as PNG, so label
                            # the data URL accordingly (was image/jpeg).
                            "url": f"data:image/png;base64,{combined_base64}"
                        }
                    }
                ]
            }
        ]

        # Initialize Groq client
        groq_api_key = os.getenv('GROQ_API_KEY')
        client = Groq(api_key=groq_api_key)

        # Make API call
        chat_completion = client.chat.completions.create(
            messages=messages,
            model="llama-3.2-90b-vision-preview",
            temperature=0,
            max_tokens=4096
        )

        # Extract title from response; treat missing content as an empty reply
        reply_text = chat_completion.choices[0].message.content or ""
        title = ""
        if "TITLE:" in reply_text:
            title = reply_text.split("TITLE:")[1].strip()

        return title

    except Exception as e:
        print(f"Error generating title: {e}")
        return ""

def calculate_image_features_body(combined_base64):
    """Generate a Stack Overflow style question body for the third image of a collage.

    Args:
        combined_base64: Base64-encoded PNG collage in which the first two
            images are the few-shot examples and the third is the target.

    Returns:
        The text following the first "BODY:" marker in the model reply
        (stripped), or "" when the marker is missing or the request fails.
    """
    try:
        instruction = '''
You are an expert software developer and Stack Overflow analyst. You are given the following images and stack overflow post bodies as examples:

The first two images are examples with their corresponding bodies:
1. "I am currently attempting to combine two columns into one but have encountered an error that prevents me from completing this task. Additionally, the data from the second column appears to be pasting incorrectly after the error messages.I would greatly appreciate any assistance with this issue.=IF(P2<>"",P2,INDEX($R$2:$R$5000,ROW()-COUNTA($P$2:$P$5000)))Column P and R contains formulas."

2. "C11 added new bounds-checked functions to the standard library, such as getenv_s.However, when I include <cstdlib>, I do not have std::getenv_s, only getenv_s (global namespace).cppreference has the following note:As with all bounds-checked functions, getenv_s is only guaranteed to be available if __STDC_LIB_EXT1__ is defined by the implementation and if the user defines __STDC_WANT_LIB_EXT1__ to the integer constant 1 before including <stdlib.h>.Even when I define __STDC_WANT_LIB_EXT1__ as 1, My compiler (MSVC C++23) does not find the std::getenv_s function.Isn't <cstdlib> supposed to bring every symbol of <stdlib.h> into the std namespace?"

Now generate a similar question body for the third image in the screenshot:
Follow the pattern:
1. Clear and concise issue report that explains the issue in detail
2. What is the most important part of an error that can be present in the image?
3. Take inspiration from the example report given
4. Create it based on the image and not the examples and generate some response
5. NOTE!! Generate some response for the third image!!


Output Format: 
BODY: <<Generated Body>>
Make sure to start with exactly "BODY:" and generate something no matter what.
'''

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": instruction
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # The collage is always encoded as PNG, so label
                            # the data URL accordingly (was image/jpeg).
                            "url": f"data:image/png;base64,{combined_base64}"
                        }
                    }
                ]
            }
        ]

        # Initialize Groq client
        groq_api_key = os.getenv('GROQ_API_KEY')
        client = Groq(api_key=groq_api_key)

        # Make API call
        # NOTE(review): this uses the 11b vision model while
        # calculate_image_features_title uses the 90b one - confirm the
        # difference is intentional.
        chat_completion = client.chat.completions.create(
            messages=messages,
            model="llama-3.2-11b-vision-preview",
            temperature=0,
            max_tokens=4096
        )

        # Extract body from response; treat missing content as an empty reply
        reply_text = chat_completion.choices[0].message.content or ""
        body = ""
        if "BODY:" in reply_text:
            body = reply_text.split("BODY:")[1].strip()

        return body

    except Exception as e:
        print(f"Error generating body: {e}")
        return ""

def process_rows(dataframe):
    """Run few-shot title/body generation for every image URL in ``dataframe``.

    For each row the quoted URLs in the ``ImageURLs`` cell are extracted, each
    URL is combined with the example images, and the model is asked for a
    title and a body. One result record is produced per successfully combined
    image (so a row with several images yields several records, and a row
    with none yields no record).

    Side effects (all under OUTPUT_DIR):
        - ``processing_log.txt`` is overwritten and then appended to.
        - a timestamped ``intermediate_results_*.csv`` snapshot is written
          after every second result.
        - ``llm_responses_final_few_shot.csv`` receives the final results.

    Args:
        dataframe: DataFrame with ``Id``, ``Title``, ``Body`` and
            ``ImageURLs`` columns.

    Returns:
        DataFrame of result records (the original columns plus
        ``llm_title_response`` and ``llm_body_response``).
    """
    # Counts processed images (one result per image), not dataframe rows;
    # the log wording is kept as-is for continuity with earlier runs.
    processed_count = 0
    results = []

    # The log is opened as UTF-8 everywhere: model output may contain
    # characters the platform default encoding cannot represent, which would
    # otherwise abort the run with UnicodeEncodeError.
    log_file = OUTPUT_DIR / 'processing_log.txt'
    with open(log_file, 'w', encoding='utf-8') as f:
        f.write(f"Processing started at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")

    for _, row in tqdm(dataframe.iterrows(), total=dataframe.shape[0], desc='Processing'):
        print(f"\nProcessing Row ID: {row['Id']}")

        # The cell holds a stringified list; pull out every single-quoted item
        image_urls = [url.strip(",") for url in re.findall(r"'([^']*)'", str(row.get('ImageURLs')))]

        for image_url in image_urls:
            # Combine example images with the target image
            combined_base64 = encode_images_with_examples(image_url)
            if not combined_base64:
                continue  # download/combination failed; skip this image

            # Generate title and body using combined image
            title_response = calculate_image_features_title(combined_base64)
            print("TITLE:\n" + title_response)
            body_response = calculate_image_features_body(combined_base64)
            print("BODY: \n" + body_response)

            results.append({
                'Id': row['Id'],
                'Title': row['Title'],
                'Body': row['Body'],
                'ImageURLs': row['ImageURLs'],
                'llm_title_response': title_response,
                'llm_body_response': body_response
            })

            with open(log_file, 'a', encoding='utf-8') as f:
                f.write(f"\nProcessed Row ID: {row['Id']} at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
                f.write(f"Title response: {title_response}\n")
                f.write(f"Body response: {body_response}\n")

            processed_count += 1

            # Snapshot everything collected so far after every second result
            if processed_count % 2 == 0:
                intermediate_df = pd.DataFrame(results)
                intermediate_file = OUTPUT_DIR / f'intermediate_results_{time.strftime("%Y%m%d_%H%M%S")}.csv'
                intermediate_df.to_csv(intermediate_file, index=False)
                print(f"Saved intermediate results to {intermediate_file}")

        print(f"Completed processing Row ID: {row['Id']}")

    # Save final results
    final_df = pd.DataFrame(results)
    final_file = OUTPUT_DIR / 'llm_responses_final_few_shot.csv'
    final_df.to_csv(final_file, index=False)

    # Log completion
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(f"\nProcessing completed at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Total rows processed: {processed_count}\n")

    return final_df

# Main execution
# Entry point for the few-shot run: load environment variables from a .env
# file (the generation functions read GROQ_API_KEY via os.getenv), read the
# input dataset, and process every row. `dataset` and `results_df` stay
# bound at module level for later cells.
if __name__ == "__main__":
    load_dotenv()
    dataset = pd.read_csv('Data/filtered_data_matched.csv')
    results_df = process_rows(dataset)

## In-Context

In [204]:
# # Clear any existing environment variables
# os.environ.pop('GROQ_API_KEY', None)  # Safely remove if exists

# # Force reload of .env file
# load_dotenv(override=True)  # This will override existing env variables

# # Verify the API key
# groq_api_key = os.getenv('GROQ_API_KEY')
# if groq_api_key:
#     print("New API key loaded successfully")
# else:
#     print("Failed to load API key")

In [205]:

# # Initialize Groq client
# groq_api_key = os.getenv('GROQ_API_KEY')
# client = Groq(api_key=groq_api_key)

# def calculate_image_features_title(image_url):
#     try:
#         # Get base64 encoded image
#         base64_image = encode_image_to_base64(image_url)

#         instruction = '''
# Context: You are an expert programmer experienced in different technology stacks. You encountered an issue while working on a project. The screenshot shows the problem but you are not given any textual content.
# Generate a Stack Overflow title that:
# 1. Follows Stack Overflow guidelines
# 2. Is clear and concise
# 3. Summarizes the main technical issue shown
# 4. Take inspiration from your trained data for stack overflow.

# Output Format: 
# TITLE: <<Generated Title>>

# Make sure to start with exactly "TITLE:"
# '''

#         messages = [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": instruction
#                     },
#                     {
#                         "type": "image_url",
#                         "image_url": {
#                             "url": f"data:image/jpeg;base64,{base64_image}"
#                         }
#                     }
#                 ]
#             }
#         ]


#         # Make API call
#         chat_completion = client.chat.completions.create(
#             messages=messages,
#             model="llama-3.2-90b-vision-preview",
#             temperature=0,
#             max_tokens=2048
#         )

#         # Extract title from response
#         generation_prompt = chat_completion.choices[0].message.content
#         title = ""
#         if "TITLE:" in generation_prompt:
#             title = generation_prompt.split("TITLE:")[1].strip()

#         print(title)
#         return title
    
#     except Exception as e:
#         print(e)
#         return ""

# def calculate_image_features_body(image_url):
#     try:
#         # Get base64 encoded image
#         base64_image = encode_image_to_base64(image_url)

#         instruction = '''
# Context: You are an expert programmer experienced in different technology stacks. You encountered an issue while working on a project. The screenshot shows the problem but you are not given any textual content.
# Generate a detailed Stack Overflow question body that:
# 1. Follows Stack Overflow guidelines
# 2. Includes relevant code/IDE context
# 3. Clearly states the expected vs. actual behavior
# 4. Take inspiration from your trained data for stack overflow.

# Output Format: 
# BODY: <<Generated Body>>

# Make sure to start with exactly "BODY:"
# '''

#         messages = [
#             {
#                 "role": "user",
#                 "content": [
#                     {
#                         "type": "text",
#                         "text": instruction
#                     },
#                     {
#                         "type": "image_url",
#                         "image_url": {
#                             "url": f"data:image/jpeg;base64,{base64_image}"
#                         }
#                     }
#                 ]
#             }
#         ]

#         # Make API call
#         chat_completion = client.chat.completions.create(
#             messages=messages,
#             model="llama-3.2-90b-vision-preview",
#             temperature=0,
#             max_tokens=2048
#         )

#         # Extract body from response
#         generation_prompt = chat_completion.choices[0].message.content
#         body = ""
#         if "BODY:" in generation_prompt:
#             body = generation_prompt.split("BODY:")[1].strip()

#         print(body)
#         return body
    
#     except Exception as e:
#         print(e)
#         return ""

# def process_rows(dataframe):
#     total_rows_processed = 0
#     results = []  # List to store results
    
#     # Create log file for tracking progress
#     log_file = OUTPUT_DIR / 'processing_log.txt'
#     with open(log_file, 'w') as f:
#         f.write(f"Processing started at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
    
#     for i, row in tqdm(dataframe.iterrows(), total=dataframe.shape[0], desc=f'Processing'):
#         # Add 1-minute delay after every 50 rows
#         if total_rows_processed > 0 and total_rows_processed % 50 == 0:
#             print("\nProcessed 50 rows. Taking a 1-minute break...")
#             with open(log_file, 'a') as f:
#                 f.write(f"\nTaking a 1-minute break after {total_rows_processed} rows at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
#             time.sleep(60)  # 60 seconds = 1 minute
#             print("Resuming processing...")
        
#         print("\n" + "="*50)
#         print(f"Processing Row ID: {row['Id']}")
        
#         # Log progress
#         with open(log_file, 'a') as f:
#             f.write(f"\nProcessing Row ID: {row['Id']} at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
        
#         image_urls = [url.strip(",") for url in re.findall(r"'([^']*)'", str(row.get('ImageURLs')))]
#         title = row['Title'] 
#         body = row['Body']
        
#         # Store LLM responses for all images of this row
#         title_responses = []
#         body_responses = []
        
#         for image_url in image_urls:
#             print(f"\nProcessing image {len(title_responses) + 1}/{len(image_urls)}")
            
#             # Generate title
#             print("Generating title...")
#             title_text = calculate_image_features_title(image_url)
#             title_responses.append(title_text)
            
#             # Generate body
#             print("Generating body...")
#             body_text = calculate_image_features_body(image_url)
#             body_responses.append(body_text)
            
#             # Log responses
#             with open(log_file, 'a') as f:
#                 f.write(f"Processed image {len(title_responses)}/{len(image_urls)}\n")
#                 f.write(f"Title response: {title_text}\n")
#                 f.write(f"Body response: {body_text}\n")
        
#         # Combine all responses into strings
#         combined_title_response = " ||| ".join(title_responses)
#         combined_body_response = " ||| ".join(body_responses)
        
#         # Store the results
#         results.append({
#             'Id': row['Id'],
#             'Title': title,
#             'Body': body,
#             'ImageURLs': row['ImageURLs'],
#             'llm_title_response': combined_title_response,
#             'llm_body_response': combined_body_response
#         })
        
#         total_rows_processed += 1
#         print(f"Completed processing Row ID: {row['Id']}")
#         print("="*50 + "\n")
        
#         # Save intermediate results every 2 rows without waiting
#         if total_rows_processed % 2 == 0:
#             print("\nSaving intermediate results...")
            
#             # Save intermediate results
#             intermediate_df = pd.DataFrame(results)
#             intermediate_file = OUTPUT_DIR / f'intermediate_results_{time.strftime("%Y%m%d_%H%M%S")}.csv'
#             intermediate_df.to_csv(intermediate_file, index=False)
#             print(f"Saved intermediate results to {intermediate_file}")
            
#             # Log intermediate save
#             with open(log_file, 'a') as f:
#                 f.write(f"\nSaved intermediate results after {total_rows_processed} rows at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
    
#     # Create final DataFrame and save to CSV
#     final_df = pd.DataFrame(results)
#     final_file = OUTPUT_DIR / f'llm_responses_final_zero_shot.csv'
#     final_df.to_csv(final_file, index=False)
#     print(f"\nFinal results saved to {final_file}")
    
#     # Log completion
#     with open(log_file, 'a') as f:
#         f.write(f"\nProcessing completed at {time.strftime('%Y-%m-%d %H:%M:%S')}\n")
#         f.write(f"Total rows processed: {total_rows_processed}\n")
    
#     return final_df

# # Load dataset
# load_dotenv()
# dataset = pd.read_csv('Data/filtered_data_matched.csv')

# # Process the dataset
# results_df = process_rows(dataset)

In [206]:
# Column / dtype / non-null summary of the results frame returned by process_rows
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 143 entries, 0 to 142
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Id                  143 non-null    int64 
 1   Title               143 non-null    object
 2   Body                143 non-null    object
 3   ImageURLs           143 non-null    object
 4   llm_title_response  143 non-null    object
 5   llm_body_response   143 non-null    object
dtypes: int64(1), object(5)
memory usage: 6.8+ KB
