In [80]:
import pandas as pd
import urllib
import re
import requests
from PIL import Image

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from urllib.error import HTTPError
import torch

import multiprocess
from joblib import Parallel, delayed

import warnings
warnings.filterwarnings("ignore")

from urllib3.exceptions import InsecureRequestWarning
warnings.filterwarnings('ignore', category=InsecureRequestWarning)

In [29]:
# Vertex AI SDK entry point, imported first so the configuration below reads top-down.
import vertexai

# Google Cloud project and region that every Vertex AI call in this notebook targets.
PROJECT_ID = "cortex-436714"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Bind the SDK to that project/region; later model objects rely on this call.
vertexai.init(location=LOCATION, project=PROJECT_ID)

In [30]:
from vertexai.generative_models import (
    GenerationConfig,
    GenerativeModel,
    Image,
    Part,
)

In [31]:
# Shared Gemini model handle; calculate_image_features sends every request through it.
text_model = GenerativeModel("gemini-1.5-pro")

In [32]:
import http.client
import typing
import urllib.request

import IPython.display
from PIL import Image as PIL_Image
from PIL import ImageOps as PIL_ImageOps


def display_images(
    images: typing.Iterable[Image],
    max_width: int = 600,
    max_height: int = 350,
) -> None:
    """Render each image inline in the notebook, shrinking oversized ones.

    Args:
        images: Objects exposing a ``_pil_image`` attribute holding a PIL image.
        max_width: Largest width, in pixels, shown without downscaling.
        max_height: Largest height, in pixels, shown without downscaling.
    """
    bounding_box = (max_width, max_height)
    for image in images:
        picture = typing.cast(PIL_Image.Image, image._pil_image)
        # RGB is supported by all Jupyter environments (e.g. RGBA is not yet)
        picture = picture if picture.mode == "RGB" else picture.convert("RGB")
        width, height = picture.size
        fits_in_box = width <= max_width and height <= max_height
        if not fits_in_box:
            # Shrink (aspect ratio preserved) so the notebook output stays compact
            picture = PIL_ImageOps.contain(picture, bounding_box)
        IPython.display.display(picture)


# def get_image_bytes_from_url(image_url: str) -> bytes:
#     with urllib.request.urlopen(image_url) as response:
#         response = typing.cast(http.client.HTTPResponse, response)
#         image_bytes = response.read()
#     return image_bytes


# def load_image_from_url(image_url: str) -> Image:
#     image_bytes = get_image_bytes_from_url(image_url)
#     return Image.from_bytes(image_bytes)

def get_image_bytes_from_url(image_url: str, timeout: float = 30.0) -> bytes:
    """Download ``image_url`` and return the raw response body.

    Args:
        image_url: HTTP(S) address of the image.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled host,
            which would freeze a whole batch run.

    Returns:
        The undecoded bytes of the response body.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within ``timeout``.
    """
    response = requests.get(
        image_url,
        # Browser-like User-Agent; presumably some image hosts reject the
        # default python-requests agent — TODO confirm it is still needed.
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=timeout,
    )
    response.raise_for_status()  # Raise an error for bad status codes
    return response.content

def load_image_from_url(image_url: str) -> "vertexai.generative_models.Image":
    """Download an image and wrap it as a Vertex AI ``Image`` for use in a prompt.

    Args:
        image_url: HTTP(S) address of the image.

    Returns:
        A ``vertexai.generative_models.Image`` built from the downloaded bytes
        (not a PIL image).

    Raises:
        Whatever ``get_image_bytes_from_url`` raises for a failed download.
    """
    # Imported locally under an alias: this notebook binds the bare name `Image`
    # three times (PIL, vertexai, IPython.display), so the global one depends on
    # which cell ran last and may not provide `from_bytes`.
    from vertexai.generative_models import Image as VertexImage

    return VertexImage.from_bytes(get_image_bytes_from_url(image_url))


def display_content_as_image(content: str | Image | Part) -> bool:
    """Display ``content`` inline when it is a Vertex AI ``Image``.

    Args:
        content: One element of a multimodal prompt.

    Returns:
        True if ``content`` was an image and has been displayed, otherwise False.
    """
    # Resolve the class explicitly: the global name `Image` is rebound by other
    # imports in this notebook (PIL, IPython.display), which would make the
    # isinstance check fail or silently never match.
    from vertexai.generative_models import Image as VertexImage

    if not isinstance(content, VertexImage):
        return False
    display_images([content])
    return True


def display_content_as_video(content: str | Image | Part) -> bool:
    """Embed ``content`` as a video player when it is a ``Part``.

    The part's ``file_data.file_uri`` has its ``gs://`` prefix stripped and the
    remainder is appended to ``https://storage.googleapis.com/`` to form the
    player URL.

    Returns:
        True if ``content`` was a ``Part`` and a player was displayed, otherwise False.
    """
    if isinstance(content, Part):
        video_part = typing.cast(Part, content)
        file_path = video_part.file_data.file_uri.removeprefix("gs://")
        player = IPython.display.Video(
            f"https://storage.googleapis.com/{file_path}",
            width=600,
        )
        IPython.display.display(player)
        return True
    return False


def print_multimodal_prompt(contents: list[str | Image | Part]):
    """
    Show a Gemini prompt in readable form, one element at a time.

    Each element is tried as an image first, then as a video; anything that is
    neither is printed as plain text.
    """
    for item in contents:
        # `or` short-circuits, so the video renderer only runs when the image
        # renderer declined the item.
        rendered = display_content_as_image(item) or display_content_as_video(item)
        if not rendered:
            print(item)

In [33]:
def calculate_image_features(image_url, textual_summary):
    """Ask Gemini whether the image at ``image_url`` is code- or IDE-related.

    The request consists of a fixed instruction, the post text, the downloaded
    image and a keyword-only prompt. The prompt and the streamed reply are
    printed as a side effect.

    Args:
        image_url: URL of the image to download and classify.
        textual_summary: Text sent alongside the image.

    Returns:
        The concatenated text of all streamed response chunks, or ``""`` when
        any step fails (the exception is printed instead of raised).
    """
    try:
        image = load_image_from_url(image_url)

        instruction = '''
You are an expert in Software Engineering and are very familiar with screenshots that are sometimes posted on Stack Overflow. Your only task is to infer whether or not the image attached along with textual content is code or IDE related image.
                '''

        prompt = '''
Generate only keywords: Code, IDE-related, Not Related.
'''

        contents = [instruction, textual_summary, image, prompt]

        # Use a more deterministic configuration with a low temperature
        generation_config = GenerationConfig(
            temperature=0,
            top_p=0.8,
            top_k=40,
            candidate_count=1,
            max_output_tokens=2048,
        )

        responses = text_model.generate_content(
            contents,
            generation_config=generation_config,
            stream=True,
        )

        print("-------Prompt--------")
        print_multimodal_prompt(contents)

        # Print the response header exactly once, then echo chunks as they arrive.
        print("\n-------Response--------")
        chunks = []
        for response in responses:
            chunk_text = response.text
            print(chunk_text)
            chunks.append(chunk_text)
        print('\n--------------------------------\n')
        return "".join(chunks)

    except Exception as e:
        # Deliberate best-effort: a failed download or model call must not stop
        # a batch run, so report the error and signal "no result" with "".
        print(e)
        return ""

In [34]:
import time
import os
from tqdm import tqdm

def process_rows_with_delay(dataframe, rows_per_batch=3, pause_seconds=60):
    """Classify every image of every row, pausing periodically between rows.

    Args:
        dataframe: Frame with ``Id``, ``Title``, ``Body`` and ``ImageURLs``
            columns; ``ImageURLs`` holds the string form of a list of
            single-quoted URLs.
        rows_per_batch: Number of rows processed between pauses. A falsy value
            disables pausing.
        pause_seconds: Length of each pause, counted down one second at a time.

    Returns:
        dict mapping each row's ``Id`` to the model outputs for its images,
        one per line (``""`` when the row has no image or every call failed).
    """
    results = {}
    total_rows_processed = 0

    for _, row in tqdm(dataframe.iterrows(), total=dataframe.shape[0], desc='Processing'):
        print("\n" + "="*50)  # Clear separator between rows
        print(f"Processing Row ID: {row['Id']}")  # Print ID of current row

        # A missing value stringifies to 'nan', which contains no quoted URL,
        # so such rows simply yield an empty list.
        image_urls = [url.strip(",") for url in re.findall(r"'([^']*)'", str(row.get('ImageURLs')))]
        # Skip missing (NaN) fields so one incomplete row cannot abort the
        # whole run with a TypeError from str + float.
        textual_summary = '\n'.join(
            str(row[column]) for column in ('Title', 'Body') if pd.notna(row[column])
        )

        # Update count
        total_rows_processed += 1

        outputs = [calculate_image_features(image_url, textual_summary) for image_url in image_urls]
        # Failed calls come back as "" and are left out of the summary.
        results[row['Id']] = ''.join(output + '\n' for output in outputs if output != '')

        print(f"Completed processing Row ID: {row['Id']}")
        print("="*50 + "\n")

        # Add delay after every `rows_per_batch` rows
        if rows_per_batch and total_rows_processed % rows_per_batch == 0:
            print(f"\nPausing for {pause_seconds} seconds after processing {rows_per_batch} rows...")
            for remaining in range(pause_seconds, 0, -1):
                print(f"\rTime remaining: {remaining} seconds...", end='')
                time.sleep(1)
            print("\nResuming processing...")

    return results

In [35]:
# Load the posts and keep the 20 rows at positions 21-40 as a small batch
# for the classification run in the next code cell.
dataset = pd.read_csv('Data/Data.csv').iloc[21:41]
dataset.head()

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,ParentId,CreationDate,DeletionDate,Score,ViewCount,Body,OwnerUserId,...,Title,Tags,AnswerCount,CommentCount,FavoriteCount,ClosedDate,CommunityOwnedDate,ContentLicense,CodeText,ImageURLs
21,79146168,1,,,2024-10-31 19:54:54,,1,16,"With Echarts, I dont know how clip the endLabe...",10204632.0,...,Echarts endLabel doesn't clip on zoom,<javascript><echarts>,1,0,,,,CC BY-SA 4.0,"// prettier-ignore\nconst data = [[""2000-06-05...",['https://i.sstatic.net/e8pEyHgv.png']
22,79146160,1,,,2024-10-31 19:51:01,,0,55,A node of a binary tree has an ID and left sub...,23993901.0,...,how to reorder a binary tree,<java>,0,8,,2024-10-31 23:45:06,,CC BY-SA 4.0,class Node{\n public int id;\n public Tr...,"['https://i.sstatic.net/Wian1JBw.png', 'https:..."
23,79146147,1,79149136.0,,2024-10-31 19:44:43,,0,44,I've been using google calendar to track event...,15754761.0,...,Downloading google calendar event attachment,<go><google-cloud-platform><drive>,1,0,,,,CC BY-SA 4.0,credentials.json [Where ServiceAccount and G...,['https://i.sstatic.net/bZtnzuXU.png']
24,79146127,1,,,2024-10-31 19:34:12,,1,29,"I'm using TypeScript, ESM, npm, and ts-jest. U...",9572172.0,...,SyntaxError: Cannot use import statement outsi...,<typescript><jestjs><es6-modules><ts-jest>,0,0,,,,CC BY-SA 4.0,export default {\n preset: 'ts-jest/presets/d...,['https://i.sstatic.net/Jp5wj6k2.png']
25,79146123,1,,,2024-10-31 19:31:13,,0,22,I have a usecase where my workers have run abo...,2289031.0,...,How to handle number of queries that need to b...,<amazon-web-services><aws-lambda><aws-fargate>...,0,6,,,,CC BY-SA 4.0,,['https://i.sstatic.net/WioH15Tw.png']


In [None]:

# Classify the images of the current `dataset` slice; this calls the model for
# every image and sleeps between batches, so the cell is slow.
process_rows_with_delay(dataset)

In [89]:
# Reload the whole file; this rebinds `dataset`, replacing the 20-row slice
# loaded earlier in the notebook.
dataset = pd.read_csv('Data/Data.csv')
dataset.head()

Unnamed: 0,Id,PostTypeId,AcceptedAnswerId,ParentId,CreationDate,DeletionDate,Score,ViewCount,Body,OwnerUserId,...,Title,Tags,AnswerCount,CommentCount,FavoriteCount,ClosedDate,CommunityOwnedDate,ContentLicense,CodeText,ImageURLs
0,79146615,1,,,2024-10-31 23:57:37,,2,22,I need to create a subpage (sub-item) of a pre...,4398952.0,...,Creating a subpage (sub-item) through Notion API,<notion-api><notion>,1,0,,,,CC BY-SA 4.0,Sub item\nParent item\nupdate page\nParent ite...,"['https://i.sstatic.net/yrb2DNc0.png', 'https:..."
1,79146611,1,,,2024-10-31 23:54:59,,2,41,I have a LineChart where axisLeft.axisMinimum ...,21069914.0,...,Why are LineChart lines plotting above the cor...,<java><android><kotlin><mpandroidchart><linegr...,1,0,,,,CC BY-SA 4.0,axisLeft.axisMinimum = -0.5f\nval dataPoints =...,"['https://i.sstatic.net/iVIPKeBj.png', 'https:..."
2,79146597,1,79146765.0,,2024-10-31 23:43:47,,0,33,I'm trying to make the header rounder and exte...,28082335.0,...,How to make a rounded header where the logo is,<html><css>,1,3,,,,CC BY-SA 4.0,,['https://i.sstatic.net/gmviaXIz.jpg']
3,79146594,1,,,2024-10-31 23:41:25,,0,26,"In my Jetpack Compose app, it seems that any t...",673206.0,...,Do not dismiss keyboard when DropdownMenu is o...,<android><android-jetpack-compose><android-com...,1,0,,,,CC BY-SA 4.0,DropdownMenu\nTextField\nDropdownMenu,"['https://i.sstatic.net/9Q7vdZKN.gif', 'https:..."
4,79146582,1,,,2024-10-31 23:28:00,,0,38,I'm using VNRecognizeTextRequest in an OCR ser...,883572.0,...,VNRecognizeTextRequest fails but can select te...,<macos><swiftui><ocr>,1,0,,,,CC BY-SA 4.0,VNRecognizeTextRequest\nVNRecognize\nVNRecogni...,"['https://i.sstatic.net/9sp3EwKN.png', 'https:..."


In [96]:
import ast

# Convert string representation of list to actual list and get length;
# keeps only rows whose ImageURLs parses to a list of exactly one element.
# NOTE(review): ast.literal_eval raises on non-string input such as NaN —
# confirm the column has no missing values.
# NOTE(review): the next cell reassigns filtered_df, so this result is
# discarded when the cells run in order.
filtered_df = dataset[dataset['ImageURLs'].apply(lambda x: len(ast.literal_eval(x)) == 1)]

In [97]:
# Each comma indicates an additional URL, so count commas and keep rows with 0 commas
# NOTE(review): this rebinds filtered_df from the preceding cell using a
# different rule — any comma-free value (e.g. '[]') passes here, and a URL that
# itself contains a comma would be rejected. Confirm which filter is intended.
filtered_df = dataset[dataset['ImageURLs'].str.count(',') == 0]

In [98]:
# Check the row count, column dtypes and null counts of the filtered frame.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2616 entries, 2 to 4156
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Id                     2616 non-null   int64  
 1   PostTypeId             2616 non-null   int64  
 2   AcceptedAnswerId       607 non-null    float64
 3   ParentId               0 non-null      float64
 4   CreationDate           2616 non-null   object 
 5   DeletionDate           0 non-null      float64
 6   Score                  2616 non-null   int64  
 7   ViewCount              2616 non-null   int64  
 8   Body                   2616 non-null   object 
 9   OwnerUserId            2610 non-null   float64
 10  OwnerDisplayName       7 non-null      object 
 11  LastEditorUserId       1411 non-null   float64
 12  LastEditorDisplayName  0 non-null      float64
 13  LastEditDate           1411 non-null   object 
 14  LastActivityDate       2616 non-null   object 
 15  Title    

In [99]:
# NOTE: this overwrites the same file the notebook reads its input from, so the
# unfiltered data is no longer available at this path afterwards.
# index=False keeps the DataFrame index out of the file; with the default it is
# written as an unnamed first column and comes back as "Unnamed: 0" on the next
# read_csv, gaining one more such column on every save/load round trip.
filtered_df.to_csv('Data/Data.csv', index=False)

In [153]:
# Re-read the saved file and keep the 20 rows at positions 2461-2480;
# this rebinds `dataset` again.
dataset = pd.read_csv('Data/Data.csv').iloc[2461:2481]

In [154]:
from tqdm import tqdm
import re
from IPython.display import Image, display

def process_rows(dataframe):
    """Display every image referenced by each row, for visual inspection.

    Args:
        dataframe: Frame with an ``Id`` column and an ``ImageURLs`` column
            holding the string form of a list of single-quoted URLs.

    Returns:
        None. Output is printed and rendered inline in the notebook.
    """
    for _, row in tqdm(dataframe.iterrows(), total=dataframe.shape[0], desc='Processing'):
        print("\n" + "="*50)  # Clear separator between rows
        print(f"Row ID: {row['Id']}")  # Print ID of current row

        # Extract and display images
        image_urls = [url.strip(",") for url in re.findall(r"'([^']*)'", str(row.get('ImageURLs')))]
        print("\nImages:")
        if image_urls:
            for idx, url in enumerate(image_urls, 1):
                print(f"\nImage {idx}:")
                # Fully qualified on purpose: the bare name `Image` is also bound
                # to the PIL and vertexai classes elsewhere in this notebook, so
                # relying on the global would depend on cell execution order.
                IPython.display.display(IPython.display.Image(url=url))
        else:
            print("No images found")

        print("="*50 + "\n")

# Run the processing function on the current `dataset` slice to show its images inline
process_rows(dataset)

Processing:   0%|          | 0/20 [00:00<?, ?it/s]


Row ID: 79048749

Images:

Image 1:




Row ID: 79048715

Images:

Image 1:




Row ID: 79048714

Images:

Image 1:




Row ID: 79048534

Images:

Image 1:




Row ID: 79048517

Images:

Image 1:




Row ID: 79048509

Images:

Image 1:




Row ID: 79048500

Images:

Image 1:




Row ID: 79048361

Images:

Image 1:




Row ID: 79048355

Images:

Image 1:




Row ID: 79048098

Images:

Image 1:




Row ID: 79048028

Images:

Image 1:




Row ID: 79047983

Images:

Image 1:




Row ID: 79047976

Images:

Image 1:




Row ID: 79047869

Images:

Image 1:




Row ID: 79047861

Images:

Image 1:




Row ID: 79047727

Images:

Image 1:




Row ID: 79047648

Images:

Image 1:




Row ID: 79047641

Images:

Image 1:




Row ID: 79047610

Images:

Image 1:




Row ID: 79047597

Images:

Image 1:


Processing: 100%|██████████| 20/20 [00:00<00:00, 1344.74it/s]





