In [1]:
import pandas as pd
import numpy as np
import catllm
import json
import os
import regex
import glob
import time
from dotenv import load_dotenv, find_dotenv
import requests
import google.generativeai as genai
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import LinearSegmentedColormap
from pathlib import Path


# Work from the project folder so that relative paths resolve against it.
os.chdir('/Users/chrissoria/Documents/Research/Categorization_AI_experiments')
current_directory = os.getcwd()

# Folder holding the image files; the helper functions below append
# '/' + '<pic id>.jpg' to this string to build each image path.
pictures_path = "/Users/chrissoria/Google Drive/other computers/My Laptop (1)/documents/cadas/data/CADAS data upload/Rep Dom/All_Image_Files"

# Load environment variables from the project's .env file, then read the
# OpenAI key from the environment (None when the variable is not set).
dotenv_path = Path('/Users/chrissoria/Documents/Research/Categorization_AI_experiments') / '.env'
load_dotenv(dotenv_path=dotenv_path)

open_ai_key = os.getenv("OPENAI_API_KEY")

def process_scored_data(scored_data, pictures_path):
    """Build per-item image tables for the four ``c_79_{i}`` items (i = 1..4).

    For each item two frames are produced:

    * ``all_79_{i}_scored_data`` -- every row whose ``c_79_{i}_pic`` value is
      present and is not one of the placeholder codes (``''``, ``'.v'``,
      ``'.i'``, ``'i'``); all original columns are kept, the pic column gets a
      ``.jpg`` suffix and a ``c_79_{i}_pic_path`` column is added.
    * ``scored_data_c_79_{i}`` -- every row whose ``c_79_{i}_pic_ty`` value is
      numeric after coercion, reduced to the three columns
      ``[c_79_{i}_pic, c_79_{i}_pic_path, c_79_{i}_pic_ty]``.  Rows are NOT
      filtered on the pic column here, so a missing pic id yields ``'.jpg'``.

    Args:
        scored_data: DataFrame containing the ``c_79_{i}_pic`` and
            ``c_79_{i}_pic_ty`` columns for i in 1..4.  NOTE: it is modified
            in place -- each ``c_79_{i}_pic_ty`` column is replaced by its
            numeric coercion (non-numeric values become NaN).
        pictures_path: Directory prefix (string) joined to each file name
            with ``'/'``.

    Returns:
        ``(processed_test_data, processed_data)``: two dicts keyed by
        ``'scored_data_c_79_{i}'`` and ``'all_79_{i}_scored_data'``
        respectively.

    Side effects:
        Prints three row counts per item.
    """
    # Placeholder codes that do not name an image.  '.i' is the code that
    # process_pic_column filters; the bare 'i' this function used to filter
    # is still dropped so previously excluded rows stay excluded.
    placeholder_codes = ('', '.v', '.i', 'i')

    def _jpg_name(value):
        # Missing ids become an empty stem, i.e. the file name '.jpg'.
        return str(value) if not pd.isna(value) else ''

    processed_test_data = {}
    processed_data = {}
    for i in range(1, 5):
        pic_ty_col = f'c_79_{i}_pic_ty'
        pic_col = f'c_79_{i}_pic'
        path_col = f'{pic_col}_path'

        # Convert to numeric with coercion (mutates the caller's frame).
        scored_data[pic_ty_col] = pd.to_numeric(scored_data[pic_ty_col], errors='coerce')

        # Rows with a usable picture id: not missing and not a placeholder.
        pic_values = scored_data[pic_col]
        keep = pic_values.notna()
        for code in placeholder_codes:
            keep = keep & (pic_values != code)
        # Explicit copy so the column assignments below do not write into a
        # slice of ``scored_data`` (avoids SettingWithCopyWarning).
        all_scored_data = scored_data.loc[keep].copy()
        all_scored_data[pic_col] = all_scored_data[pic_col].apply(_jpg_name) + '.jpg'
        all_scored_data[path_col] = pictures_path + '/' + all_scored_data[pic_col]

        # Filter rows where pic_ty column is not NA
        scored_data_c = scored_data[scored_data[pic_ty_col].notna()]
        print(f'Initial count for {pic_ty_col}:', len(scored_data_c))

        # The column is numeric after coercion, so the two string filters
        # below cannot remove anything; they are kept so the printed counts
        # stay the same as before.
        scored_data_c = scored_data_c[scored_data_c[pic_ty_col] != '.v']
        print(f'After removing ".v" for {pic_ty_col}:', len(scored_data_c))

        scored_data_c = scored_data_c[scored_data_c[pic_ty_col] != '']
        print(f'After removing empty string for {pic_ty_col}:', len(scored_data_c))

        # Create paths and select columns (again on an explicit copy).
        scored_data_c = scored_data_c.copy()
        scored_data_c[pic_col] = scored_data_c[pic_col].apply(_jpg_name) + '.jpg'
        scored_data_c[path_col] = pictures_path + '/' + scored_data_c[pic_col]
        scored_data_c = scored_data_c[[pic_col, path_col, pic_ty_col]]

        # Store in dictionary
        processed_test_data[f'scored_data_c_79_{i}'] = scored_data_c
        processed_data[f'all_79_{i}_scored_data'] = all_scored_data

    return processed_test_data, processed_data

def process_pic_column(df, pic_col, pictures_path):
    """Return the rows of ``df`` that reference an image, with file paths added.

    Rows are kept when ``df[pic_col]`` is not missing and is none of the
    placeholder codes ``''``, ``'.v'`` or ``'.i'``.  In the returned frame the
    ``pic_col`` values are converted to strings with a ``.jpg`` suffix, and a
    new ``f'{pic_col}_path'`` column holds ``pictures_path + '/' + file name``.

    Args:
        df: Source DataFrame; it is not modified.
        pic_col: Name of the column holding the picture identifiers.
        pictures_path: Directory prefix (string) joined with ``'/'``.

    Returns:
        A new DataFrame (original index labels preserved) with the filtered
        rows, the rewritten ``pic_col`` and the added path column.
    """
    pic_values = df[pic_col]
    keep = pic_values.notna()
    for code in ('', '.v', '.i'):
        keep = keep & (pic_values != code)

    # Work on an explicit copy: assigning columns onto a filtered slice is
    # chained assignment and triggers pandas' SettingWithCopyWarning.
    result = df.loc[keep].copy()
    result[pic_col] = result[pic_col].apply(lambda x: str(x) if not pd.isna(x) else '') + '.jpg'
    result[f'{pic_col}_path'] = pictures_path + '/' + result[pic_col]
    return result

ModuleNotFoundError: No module named 'catllm'

In [13]:
# Read the image-scoring workbook once.
more_scored_data = pd.read_excel(
    '/Users/chrissoria/Google Drive/other computers/My Laptop (1)/documents/cadas/data/CADAS data upload/Rep Dom/Figures/all_image_scoring_DR.xlsx'
)

# Derive one filtered frame (with a '<pic column>_path' column) per item
# from the same workbook.
more_scored_data_c_72_3, more_scored_data_c_72_4 = (
    process_pic_column(more_scored_data, pic_column, pictures_path)
    for pic_column in ('c_72_3_pic', 'c_72_4_pic')
)

In [1]:
# Report whether the "cat-llm" distribution is installed in the kernel's
# environment.  importlib.metadata is the standard-library replacement for
# the deprecated pkg_resources API.
from importlib import metadata

try:
    metadata.version("cat-llm")
    print("cat-llm is installed")
except metadata.PackageNotFoundError:
    print("cat-llm is not installed")


cat-llm is installed


In [17]:
# NOTE(review): `c_72_3` is assigned by the classification cell (In [12])
# that appears further down in this notebook, so this cell fails on a fresh
# kernel when the cells are run top to bottom.
# The bare name below is not the last expression of the cell, so it does not
# display anything.
c_72_3

# Write the c_72_3 classification results to CSV without the index column.
c_72_3.to_csv('/Users/chrissoria/Google Drive/other computers/My Laptop (1)/documents/cadas/data/CADAS data upload/Rep Dom/Figures/c_72_3_machine_scored.csv', index=False)

In [12]:
# Category statements sent to the classifier for the overlapping-rectangles
# item.  The wording is part of the prompt and is kept verbatim.
image_categories = [
    "It has a drawing of a overlapping rectangles next to a reference",
    "The drawing is not similar to the reference",
    "The drawing is similar to the reference (regardless of size)",
    "The drawing of a rectangle 1 has 4 sides",
    "The drawing of a rectangle 2 has 4 sides",
    "The drawing of a the rectangles are overlapping",
    "The drawing of the rectangles overlapping has a longer vertical rectangle with top and bottom sticking out",
    "None of the above",
]

# Classify every image path of the c_72_3 item.
# NOTE(review): `cat_llm` is not imported anywhere in the visible cells --
# confirm how it enters the kernel namespace.
c_72_3 = cat_llm.llm_extract_image_multi_class(
    image_description="This image reference image of a overlapping rectangles on the top left. The drawing is meant to be similar to the reference overlapping rectangles.",
    image_input=more_scored_data_c_72_3['c_72_3_pic_path'].tolist(),
    categories=image_categories,
    api_key=os.getenv("OPENAI_API_KEY"),
)

# Give the columns labelled "1".."8" short names, listed in the same order
# as the categories above.
# NOTE(review): presumably the result has one string-labelled column per
# category number -- confirm against the cat_llm output.
c_72_3 = c_72_3.rename(columns={
    str(position): short_name
    for position, short_name in enumerate(
        [
            "drawing_present",
            "not_similar",
            "similar",
            "rect_1_4_sides",
            "rect_2_4_sides",
            "rectangles_overlap",
            "rect_overlap_top_portrudes",
            "none",
        ],
        start=1,
    )
})

# Persist the renamed results without the index column.
c_72_3.to_csv('/Users/chrissoria/Google Drive/other computers/My Laptop (1)/documents/cadas/data/CADAS data upload/Rep Dom/Figures/c_72_3_machine_scored_full.csv', index=False)

Provided a list of 842 images.
Categories to classify:
1. It has a drawing of a overlapping rectangles next to a reference
2. The drawing is not similar to the reference
3. The drawing is similar to the reference (regardless of size)
4. The drawing of a rectangle 1 has 4 sides
5. The drawing of a rectangle 2 has 4 sides
6. The drawing of a the rectangles are overlapping
7. The drawing of the rectangles overlapping has a longer vertical rectangle with top and bottom sticking out
8. None of the above


Categorising images:  77%|███████▋  | 648/842 [1:52:43<21:34,  6.67s/it]    

An error occurred: Error code: 400 - {'error': {'message': 'Invalid base64 image_url.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_base64'}}


Categorising images: 100%|██████████| 842/842 [2:28:26<00:00, 10.58s/it]  


In [18]:
# Write the c_72_4 classification results to CSV without the index column.
# NOTE(review): `c_72_4` is assigned by the classification cell (In [19])
# that appears further down, so this cell fails on a fresh kernel when run
# top to bottom; that later cell also ends by writing to this same path.
c_72_4.to_csv('/Users/chrissoria/Google Drive/other computers/My Laptop (1)/documents/cadas/data/CADAS data upload/Rep Dom/Figures/c_72_4_machine_scored.csv', index=False)

In [19]:
# Category statements sent to the classifier for the cube item.  The wording
# is part of the prompt and is kept verbatim.
# NOTE: this rebinds the `image_categories` name used by the rectangles cell.
image_categories = [
    "It has a drawing of a cube next to the reference",
    "The drawing is not similar to the reference",
    "The drawing is similar to the reference (regardless of size)",
    "The drawing front face looks like a square",
    "The drawing has a cube with internal lines that match the reference",
    "The drawing has a cube where front and back squares are parallel",
    "None of the above",
]

# Classify every image path of the c_72_4 item.
# NOTE(review): `cat_llm` is not imported anywhere in the visible cells --
# confirm how it enters the kernel namespace.  The safety / to_csv /
# filename / save_directory options presumably make the library write
# results to the Figures folder itself -- confirm against the cat_llm docs.
c_72_4 = cat_llm.llm_extract_image_multi_class(
    image_description="This image reference image of a cube on the top left. The drawing is meant to be similar to the reference cube.",
    image_input=more_scored_data_c_72_4['c_72_4_pic_path'].tolist(),
    categories=image_categories,
    safety=True,
    to_csv=True,
    filename="c_72_4_machine_scored.csv",
    save_directory="/Users/chrissoria/Google Drive/other computers/My Laptop (1)/documents/cadas/data/CADAS data upload/Rep Dom/Figures/",
    api_key=os.getenv("OPENAI_API_KEY"),
)

# Give the columns labelled "1".."7" short names, listed in the same order
# as the categories above.
# NOTE(review): presumably the result has one string-labelled column per
# category number -- confirm against the cat_llm output.
c_72_4 = c_72_4.rename(columns={
    str(position): short_name
    for position, short_name in enumerate(
        [
            "drawing_present",
            "not_similar",
            "similar",
            "cube_front_face",
            "cube_internal_lines",
            "cube_opposite_sides",
            "none",
        ],
        start=1,
    )
})

# Persist the renamed results without the index column.
c_72_4.to_csv('/Users/chrissoria/Google Drive/other computers/My Laptop (1)/documents/cadas/data/CADAS data upload/Rep Dom/Figures/c_72_4_machine_scored.csv', index=False)

Provided a list of 1025 images.
Categories to classify:
1. It has a drawing of a cube next to the reference
2. The drawing is not similar to the reference
3. The drawing is similar to the reference (regardless of size)
4. The drawing front face looks like a square
5. The drawing has a cube with internal lines that match the reference
6. The drawing has a cube where front and back squares are parallel
7. None of the above


Categorising images: 100%|██████████| 1025/1025 [1:55:23<00:00,  6.75s/it] 
