In [1]:
# Move the working directory up one level; presumably this is the project root,
# so that the project-local `constants` and `data_types` packages can be imported.
# NOTE(review): the path is relative, so re-running this cell climbs one level further.
%cd ..

/Users/danorel/Workspace/Education/University/NYU/Research/xeda


In [2]:
!pip install --quiet openai python-dotenv boto3 langchain chromadb


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import boto3
import json
import copy
import random
import openai
import itertools
import typing as t

from boto3.session import Session
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

from constants.openai import OPENAI_API_KEY
from constants.aws import (
    AWS_ACCESS_KEY,
    AWS_SECRET_KEY,
    AWS_REGION,
    AWS_BUCKET
)
from data_types.pipeline import Pipeline, PipelineType, PipelineKind

In [4]:
# Module-level key, kept for code that goes through the module-level `openai.*` API.
openai.api_key = OPENAI_API_KEY
# Hand the key to both clients explicitly: neither reads `openai.api_key`, and
# without an argument they fall back to the OPENAI_API_KEY environment variable
# and fail when it is not exported.
openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
openai_embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

In [5]:
# Build a boto3 session from the credentials and region imported from `constants.aws`.
session = Session(
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
    region_name=AWS_REGION
)
# Resource-level S3 handle; used below to open the bucket and to fetch objects by key.
s3 = session.resource('s3')

In [6]:
# Handle to the bucket named by AWS_BUCKET; `read_pipelines` lists its objects by key prefix.
s3bucket = s3.Bucket(AWS_BUCKET)

In [7]:
def read_pipelines(
    pipeline_folder: str,
    pipeline_type: PipelineType,
    pipeline_kind: PipelineKind
) -> t.Iterator[Pipeline]:
    """Lazily yield every pipeline stored under one S3 key prefix.

    Lists the objects of the module-level ``s3bucket`` whose key starts with
    ``{pipeline_folder}/{pipeline_type}/{pipeline_kind}`` and parses the body
    of each one as JSON.

    Args:
        pipeline_folder: First segment of the key prefix.
        pipeline_type: Second segment of the key prefix.
        pipeline_kind: Third segment of the key prefix.

    Yields:
        The JSON-decoded content of each matching object, in listing order.
    """
    key_prefix = f"{pipeline_folder}/{pipeline_type}/{pipeline_kind}"
    for summary in s3bucket.objects.filter(Prefix=key_prefix):
        # Fetch the full object for this key and stream its body into the JSON parser.
        body = s3.Object(AWS_BUCKET, summary.key).get()['Body']
        yield json.load(body)

In [8]:
def pipeline2splits(pipeline: Pipeline) -> list[list[str]]:
    """Encode a pipeline as text and return its cumulative splits.

    Nodes are visited from last to first. Each node's ``"annotation"`` mapping
    is rendered as comma-separated ``key = value`` pairs; a value that is
    itself a dict is flattened one level into ``key_subkey = value`` pairs.

    Args:
        pipeline: Sequence of nodes, each a mapping with an ``"annotation"`` dict.

    Returns:
        One list per node. The i-th list holds the text encodings of the last
        ``i + 1`` nodes of ``pipeline``, last node first. The lists are
        independent objects, so mutating one does not affect the others.

    Raises:
        KeyError: If a node has no ``"annotation"`` entry.
    """
    splits: list[list[str]] = []
    encoded_nodes: list[str] = []
    for node in reversed(pipeline):
        fields: list[str] = []
        for name, value in node["annotation"].items():
            if isinstance(value, dict):
                # Flatten one level of nesting: {"delta": {"x": 1}} -> "delta_x = 1".
                fields.extend(
                    f"{name}_{sub_name} = {sub_value}"
                    for sub_name, sub_value in value.items()
                )
            else:
                fields.append(f"{name} = {value}")
        encoded_nodes.append(', '.join(fields))
        # Snapshot the running prefix. Its items are immutable strings, so a
        # shallow copy is enough to keep each split independent.
        splits.append(list(encoded_nodes))
    return splits

In [9]:
# Flatten the per-pipeline splits of every annotated "eda4sum" pipeline into one list.
pipeline_splits = [
    split
    for pipeline in read_pipelines("pipelines", "eda4sum", "annotated")
    for split in pipeline2splits(pipeline)
]

In [10]:
# One text per split: the node encodings of a split joined with ';'.
pipeline_encodings = list(map(';'.join, pipeline_splits))

In [11]:
# Index the pipeline encodings in a Chroma vector store, using `openai_embeddings`
# as the embedding function.
vector_store = Chroma.from_texts(pipeline_encodings, openai_embeddings)

In [12]:
# Pick one indexed encoding uniformly at random to use as a sample query.
# The draw is unseeded, so the selected text differs between runs.
random_encoding = random.choice(pipeline_encodings)
random_encoding

'total_length = 6, current_operator = by_distribution, delta_uniformity = 0, delta_novelty = 0, delta_diversity = 0, delta_score_galaxy = 0, delta_utilityWeights = [0.0, 0.0, 0.0], current_uniformity = -0.8914141720725897, current_novelty = 0.8154882106352771, current_diversity = 0.9766257880134802, current_score_galaxy = 6.158116326248841, current_utilityWeights = [0.4661917552908831, 0.4661917552908831, 0.06761648941823384], final_uniformity = -0.8914141720725897, final_novelty = 0.8154882106352771, final_diversity = 0.9766257880134802, final_score_galaxy = 6.158116326248841, final_utilityWeights = [0.4661917552908831, 0.4661917552908831, 0.06761648941823384], familiarity = 0.00011573672210455656, curiosity = 0.003340933378084866;total_length = 6, remaining_operators_by_distribution = 1, current_operator = by_neighbors, delta_uniformity = 0.42774606007400195, delta_novelty = 0.5257875184111461, delta_diversity = 1.1232296820097685, delta_score_galaxy = -8.108714831787927, delta_utili

In [13]:
# Query the store with the sampled encoding. The query text was drawn from the
# indexed encodings, so it is itself present in the store.
vector_store.similarity_search(random_encoding)

[Document(page_content='total_length = 6, current_operator = by_distribution, delta_uniformity = 0, delta_novelty = 0, delta_diversity = 0, delta_score_galaxy = 0, delta_utilityWeights = [0.0, 0.0, 0.0], current_uniformity = -0.8914141720725897, current_novelty = 0.8154882106352771, current_diversity = 0.9766257880134802, current_score_galaxy = 6.158116326248841, current_utilityWeights = [0.4661917552908831, 0.4661917552908831, 0.06761648941823384], final_uniformity = -0.8914141720725897, final_novelty = 0.8154882106352771, final_diversity = 0.9766257880134802, final_score_galaxy = 6.158116326248841, final_utilityWeights = [0.4661917552908831, 0.4661917552908831, 0.06761648941823384], familiarity = 0.00011573672210455656, curiosity = 0.003340933378084866;total_length = 6, remaining_operators_by_distribution = 1, current_operator = by_neighbors, delta_uniformity = 0.42774606007400195, delta_novelty = 0.5257875184111461, delta_diversity = 1.1232296820097685, delta_score_galaxy = -8.10871