벡터 DB 초기화 코드

In [52]:
import os
from dotenv import load_dotenv
import weaviate
import openai

# Reset cell: connect to Weaviate and drop the "MovieSynopsis" collection
# (schema and data) so it can be rebuilt from scratch.

# Load environment variables from the .env file.
load_dotenv()

# Read the OpenAI API key. An empty value is rejected as well as a missing
# one, because an empty key cannot authenticate either.
OPENAI_KEY = os.getenv("OPENAI_KEY")
if not OPENAI_KEY:
    raise ValueError("OPENAI_KEY is not set in the .env file.")
openai.api_key = OPENAI_KEY

# Read the movie IDs from movie_list.txt: one ID per line, blank lines skipped.
# The encoding is pinned so the result does not depend on the platform locale.
with open('movie_list.txt', 'r', encoding='utf-8') as file:
    ids = [line.strip() for line in file if line.strip()]

# One host is shared by the HTTP and gRPC endpoints; the ports arrive as
# strings from the environment and are cast to int.
host = os.getenv("WEAVIATE_HOST", "localhost")
http_port = int(os.getenv("WEAVIATE_HTTP_PORT", "8081"))
grpc_port = int(os.getenv("WEAVIATE_GRPC_PORT", "50052"))

print("Server Address and Ports:")
print(f"Host: {host}")
print(f"HTTP Port: {http_port}")
print(f"GRPC Port: {grpc_port}")

# Build the Weaviate client (plain HTTP / gRPC, no TLS).
client = weaviate.WeaviateClient(
    connection_params=weaviate.ConnectionParams.from_params(
        http_host=host,
        http_port=http_port,
        http_secure=False,
        grpc_host=host,  # same host as the HTTP endpoint
        grpc_port=grpc_port,
        grpc_secure=False,
    )
)
class_name = "MovieSynopsis"

client.connect()

# Drop the existing collection, if any. On failure the connection is closed
# before re-raising so that re-running the cell does not leak an open client;
# on success the client stays connected for later cells.
try:
    if client.collections.exists(class_name):
        print(f"Deleting existing class '{class_name}' and its data...")
        client.collections.delete(class_name)
        print(f"Class '{class_name}' deleted.")
except Exception:
    client.close()
    raise


Server Address and Ports:
Host: localhost
HTTP Port: 8080
GRPC Port: 50051
Deleting existing class 'MovieSynopsis' and its data...
Class 'MovieSynopsis' deleted.


벡터 DB 구축 코드

In [None]:
import os
from dotenv import load_dotenv
import json
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.vector_stores.weaviate import WeaviateVectorStore
import weaviate
from weaviate.classes import config
from llama_index.core.query_engine import RetrieverQueryEngine
import openai
from weaviate.classes.query import Filter

# Build cell: create the "MovieSynopsis" collection if needed, load each
# movie's synopsis paragraph by paragraph, and expose a LlamaIndex query engine
# on top of the same collection.
import traceback

# Load environment variables from the .env file.
load_dotenv()

# Read the OpenAI API key. An empty value is rejected as well as a missing
# one, because an empty key cannot authenticate either.
OPENAI_KEY = os.getenv("OPENAI_KEY")
if not OPENAI_KEY:
    raise ValueError("OPENAI_KEY is not set in the .env file.")
openai.api_key = OPENAI_KEY

# Read the movie IDs from movie_list.txt: one ID per line, blank lines skipped.
# The encoding is pinned so the result does not depend on the platform locale.
with open('movie_list.txt', 'r', encoding='utf-8') as file:
    ids = [line.strip() for line in file if line.strip()]

# One host is shared by the HTTP and gRPC endpoints; the ports arrive as
# strings from the environment and are cast to int.
host = os.getenv("WEAVIATE_HOST", "localhost")
http_port = int(os.getenv("WEAVIATE_HTTP_PORT", "8081"))
grpc_port = int(os.getenv("WEAVIATE_GRPC_PORT", "50052"))

print("Server Address and Ports:")
print(f"Host: {host}")
print(f"HTTP Port: {http_port}")
print(f"GRPC Port: {grpc_port}")

# Build the Weaviate client (plain HTTP / gRPC, no TLS).
client = weaviate.WeaviateClient(
    connection_params=weaviate.ConnectionParams.from_params(
        http_host=host,
        http_port=http_port,
        http_secure=False,
        grpc_host=host,  # same host as the HTTP endpoint
        grpc_port=grpc_port,
        grpc_secure=False,
    )
)
class_name = "MovieSynopsis"

client.connect()

# Create the collection only when it does not exist yet.
# Only `content` should feed the embedding. The v4 client expresses that with
# `skip_vectorization=True` on the properties to exclude; `Property` has no
# `module_config` field, so the per-property settings used previously were
# dropped and `movie_id` was vectorized as well.
# NOTE: an already-existing collection keeps its old schema; run the reset
# cell first for this change to take effect.
if not client.collections.exists(class_name):
    client.collections.create(
        name=class_name,
        properties=[
            config.Property(
                name="content",
                data_type=config.DataType.TEXT,
            ),
            config.Property(
                name="movie_id",
                data_type=config.DataType.TEXT,
                skip_vectorization=True,  # keep movie_id out of the embedding
            ),
        ],
        vectorizer_config=config.Configure.Vectorizer.text2vec_openai(),
    )

# Ingest the synopsis of every listed movie.
collection = client.collections.get(class_name)
for movie_id in ids:
    file_path = f'movie_synopsis/{movie_id}.json'

    if not os.path.exists(file_path):
        print(f"파일 {file_path}이 존재하지 않습니다. 다음 ID로 이동합니다.")
        continue

    # Load the movie's JSON document.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Extract the synopsis. A missing key, a non-string value, blank text or
    # the literal placeholder "Synopsis not found." all count as "no synopsis".
    synopsis = data.get("synopsis")
    if (
        not isinstance(synopsis, str)
        or not synopsis.strip()
        or synopsis == "Synopsis not found."
    ):
        print(f"ID {movie_id}: Synopsis not found. 다음 ID로 이동합니다.")
        continue

    # Split into paragraphs on blank lines; each paragraph becomes one object.
    text_segments = synopsis.split("\n\n")

    failed = 0
    for segment in text_segments:
        if not segment.strip():
            continue  # skip empty paragraphs

        # Skip paragraphs that are already stored. Only existence matters,
        # so a single hit is enough.
        response = collection.query.fetch_objects(
            filters=Filter.by_property("content").equal(segment),
            limit=1,
        )

        if response.objects:
            continue

        # Insert the paragraph; a failure is reported and the loop moves on to
        # the next paragraph (best effort).
        try:
            collection.data.insert({
                "content": segment,
                "movie_id": movie_id
            })
        except Exception as e:
            failed += 1
            print(f"ID {movie_id}: 데이터 저장 중 오류 발생: {e}")
            print(f"오류 타입: {type(e)}")
            print(traceback.format_exc())

    # Report success only when every insert for this movie went through.
    if failed:
        print(f"ID {movie_id}: {failed}개 세그먼트 저장 실패.")
    else:
        print(f"ID {movie_id}: 데이터 저장 완료.")

# Wrap the Weaviate collection as a LlamaIndex vector store.
# NOTE(review): the objects keep their text under "content"; confirm whether
# WeaviateVectorStore should be given a matching text key.
vector_store = WeaviateVectorStore(weaviate_client=client, index_name=class_name)

# Build the storage context and the index over the existing vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_vector_store(vector_store)

# Retriever over the index.
retriever = index.as_retriever(retriever_mode="default")

# Query engine driven by that retriever.
query_engine = RetrieverQueryEngine(retriever=retriever)

Server Address and Ports:
Host: localhost
HTTP Port: 8080
GRPC Port: 50051
ID tt0241527: 데이터 저장 완료.
ID tt0295297: 데이터 저장 완료.
ID tt0304141: 데이터 저장 완료.
ID tt0330373: 데이터 저장 완료.
ID tt0373889: 데이터 저장 완료.
ID tt0417741: 데이터 저장 완료.
ID tt0926084: 데이터 저장 완료.
ID tt1201607: 데이터 저장 완료.


전체 벡터 DB 확인 코드

In [None]:
# Inspect the collection: fetch up to 10000 objects, vectors included, and
# print a short summary of each one.
response = collection.query.fetch_objects(include_vector=True, limit=10000)
fetched = response.objects

for entry in fetched:
    props = entry.properties
    print(f"Object ID: {entry.uuid}")
    print(f"Movie ID: {props['movie_id']}")
    # Only the first 50 characters of the text are shown.
    print(f"Content: {props['content'][:50]}...")
    # The vector is fetched (entry.vector) but deliberately not printed here.
    print("---")

# Number of objects returned by the fetch above.
print(f"Total objects retrieved: {len(fetched)}")

Object ID: 08bc0172-d459-4875-ad56-92173d945e4c
Movie ID: tt0304141
Content: Inside, Harry finally confronts Sirius Black. As a...
Vector: {'default': [0.003163114655762911, -0.002222312381491065, -0.016006257385015488, 0.009551036171615124, -0.01243934128433466, -0.06730031222105026, 0.035714033991098404, 0.059583209455013275, 0.006415963172912598, -0.03504103049635887, 0.014301315881311893, 0.02716689184308052, -0.019438574090600014, 0.05931400880217552, -0.007733927574008703, -0.010067004710435867, -0.006853415165096521, 0.046616680920124054, -0.06842198222875595, 0.0017343851504847407, -0.011811205185949802, 0.04183835908770561, -0.003684692084789276, -0.009607119485735893, -0.04085128754377365, -0.020044276490807533, -0.0034996161703020334, -0.04156915843486786, 0.06335202604532242, 0.025933053344488144, -0.024452446028590202, -0.017139146104454994, 0.045786645263433456, -0.009500560350716114, -0.013101126998662949, -0.022444654256105423, -0.026112521067261696, -0.0424440652132034

movie_id로 필터링한 벡터 DB 확인

In [60]:
# List the stored segments of a single movie.
# The movie id is defined once so the filter and the printed messages cannot
# drift apart when a different movie is inspected.
target_movie_id = "tt0241527"
filter_condition = Filter.by_property("movie_id").equal(target_movie_id)

# Fetch the matching objects (vectors included). At most 100 are returned;
# raise the limit for movies with more segments.
response = collection.query.fetch_objects(
    filters=filter_condition,
    include_vector=True,
    limit=100
)

# Print a short summary per object, or a notice when nothing matched.
if response.objects:
    for obj in response.objects:
        print(f"Object ID: {obj.uuid}")
        print(f"Movie ID: {obj.properties['movie_id']}")
        print(f"Content: {obj.properties['content'][:50]}...")  # first 50 characters only
        print("---")

    # Number of objects returned for this movie.
    print(f"Total objects retrieved for movie_id '{target_movie_id}': {len(response.objects)}")
else:
    print(f"No objects found for movie_id '{target_movie_id}'.")

Object ID: 741802e7-ed1a-4a78-a701-abba9b66a708
Movie ID: tt0241527
Content: Lord Voldemort, an evil and powerful dark wizard, ...
---
Object ID: fcdf0af4-d7d7-4dcb-9e32-f53a6927881c
Movie ID: tt0241527
Content: Shortly before Harry's 11th birthday, he receives ...
---
Object ID: 86a6d2f0-0c45-4b83-91ed-d62f41d18dbe
Movie ID: tt0241527
Content: Upon arrival, the Sorting Hat places Harry, Ron an...
---
Object ID: c95c0701-3024-4fcb-a546-8c48b5c44441
Movie ID: tt0241527
Content: Harry, Ron, Hermione explore Hogwarts late at nigh...
---
Object ID: 2e1bf986-c6e9-48f2-b495-ff70fb7c414b
Movie ID: tt0241527
Content: At Christmas, Harry receives an Invisibility Cloak...
---
Object ID: 63067ea4-a50c-4b64-a097-4eb318ebd7bd
Movie ID: tt0241527
Content: Harry sees Snape trying to get information from Qu...
---
Object ID: 86dc00dc-c965-46c4-8f78-0ba6b9741317
Movie ID: tt0241527
Content: Harry, Hermione, Ron and Draco are caught out late...
---
Object ID: 6cf36a37-428b-496d-a70c-5007d6331d7b
Movie I

In [56]:
# Keyword used by the BM25 search below.
searchword = "harry"

In [57]:
from weaviate.classes.query import MetadataQuery

# Keyword (BM25) search for `searchword` over the `content` and `movie_id`
# properties, returning at most 10 hits with their scores.
response = collection.query.bm25(
    query=searchword,
    return_metadata=MetadataQuery(score=True),
    query_properties=["content", "movie_id"],
    limit=10,
)

# Collect the property dict of every hit; the list is empty when nothing
# matched. A comprehension is used so that no loop variable (previously named
# `object`, which rebound the builtin) leaks into the notebook namespace.
res = [hit.properties for hit in response.objects]

In [58]:
# Display the property dicts collected from the BM25 search.
res

[{'content': "Harry wakes up in the hospital wing. Dumbledore reveals to Harry that Harry's mother died to protect Harry as an infant. Her pure, loving sacrifice provides Harry with an ancient magical protection from Voldemort's lethal spells and also prevents Voldemort from touching Harry without suffering terribly. Dumbledore also says that the Sorcerer's Stone has been destroyed to prevent future attempts by Voldemort to steal it.",
  'movie_id': 'tt0241527'},
 {'content': "Harry learns from Snape's memories that Snape loved Harry's late mother, Lily, but despised his father, James, who had bullied him. Following her death, Snape worked secretly with Dumbledore to protect Harry from Voldemort because of his deep feelings for Lily. Harry also learns that Dumbledore's death at Snape's hands was planned between them, and that the Patronus doe he saw in the woods that led him to the sword had been conjured by Snape. Harry discovers that he himself became a Horcrux when Voldemort origina

쿼리로 유사한 문장 찾기

In [43]:
# Run a free-text query through the retriever-backed query engine and display
# the result (the captured output shows a response text plus its source nodes).
response = query_engine.query("harry potter")
response

Response(response="The six individuals enter the Department of Mysteries where they come across a bottled prophecy involving Harry and Voldemort. They are then ambushed by Death Eaters, including Lucius Malfoy and Bellatrix Lestrange. Lucius discloses that Harry's vision of Sirius being tortured was a ploy to lure him in. He tries to convince Harry to hand over the prophecy, claiming it holds the secret of why Voldemort failed to kill Harry as a baby. Despite this, Harry refuses, leading to a confrontation between Dumbledore's Army and the Death Eaters.", source_nodes=[NodeWithScore(node=TextNode(id_='eda816ce-6f12-49b9-a5a5-f9c36d69a706', embedding=[-0.019546974450349808, 0.024576416239142418, -0.04863588884472847, -0.023348672315478325, -0.04105403274297714, -0.022357862442731857, 0.03418297320604324, 0.04014937952160835, -0.005454844329506159, -0.05307300016283989, 0.01839461922645569, -0.008228037506341934, -0.006973369978368282, 0.01683301478624344, 0.0068441336043179035, -0.00262

전체 정보 확인 (embedding 제외)

In [32]:
# Re-acquire the collection handle and walk through every stored object,
# printing its UUID together with its properties.
collection = client.collections.get(class_name)
for stored in collection.iterator():
    print(stored.uuid, stored.properties)

0044215b-2ff5-407c-bb8d-504a94f09b6e {'content': "Ron becomes Keeper of the Gryffindor Quidditch team (after beating the strapping Cormac, whom Hermoine made sure he lost by using her spells to disorient him) and forms a romantic relationship with Lavender Brown (Jessie Cave), upsetting Hermione. Harry consoles Hermione, revealing that he now has feelings for Ron's younger sister, Ginny Weasley (Bonnie Wright). Harry gives his Liquid Luck potion to Ron for his first Quidditch match. Ron wins the game for his team and turns into an overnight hero.", 'movie_id': 'tt0417741'}
036c6400-28bb-4f77-b612-54f41389fc6d {'content': "Harry soon finds he is the unwanted center of attention of three people: the vain new Defense Against the Dark Arts Professor, Gilderoy Lockhart (Kenneth Branagh), admirer Colin Creevey (Hugh Mitchell), and Ron's sister, Ginny Weasley (Bonnie Wright), who fancies Harry. Events take a turn for the worse when the Chamber of Secrets is opened and a monster stalks the cas