From 3ca8dda3bfeca80494f82d6408b0f1424c3c9015 Mon Sep 17 00:00:00 2001
From: LJ
Date: Wed, 14 May 2025 22:48:32 -0700
Subject: [PATCH] chore(example): consolidate example for S3

---
 examples/amazon_s3_text_embedding/README.md      | 49 +++---------------
 examples/amazon_s3_text_embedding/main.py        |  4 +-
 .../amazon_s3_text_embedding/pyproject.toml      |  6 +++
 .../amazon_s3_text_embedding/requirements.txt    |  3 --
 4 files changed, 15 insertions(+), 47 deletions(-)
 create mode 100644 examples/amazon_s3_text_embedding/pyproject.toml
 delete mode 100644 examples/amazon_s3_text_embedding/requirements.txt

diff --git a/examples/amazon_s3_text_embedding/README.md b/examples/amazon_s3_text_embedding/README.md
index 930af08b..d79a73b9 100644
--- a/examples/amazon_s3_text_embedding/README.md
+++ b/examples/amazon_s3_text_embedding/README.md
@@ -8,41 +8,8 @@ Before running the example, you need to:
 
 1. [Install Postgres](https://cocoindex.io/docs/getting_started/installation#-install-postgres) if you don't have one.
 
-2. Prepare for Amazon S3:
-
-   - **Create an Amazon S3 bucket:**
-     - Go to the [AWS S3 Console](https://s3.console.aws.amazon.com/s3/home) and click **Create bucket**. Give it a unique name and choose a region.
-     - Or, use the AWS CLI:
-       ```sh
-       aws s3 mb s3://your-s3-bucket-name
-       ```
-
-   - **Upload your files to the bucket:**
-     - In the AWS Console, click your bucket, then click **Upload** and add your `.md`, `.txt`, `.docx`, or other files.
-     - Or, use the AWS CLI:
-       ```sh
-       aws s3 cp localfile.txt s3://your-s3-bucket-name/
-       aws s3 cp your-folder/ s3://your-s3-bucket-name/ --recursive
-       ```
-
-   - **Set up AWS credentials:**
-     - The easiest way is to run:
-       ```sh
-       aws configure
-       ```
-       Enter your AWS Access Key ID, Secret Access Key, region (e.g., `us-east-1`), and output format (`json`).
-       This creates a credentials file at `~/.aws/credentials` and config at `~/.aws/config`.
-     - Alternatively, you can set environment variables:
-       ```sh
-       export AWS_ACCESS_KEY_ID=your-access-key-id
-       export AWS_SECRET_ACCESS_KEY=your-secret-access-key
-       export AWS_DEFAULT_REGION=us-east-1
-       ```
-     - If running on AWS EC2 or Lambda, you can use an [IAM role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles.html) with S3 read permissions.
-
-   - **(Optional) Specify a prefix** to restrict to a subfolder in the bucket by setting `AMAZON_S3_PREFIX` in your `.env`.
-
-   See [AWS S3 documentation](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html) for more details.
+2. Prepare for Amazon S3.
+   See [Setup for AWS S3](https://cocoindex.io/docs/ops/sources#setup-for-amazon-s3) for more details.
 
 3. Create a `.env` file with your Amazon S3 bucket name and (optionally) prefix.
    Start from copying the `.env.example`, and then edit it to fill in your bucket name and prefix.
@@ -59,7 +26,7 @@ Before running the example, you need to:
 
    # Amazon S3 Configuration
    AMAZON_S3_BUCKET_NAME=your-bucket-name
-   AMAZON_S3_PREFIX=optional/prefix/path
+   AMAZON_S3_SQS_QUEUE_URL=https://sqs.us-west-2.amazonaws.com/123456789/S3ChangeNotifications
    ```
 
 ## Run
@@ -67,19 +34,19 @@ Before running the example, you need to:
 Install dependencies:
 
 ```sh
-uv pip install -r requirements.txt
+pip install -e .
 ```
 
 Setup:
 
 ```sh
-uv run main.py cocoindex setup
+python main.py cocoindex setup
 ```
 
 Run:
 
 ```sh
-uv run main.py
+python main.py
 ```
 
 During running, it will keep observing changes in the Amazon S3 bucket and update the index automatically.
@@ -92,13 +59,13 @@ CocoInsight is in Early Access now (Free) 😊 You found us! A quick 3 minute vi
 Run CocoInsight to understand your RAG data pipeline:
 
 ```sh
-uv run main.py cocoindex server -ci
+python main.py cocoindex server -ci
 ```
 
 You can also add a `-L` flag to make the server keep updating the index to reflect source changes at the same time:
 
 ```sh
-uv run main.py cocoindex server -ci -L
+python main.py cocoindex server -ci -L
 ```
 
 Then open the CocoInsight UI at [https://cocoindex.io/cocoinsight](https://cocoindex.io/cocoinsight).
\ No newline at end of file
diff --git a/examples/amazon_s3_text_embedding/main.py b/examples/amazon_s3_text_embedding/main.py
index 55400e7f..d3730399 100644
--- a/examples/amazon_s3_text_embedding/main.py
+++ b/examples/amazon_s3_text_embedding/main.py
@@ -1,7 +1,6 @@
 from dotenv import load_dotenv
 
 import cocoindex
-import datetime
 import os
 
 @cocoindex.flow_def(name="AmazonS3TextEmbedding")
@@ -19,8 +18,7 @@ def amazon_s3_text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scop
             prefix=prefix,
             included_patterns=["*.md", "*.txt", "*.docx"],
             binary=False,
-            sqs_queue_url=sqs_queue_url),
-        refresh_interval=datetime.timedelta(minutes=1))
+            sqs_queue_url=sqs_queue_url))
 
     doc_embeddings = data_scope.add_collector()
 
diff --git a/examples/amazon_s3_text_embedding/pyproject.toml b/examples/amazon_s3_text_embedding/pyproject.toml
new file mode 100644
index 00000000..a5a58ccd
--- /dev/null
+++ b/examples/amazon_s3_text_embedding/pyproject.toml
@@ -0,0 +1,6 @@
+[project]
+name = "amazon-s3-text-embedding"
+version = "0.1.0"
+description = "Simple example for cocoindex: build embedding index based on Amazon S3 files."
+requires-python = ">=3.11"
+dependencies = ["cocoindex>=0.1.35", "python-dotenv>=1.0.1"]
diff --git a/examples/amazon_s3_text_embedding/requirements.txt b/examples/amazon_s3_text_embedding/requirements.txt
deleted file mode 100644
index d4e76dff..00000000
--- a/examples/amazon_s3_text_embedding/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-cocoindex
-python-dotenv
-boto3
\ No newline at end of file
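
For reference, below is a minimal sketch of how the `AmazonS3` source import in `main.py` reads after this patch. It is pieced together from the fragments visible in the hunks above; the environment-variable lookups, the surrounding `add_source` call, and any `AmazonS3` field not shown in the diff are assumptions rather than part of the patch.

```python
# Sketch only: reconstructed around the lines visible in the main.py hunks above.
# Anything not shown in the diff (env-var lookups, the add_source wrapper) is assumed.
import os

import cocoindex


@cocoindex.flow_def(name="AmazonS3TextEmbedding")
def amazon_s3_text_embedding_flow(
    flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope
):
    bucket_name = os.environ["AMAZON_S3_BUCKET_NAME"]          # assumed lookup
    prefix = os.environ.get("AMAZON_S3_PREFIX")                # assumed lookup
    sqs_queue_url = os.environ.get("AMAZON_S3_SQS_QUEUE_URL")  # assumed lookup

    # With an SQS queue configured, S3 change notifications drive index updates,
    # so the refresh_interval polling removed by this patch is no longer needed.
    data_scope["documents"] = flow_builder.add_source(
        cocoindex.sources.AmazonS3(
            bucket_name=bucket_name,
            prefix=prefix,
            included_patterns=["*.md", "*.txt", "*.docx"],
            binary=False,
            sqs_queue_url=sqs_queue_url))
```

The matching `.env` then only needs `AMAZON_S3_BUCKET_NAME` plus, optionally, `AMAZON_S3_PREFIX` and `AMAZON_S3_SQS_QUEUE_URL`, mirroring the README change above.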