In [23]:
import os
import boto3
import numpy as np
import pandas as pd
import yaml
import time
from tqdm import tqdm

from vectorgeo.transfer import download_file
from vectorgeo import constants as c
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct

CHECK_DELAY = 60

# Load secrets (adjust the path as necessary)
secrets = yaml.load(open('secrets.yml'), Loader=yaml.FullLoader)

# Initialize S3 client
s3 = boto3.client('s3', aws_access_key_id=secrets['aws_access_key_id'], aws_secret_access_key=secrets['aws_secret_access_key'])

# Initialize Qdrant client
qdrant_client = QdrantClient(
    url=secrets['qdrant_url'], 
    api_key=secrets['qdrant_api_key']
)

# Specify your bucket name and prefix
bucket_name = c.S3_BUCKET
prefix = 'vectors/'

# Create set of files already run
checked_keys = set()

while True:

    # List all Parquet files in the S3 bucket with the specified prefix

    objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    print(f"Found {len(objects['Contents'])} files in S3")

    # Filter out the files that have already been run
    objects['Contents'] = [
        obj for obj in objects['Contents']
        if obj['Key'] not in checked_keys and '.parquet' in obj['Key']
    ]
    print(f"Found {len(objects['Contents'])} files to run")

    for obj in objects['Contents']:
        print(f"...Downloading {obj['Key']} from S3")
        basename = os.path.basename(obj['Key'])
        local_path = os.path.join(c.TMP_DIR, basename)
        download_file(obj['Key'], local_path)
        
        # Load the data into a Pandas DataFrame
        df = pd.read_parquet(local_path)
        
        # Extract vectors and other necessary information
        print(f"...Uploading {obj['Key']} to Qdrant")
        
        '''
        for df_piece in tqdm(np.array_split(df, 100)):
        
            points = [
                PointStruct(
                    id=row['id'],
                    vector=row['vector'],
                    payload={"location": {"lon": row['lng'], "lat": row['lat']}}
                )
                for _, row in df_piece.iterrows()
            ]

            # Batch the vectors and upload them to Qdrant
            qdrant_client.upsert(
                collection_name=c.QDRANT_COLLECTION_NAME,
                wait=True,
                points=points
            )

        # Add the file to the set of files that have already been run
        checked_keys.add(obj['Key'])
    time.sleep(CHECK_DELAY)
    '''

Found 56 files in S3
Found 55 files to run
...Downloading vectors/vector-upload-1694210549.parquet from S3
File /Users/madeline/Dropbox/projects/vectorgeo/tmp/vector-upload-1694210549.parquet already exists; skipping download
...Uploading vectors/vector-upload-1694210549.parquet to Qdrant
...Downloading vectors/vector-upload-1694210589.parquet from S3
File /Users/madeline/Dropbox/projects/vectorgeo/tmp/vector-upload-1694210589.parquet already exists; skipping download
...Uploading vectors/vector-upload-1694210589.parquet to Qdrant
...Downloading vectors/vector-upload-1694210632.parquet from S3
File /Users/madeline/Dropbox/projects/vectorgeo/tmp/vector-upload-1694210632.parquet already exists; skipping download
...Uploading vectors/vector-upload-1694210632.parquet to Qdrant
...Downloading vectors/vector-upload-1694210681.parquet from S3
File /Users/madeline/Dropbox/projects/vectorgeo/tmp/vector-upload-1694210681.parquet already exists; skipping download
...Uploading vectors/vector-uploa

KeyboardInterrupt: 

In [28]:
df.id[0]

TypeError: object of type 'numpy.int64' has no len()

### Creating index on location lat-longs

In [2]:
import boto3
import numpy as np
import pandas as pd
import yaml

from vectorgeo.transfer import download_file
from vectorgeo import constants as c
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct

# Load secrets (adjust the path as necessary)
secrets = yaml.load(open('secrets.yml'), Loader=yaml.FullLoader)

# Initialize S3 client

# Initialize Qdrant client
qdrant_client = QdrantClient(
    url=secrets['qdrant_url'], 
    api_key=secrets['qdrant_api_key']
)

qdrant_client.create_payload_index(collection_name=c.QDRANT_COLLECTION_NAME, 
                            field_name="location", 
                            field_schema="geo")


UnexpectedResponse: Unexpected Response: 503 (Service Unavailable)
Raw response content:
b'Service Unavailable'

In [3]:
secrets

{'aws_access_key_id': 'AKIA3LRNTQ3FM2PNDL5L',
 'aws_secret_access_key': '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',
 'qdrant_api_key': 'mmHlq5qN54C0k0Qw7SiXZ7ZyvhHQSLQAJrlEdwe9OxwjfLCLSGmUxg',
 'qdrant_url': 'https://74c434da-9bd9-4eac-85fd-50f89fe492ec.us-east-1-0.aws.cloud.qdrant.io:6333',
 'lambdalabs_api_key': 'secret_vectorgeo_98de521329dc4d2f822e36b0eda4972a.OInJ6G5thnnJWC7IgoLdFWz0h2kpqSY0'}

In [24]:
import os
import boto3
import numpy as np
import pandas as pd
import yaml
import time

from vectorgeo.transfer import download_file
from vectorgeo import constants as c
from qdrant_client import QdrantClient
from qdrant_client import models
from qdrant_client.http.models import Distance, VectorParams, PointStruct

CHECK_DELAY = 60
UPLOAD_DELAY = 4
WIPE_QDRANT = True
EMBED_DIM = 16

# Load secrets (adjust the path as necessary)
secrets = yaml.load(open('secrets.yml'), Loader=yaml.FullLoader)

# Initialize S3 client
s3 = boto3.client('s3', aws_access_key_id=secrets['aws_access_key_id'], aws_secret_access_key=secrets['aws_secret_access_key'])

# Initialize Qdrant client
qdrant_client = QdrantClient(
    url=secrets['qdrant_url'], 
    api_key=secrets['qdrant_api_key']
)

qdrant_client.recreate_collection(
                collection_name=c.QDRANT_COLLECTION_NAME,
                vectors_config=VectorParams(size=EMBED_DIM, distance=Distance.COSINE),
                hnsw_config=models.HnswConfig(on_disk=True),
            )

qdrant_client.create_payload_index(collection_name=c.QDRANT_COLLECTION_NAME, 
                            field_name="location", 
                            field_schema="geo")

# Specify your bucket name and prefix
bucket_name = c.S3_BUCKET
prefix = 'vectors/'

# Create set of files already run
checked_keys = set()

while True:
    time.sleep(CHECK_DELAY)

    # List all Parquet files in the S3 bucket with the specified prefix

    objects = s3.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    print(f"Found {len(objects['Contents'])} files in S3")

    # Filter out the files that have already been run
    objects['Contents'] = [
        obj for obj in objects['Contents']
        if obj['Key'] not in checked_keys
        and obj['Key'].endswith('.parquet')
    ]
    print(f"Found {len(objects['Contents'])} files to run")

    for obj in objects['Contents']:
        print(f"...Downloading {obj['Key']} from S3")
        basename = os.path.basename(obj['Key'])
        local_path = os.path.join(c.TMP_DIR, basename)
        download_file(obj['Key'], local_path)
        
        # Load the data into a Pandas DataFrame
        df = pd.read_parquet(local_path)
        
        # Extract vectors and other necessary information
        print(f"...Uploading {obj['Key']} to Qdrant")
        for df_piece in np.array_split(df, 20):
            points = [
                PointStruct(
                    id=row['id'],
                    vector=row['vector'].tolist(),
                    payload={"location": {"lon": row['lng'], "lat": row['lat']}}
                )
                for _, row in df_piece.iterrows()
            ]
            
            # Batch the vectors and upload them to Qdrant
            # If we get a timeout error, back off up to T times with a delay that quadruples each time
            uploaded = False
            delay = 4
            while not uploaded:
                try:
                    qdrant_client.upsert(
                        collection_name=c.QDRANT_COLLECTION_NAME,
                        wait=True,
                        points=points
                    )
                    uploaded = True
                except Exception as e:
                    print(f"Failed to upload batch with exception {e}")
                    time.sleep(delay)
                    delay = delay * 4

            # To avoid wrecking the cluster, we wait a bit between batches
            time.sleep(UPLOAD_DELAY)

        # Add the file to the set of files that have already been run
        checked_keys.add(obj['Key'])


Found 56 files in S3
Found 55 files to run
...Downloading vectors/vector-upload-1694210549.parquet from S3
File /Users/madeline/Dropbox/projects/vectorgeo/tmp/vector-upload-1694210549.parquet already exists; skipping download
...Uploading vectors/vector-upload-1694210549.parquet to Qdrant
...Downloading vectors/vector-upload-1694210589.parquet from S3
File /Users/madeline/Dropbox/projects/vectorgeo/tmp/vector-upload-1694210589.parquet already exists; skipping download
...Uploading vectors/vector-upload-1694210589.parquet to Qdrant


KeyboardInterrupt: 

In [25]:
qdrant_client.search(
    collection_name=c.QDRANT_COLLECTION_NAME,
    query_vector=[0.]*16,
    limit=3,
)

[ScoredPoint(id=608703988149256191, version=1, score=0.0, payload={'location': {'lat': 47.86535960326038, 'lon': -121.24921980000592}}, vector=None),
 ScoredPoint(id=608703931408711679, version=1, score=0.0, payload={'location': {'lat': 48.06277081100897, 'lon': -121.53754895321975}}, vector=None),
 ScoredPoint(id=608704241552326655, version=1, score=0.0, payload={'location': {'lat': 46.90479095467285, 'lon': -121.76497632198864}}, vector=None)]

In [12]:
secrets

{'aws_access_key_id': 'AKIA3LRNTQ3FM2PNDL5L',
 'aws_secret_access_key': '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',
 'qdrant_api_key': 'AJ7U-q-9gy_9X3tPxelqvtNuTFYeDAD7wFv9z4ovH6XgR0-gqLTc0A',
 'qdrant_url': 'https://6008a4f3-409c-4e39-b1a0-ca5a1e4168b8.us-east-1-0.aws.cloud.qdrant.io:6333',
 'lambdalabs_api_key': 'secret_vectorgeo_98de521329dc4d2f822e36b0eda4972a.OInJ6G5thnnJWC7IgoLdFWz0h2kpqSY0'}

In [29]:
secrets['qdrant_url']

'https://97787781-48b4-4ff1-976b-aa435eb07d64.us-east-1-0.aws.cloud.qdrant.io:6333'

In [49]:
import psycopg2
import yaml

from psycopg2 import sql
from vectorgeo import constants as c

# Load secrets from the secrets.yml file
with open('secrets.yml', 'r') as file:
    secrets = yaml.safe_load(file)

# Database connection parameters
params = {
    'user': secrets['aurora_user'],
    'password': secrets['aurora_password'],
    'host': secrets['aurora_url'],
}

# Connect to the database
conn = psycopg2.connect(**params)
cur = conn.cursor()

# Activate the PostGIS extension
cur.execute("CREATE EXTENSION IF NOT EXISTS postgis;")

# Activate pgvector
cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")

# Create a new table with a geometric column to store the points
cur.execute(f"""
    CREATE TABLE IF NOT EXISTS vectorgeo (
        id BIGSERIAL PRIMARY KEY,
        name VARCHAR(50),
        geom GEOMETRY(Point, 4326),
        embedding vector({c.EMBED_DIM})
    );
""")


# Insert 10 points with lat-longs into the table
values = [(row['id'], row['lng'], row['lat'], row['vector'].tolist()) for _, row in df.iloc[0:50].iterrows()]

print("Inserting points into the table...")
cur.executemany(
    """INSERT INTO vectorgeo (id, geom, embedding) VALUES (%s, ST_SetSRID(ST_MakePoint(%s, %s), 4326), %s)
    ON CONFLICT (id) DO NOTHING;""", values)

# Build the IVFFlat index
print("Building the IVFFlat index...")
cur.execute("""
            CREATE INDEX ON vectorgeo USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100)
            """)


# Run a spatial query to find points within a certain distance of a reference point
reference_point = (0.0, 0.0)  # replace with your reference point
distance = 1000.0  # replace with your distance in degrees

print("Running the spatial query...")
query_list = [0.]*16
cur.execute(
    sql.SQL(f"""
        SELECT id, ST_AsText(geom)
        FROM vectorgeo
        WHERE ST_DWithin(
            geom,
            ST_SetSRID(ST_MakePoint(%s, %s), 4326),
            %s
        )
        ORDER BY embedding <-> '{query_list}' LIMIT 200;
    """),
    [*reference_point, distance]
)

# Fetch and print the results of the spatial query
results = cur.fetchall()
for result in results:
    print(result)

# Drop the table
cur.execute("DROP TABLE vectorgeo;")

# Commit the transaction and close the connection
conn.commit()
cur.close()
conn.close()


Inserting points into the table...
Building the IVFFlat index...
Running the spatial query...
(608215562471866367, 'POINT(-150.14593927327465 69.82183687729858)')
(608215320896733183, 'POINT(-150.5563830685692 69.2568533040199)')
(608215324755492863, 'POINT(-150.5410198735205 69.27856307355599)')
(608215320376639487, 'POINT(-150.60227572035132 69.19173440897931)')
(608215320879955967, 'POINT(-150.5870108888723 69.21343896092971)')
(608215320829624319, 'POINT(-150.57171337257864 69.23514526287295)')
(608215324822601727, 'POINT(-150.52562368368402 69.30027456064136)')
(608215288684478463, 'POINT(-150.66301024145554 69.10493391601227)')
(608215320460525567, 'POINT(-150.64787512860994 69.12663136060081)')
(608215320309530623, 'POINT(-150.61750796949553 69.17003161778588)')
(608215562119544831, 'POINT(-150.19449353168665 69.75659307134795)')
(608215558680215551, 'POINT(-150.06429929547963 69.93060515993342)')
(608215241926377471, 'POINT(-150.40125376088037 69.47402697987187)')
(608215561901

In [91]:
import json
import psycopg2

aurora_table = 'vectorgeo'
limit = 10
lng, lat = -150.66301024145554, 69.10493391601227

vector_query = f"SELECT embedding FROM {aurora_table} WHERE id = {positive_id}"

# Connect to the database
conn = psycopg2.connect(**params)
cur = conn.cursor()

# Fetch the vector from the database
cur.execute(vector_query)
vector = json.loads(cur.fetchone()[0])

# Get the nearest neighbors within X degrees of the reference point
cur.execute(
    f"""
    SELECT id
        , 1 - (embedding <=> '{vector}') AS score
        , embedding
    FROM {aurora_table}
    WHERE ST_DWithin(
        geom,
        ST_SetSRID(ST_MakePoint({lng}, {lat}), 4326),
        100
    )
    ORDER BY embedding <=> '{vector}' LIMIT {limit};
    """
)

# Fetch and print the results of the spatial query
results = cur.fetchall()

ids, scores, embeddings = zip(*results)
ids = [hex(id)[2:] for id in ids]
embeddings = [json.loads(embedding) for embedding in embeddings]

#print(ids, scores, embeddings)


In [90]:
vector

[0.12960789,
 0.034281656,
 0.10103667,
 0.014115579,
 -0.16742846,
 0.20054632,
 0.1104777,
 -0.14245045,
 -0.12181454,
 -0.13199367,
 0.006656356,
 -0.044093132,
 0.13015968,
 -0.08061195,
 0.10182471,
 0.09762533]

In [86]:
cur.execute(
    f"""
    SELECT id
        , embedding
    FROM {aurora_table}
    LIMIT 10
    """
)
results = cur.fetchall()
ids, embeddings = zip(*results)
ids = [hex(id)[2:] for id in ids]
embeddings = [json.loads(embedding) for embedding in embeddings]

print([x[0] for x in embeddings])

[-0.008662209, -0.004541181, -0.010603402, -0.00079894066, -0.027190946, -0.042524837, 0.014726669, -0.03498197, 0.011868864, -0.020180132]


In [87]:
embeddings[0]

[-0.008662209,
 0.2018602,
 -0.14934546,
 -0.045412205,
 -0.045075517,
 -0.2010995,
 0.054314613,
 -0.04028077,
 0.1142025,
 -0.14177047,
 0.053414322,
 -0.039181545,
 0.0287278,
 0.10317484,
 -0.20916164,
 -0.018649321]

In [80]:
[x[0] for x in embeddings]

['[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '[',
 '['

In [51]:
vector

'[0.12960789,0.034281656,0.10103667,0.014115579,-0.16742846,0.20054632,0.1104777,-0.14245045,-0.12181454,-0.13199367,0.006656356,-0.044093132,0.13015968,-0.08061195,0.10182471,0.09762533]'

In [None]:
def retrieve_nearest(id, limit):
    

In [47]:
values

[(608215288684478463,
  -150.66301024145554,
  69.10493391601227,
  [0.1296078860759735,
   0.0342816561460495,
   0.1010366678237915,
   0.014115579426288605,
   -0.16742846369743347,
   0.20054632425308228,
   0.11047770082950592,
   -0.1424504518508911,
   -0.1218145415186882,
   -0.13199366629123688,
   0.006656356155872345,
   -0.04409313201904297,
   0.1301596760749817,
   -0.080611951649189,
   0.10182470828294754,
   0.09762533009052277]),
 (608215320460525567,
  -150.64787512860994,
  69.12663136060081,
  [0.1296078860759735,
   0.0342816561460495,
   0.1010366678237915,
   0.014115579426288605,
   -0.16742846369743347,
   0.20054632425308228,
   0.11047770082950592,
   -0.1424504518508911,
   -0.1218145415186882,
   -0.13199366629123688,
   0.006656356155872345,
   -0.04409313201904297,
   0.1301596760749817,
   -0.080611951649189,
   0.10182470828294754,
   0.09762533009052277]),
 (608215320527634431,
  -150.6327077383668,
  69.14833059808838,
  [-0.0413324311375618,
   -0.1

In [48]:
type(608215320460525567)

int