In [1]:
# This get the RAPIDS-Colab install files and test check your GPU.  Run this and the next cell only.
# Please read the output of this cell.  If your Colab Instance is not RAPIDS compatible, it will warn you and give you remediation steps.
# Source: https://colab.research.google.com/github/rapidsai-community/showcase/blob/main/getting_started_tutorials/rapids-pip-colab-template.ipynb?ncid=so-othe-933049-vt27#scrollTo=B0C8IV5TQnjN
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 562, done.[K
remote: Counting objects: 100% (293/293), done.[K
remote: Compressing objects: 100% (191/191), done.[K
remote: Total 562 (delta 185), reused 145 (delta 100), pack-reused 269 (from 1)[K
Receiving objects: 100% (562/562), 181.33 KiB | 982.00 KiB/s, done.
Resolving deltas: 100% (287/287), done.
Collecting pynvml
  Downloading pynvml-12.0.0-py3-none-any.whl.metadata (5.4 kB)
Collecting nvidia-ml-py<13.0.0a0,>=12.0.0 (from pynvml)
  Downloading nvidia_ml_py-12.560.30-py3-none-any.whl.metadata (8.6 kB)
Downloading pynvml-12.0.0-py3-none-any.whl (26 kB)
Downloading nvidia_ml_py-12.560.30-py3-none-any.whl (40 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 40.5/40.5 kB 2.8 MB/s eta 0:00:00
Installing collected packages: nvidia-ml-py, pynvml
Successfully installed nvidia-ml-py-12.560.30 pynvml-12.0.0
Installing RAPIDS remaining 24.10.* libraries
Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com

In [2]:
import cuml
from cuml.manifold.umap import UMAP

# Example usage
import numpy as np

# Generate some random data
data = np.random.rand(30, 10)

# Create UMAP instance and fit-transform data
umap_model = UMAP(n_neighbors=5, n_components=2)
embedding = umap_model.fit_transform(data)

print(embedding)

[[  -3.3076286 -329.4828   ]
 [ 421.31097    350.51996  ]
 [ 528.0796    -531.94885  ]
 [ 402.76605    346.5136   ]
 [ 412.8604     455.8838   ]
 [-568.80176   -629.6509   ]
 [ 322.27875   -407.56805  ]
 [-498.20612    524.3797   ]
 [-397.36652   -288.4486   ]
 [ 216.7511     317.84943  ]
 [ 485.69543      7.138199 ]
 [ 445.92474    432.65454  ]
 [ 677.6245     123.77641  ]
 [ -90.293      -59.36414  ]
 [-556.4442    -506.6623   ]
 [ 185.13577    101.21109  ]
 [ -70.709785    98.12842  ]
 [ 394.16934    464.10077  ]
 [-601.57666   -527.2108   ]
 [-181.69254    163.0796   ]
 [-302.29175    347.2881   ]
 [ 208.4156    -183.62958  ]
 [ 787.3817    -845.0581   ]
 [-480.8905     628.7495   ]
 [-235.39966   -319.81805  ]
 [-367.9477    -366.27594  ]
 [-827.90857    819.9824   ]
 [ 464.32294   -469.25244  ]
 [-520.6953     110.65723  ]
 [  87.19735    242.33289  ]]


In [12]:
# https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
import locale
def getpreferredencoding(do_setlocale = True):
    return "UTF-8"
locale.getpreferredencoding = getpreferredencoding

!pip install pgvector

import pandas as pd
import numpy as np
from google.colab import drive, userdata
from tqdm.auto import tqdm
from datetime import datetime
from sqlalchemy import create_engine, text, exists
from sqlalchemy.orm import sessionmaker
from pgvector.sqlalchemy import HALFVEC
import sqlalchemy as sa
from sqlalchemy.ext.declarative import declarative_base
from openai import OpenAI
import time
import logging
from typing import List, Optional

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Database configuration
DATABASE_URL = userdata.get('DATABASE_URL')
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
CHUNK_SIZE = 2048
MAX_ROWS = 500000
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_DIMENSIONS = 512
MAX_RETRIES = 3
RETRY_DELAY = 5  # seconds

# SQLAlchemy setup
engine = create_engine(DATABASE_URL)
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
Base = declarative_base()

class QuoteDB(Base):
    __tablename__ = "motivational_quotes"

    id = sa.Column(sa.Integer, primary_key=True, index=True)
    author = sa.Column(sa.String)
    book = sa.Column(sa.String)
    text = sa.Column(sa.String)
    date_created = sa.Column(sa.DateTime)
    embeddings = sa.Column(HALFVEC(EMBEDDING_DIMENSIONS))
    reduced_embeddings = sa.Column(HALFVEC(2))

def generate_embeddings(texts: List[str], client: OpenAI, retry_count: int = 0) -> Optional[List[List[float]]]:
    """Generate embeddings with retry logic"""
    try:
        response = client.embeddings.create(
            model=EMBEDDING_MODEL,
            input=texts,
            dimensions=EMBEDDING_DIMENSIONS
        )
        return [item.embedding for item in response.data]
    except Exception as e:
        if retry_count < MAX_RETRIES:
            logger.warning(f"Error generating embeddings: {e}. Retrying in {RETRY_DELAY} seconds...")
            time.sleep(RETRY_DELAY)
            return generate_embeddings(texts, client, retry_count + 1)
        else:
            logger.error(f"Failed to generate embeddings after {MAX_RETRIES} attempts: {e}")
            return None

def get_existing_quotes():
    """Get set of existing quote texts to avoid duplicates"""
    with SessionLocal() as db:
        existing_quotes = db.query(QuoteDB.text).all()
        return {quote.text for quote in existing_quotes}

def process_chunk(chunk: pd.DataFrame, client: OpenAI, existing_quotes: set) -> None:
    """Process a chunk of quotes and store in database"""
    # Filter out quotes that already exist
    new_quotes = chunk[~chunk['Quote'].isin(existing_quotes)]
    if new_quotes.empty:
        return

    texts = new_quotes['Quote'].tolist()
    embeddings = generate_embeddings(texts, client)

    if embeddings is None:
        logger.error(f"Skipping chunk due to embedding generation failure")
        return

    quotes = []
    for (_, row), emb in zip(new_quotes.iterrows(), embeddings):
        quote = QuoteDB(
            author=row['Author'],
            book=row['Book'] if 'Book' in row else None,
            text=row['Quote'],
            date_created=datetime.now(),
            embeddings=emb,
            reduced_embeddings=None
        )
        quotes.append(quote)

    try:
        with SessionLocal() as db:
            db.add_all(quotes)
            db.commit()
            # Update existing_quotes set with newly added quotes
            existing_quotes.update(new_quotes['Quote'])
    except Exception as e:
        logger.error(f"Error storing quotes in database: {e}")

def fit_reducer(embeddings: List[List[float]]) -> np.ndarray:
    """Fit UMAP reducer on embeddings"""
    embeddings = np.array(embeddings)
    reducer = UMAP(
        n_neighbors=min(32, len(embeddings) - 1), # number of neighbors must be smaller than the graph degree
        n_components=2,
        build_algo="nn_descent",
        build_kws={"nnd_graph_degree": 64}
    )
    return reducer.fit_transform(embeddings)

def update_reduced_embeddings(quotes, reduced_embeddings):
    """Update reduced embeddings in chunks, using batch updates for each chunk"""
    for i in tqdm(range(0, len(quotes), CHUNK_SIZE), desc="Updating database"):
        chunk_quotes = quotes[i:i + CHUNK_SIZE]
        chunk_reduced = reduced_embeddings[i:i + CHUNK_SIZE]

        # Prepare batch data
        quote_ids = [quote.id for quote in chunk_quotes]
        # Format the embeddings as strings in pgvector format: [x,y]
        reduced_embeddings_list = [
            f"[{','.join(str(x) for x in emb)}]"
            for emb in chunk_reduced
        ]

        retries = 0
        while retries < MAX_RETRIES:
            try:
                with SessionLocal() as db:
                    update_query = text("""
                        UPDATE motivational_quotes
                        SET reduced_embeddings = data_table.reduced_emb::vector
                        FROM (
                            SELECT unnest(:ids) as id,
                                   unnest(:embeddings) as reduced_emb
                        ) as data_table
                        WHERE motivational_quotes.id = data_table.id
                    """)

                    db.execute(
                        update_query,
                        {
                            'ids': quote_ids,
                            'embeddings': reduced_embeddings_list
                        }
                    )
                    db.commit()
                    break

            except Exception as e:
                retries += 1
                if retries == MAX_RETRIES:
                    logger.error(f"Failed to update reduced embeddings for chunk {i}: {e}")
                    raise
                else:
                    logger.warning(f"Error updating reduced embeddings, retry {retries}: {e}")
                    time.sleep(RETRY_DELAY)



  Base = declarative_base()


In [4]:
# Mount Google Drive
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Quoteverse/quotes_422k.csv'

df = pd.read_csv(file_path, nrows=MAX_ROWS)
print(len(df))
print(df.head())

df = df.dropna(subset=['Quote'])
df = df.where(pd.notnull(df), None)

Mounted at /content/drive
422024
                                               Quote                  Author  \
0  I'm selfish, impatient and a little insecure. ...          Marilyn Monroe   
1  You've gotta dance like there's nobody watchin...       William W. Purkey   
2  You know you're in love when you can't fall as...               Dr. Seuss   
3  A friend is someone who knows all about you an...          Elbert Hubbard   
4  Darkness cannot drive out darkness: only light...  Martin Luther King Jr.   

                                                Book  \
0                                                NaN   
1                                                NaN   
2                                                NaN   
3                                                NaN   
4  A Testament of Hope: The Essential Writings an...   

                                                Tags  
0  attributed-no-source, best, life, love, mistak...  
1  dance, heaven, hurt, inspirational, 

In [5]:
print("Initializing...")
client = OpenAI(api_key=OPENAI_API_KEY)

# Get existing quotes to avoid duplicates
existing_quotes = get_existing_quotes()
print(f"Found {len(existing_quotes)} existing quotes in database")

# Read and process CSV in chunks
print("\nProcessing CSV file in chunks...")
chunk_iterator = pd.read_csv(file_path, chunksize=CHUNK_SIZE, nrows=MAX_ROWS)
total_chunks = sum(1 for _ in pd.read_csv(file_path, chunksize=CHUNK_SIZE, nrows=MAX_ROWS))

for chunk in tqdm(chunk_iterator, total=total_chunks, desc="Processing chunks"):
    chunk = chunk.dropna(subset=['Quote'])
    chunk = chunk.where(pd.notnull(chunk), None)
    if not chunk.empty:
        process_chunk(chunk, client, existing_quotes)

print("Done!")

Initializing...
Found 421434 existing quotes in database

Processing CSV file in chunks...


Processing chunks:   0%|          | 0/207 [00:00<?, ?it/s]

Done!


In [9]:
# Generate reduced embeddings only for quotes that don't have them
print("\nRetrieving quotes without reduced embeddings...")
with SessionLocal() as db:
    quotes = db.query(QuoteDB).filter(QuoteDB.reduced_embeddings.is_(None)).all()
print(f"Retrieved {len(quotes)} quotes.")


Retrieving quotes without reduced embeddings...
Retrieved 421491 quotes.


In [13]:
if not quotes:
    print("No quotes need reduced embeddings")
else:
    embeddings = [quote.embeddings.to_list() for quote in quotes]

    print(f"\nGenerating reduced embeddings for {len(quotes)} quotes...")
    reduced_embeddings = fit_reducer(embeddings)

    mean = np.mean(reduced_embeddings, axis=0)
    std = np.std(reduced_embeddings, axis=0)
    standardized_embeddings = (reduced_embeddings - mean) / std

    # Update quotes with reduced embeddings
    print("\nUpdating quotes with reduced embeddings...")
    update_reduced_embeddings(quotes, standardized_embeddings)

    print("\nProcessing completed!")


Generating reduced embeddings for 421491 quotes...
[I] [13:20:06.335950] Unused keyword parameter: build_kws during cuML estimator initialization

Updating quotes with reduced embeddings...


Updating database:   0%|          | 0/206 [00:00<?, ?it/s]


Processing completed!
