In [29]:

from openai import OpenAI
import numpy as np
from dotenv import load_dotenv
import ast
import pandas as pd
import datetime
import os
import glob
import praw
import requests
import json
import math
import tqdm
import ipywidgets as widgets
from IPython.display import display
import time

load_dotenv()

client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"), 
)

def get_embedding(txts):
    response = client.embeddings.create(
        input=txts,
        model="text-embedding-3-small"
        )
    return response.data

# response = client.embeddings.create(
#     input="Your text string goes here",
#     model="text-embedding-3-small"
# # )

# print(response.data[0].embedding)
embeddings_dir = "embeddings/"

reddit_subs_limit = 1200
reddit_top_all_post_limit = 20
reddit_hot_post_limit = 20


In [20]:
def add_embeddings_for_titles(sub, titles, chunk_size=25):
    sub_csv_filename = f"{embeddings_dir}{sub}.pickle"
    if not os.path.exists(sub_csv_filename):
        existing_df = pd.DataFrame(columns=["Sub", "Title", "Embedding", "Added"])
    else:
        existing_df = pd.read_pickle(sub_csv_filename)

    existing = set(zip(existing_df["Sub"], existing_df["Title"]))
    new_titles = [t for t in titles if (sub, t) not in existing]
    for i in range(0, len(new_titles), chunk_size):
        chunk = new_titles[i:i+chunk_size]
        results = get_embedding(chunk)
        for title, res in zip(chunk, results):
            existing_df.loc[len(existing_df)] = [
                sub,
                title,
                res.embedding,
                datetime.datetime.now().isoformat()
            ]

    existing_df.to_pickle(sub_csv_filename)
    return {
        "fetched": len(new_titles),
        "skipped": len(titles) - len(new_titles),
    }



def get_combined_embeddings(only_subs):
    file_list = glob.glob(f"{embeddings_dir}*.pickle")
    all_dfs = []
    for file_path in file_list:
        if len(only_subs) > 0:
            should_read = False
            for sub in only_subs:
                if file_path.endswith(f"{sub}.pickle"):
                    should_read = True
                    break
            if not should_read:
                continue
        df_temp = pd.read_pickle(file_path)
        all_dfs.append(df_temp)
    combined_df = pd.concat(all_dfs, ignore_index=True)
    # combined_df["Embedding_np"] = combined_df["Embedding"].apply(lambda x: np.array(ast.literal_eval(x)))
    return combined_df


In [21]:
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
    user_agent='Python:com.findthesub:fetch-script'
)

def fetch_hot_and_top_posts(sub, config = {
    'hot_count': reddit_hot_post_limit,
    'top_all_count': reddit_top_all_post_limit
}):
    max_chunk = 100
    hot_count = config['hot_count']
    top_all_count = config['top_all_count']
    subreddit = reddit.subreddit(sub)
    collected = []
    for sort, limit, kwargs in [
        ('hot', hot_count, {}),
        ('top', top_all_count, {'time_filter':'all'})
    ]:
        after = None
        fetched = 0
        while fetched < limit:
            chunk_size = min(max_chunk, limit - fetched)
            gen = getattr(subreddit, sort)(limit=chunk_size, params={'after': after}, **kwargs)
            items = list(gen)
            if not items: break
            collected.extend(items)
            fetched += len(items)
            after = items[-1].name
            if len(items) < chunk_size: break
    return [p.title for p in collected]

    return [p.title for p in collected]

def get_top_subs(limit=reddit_subs_limit):
    max_chunk = 100
    collected = []
    after = None
    
    fetched = 0
    while fetched < limit:
        chunk_size = min(max_chunk, limit - fetched)
        gen = reddit.subreddits.popular(limit=chunk_size, params={'after': after})
        items = list(gen)
        if not items:
            break
        collected.extend(items)
        fetched += len(items)
        # For subreddits, the 'fullname' should be used as 'after'
        # but if that doesn't work, try items[-1].name or items[-1].id
        after = items[-1].fullname  
        if len(items) < chunk_size:
            break

    return [s.display_name for s in collected]


# Example usage:
# titles = fetch_reddit_post_titles_for_sub('python', limit=120, sort='new')
# print(titles)


In [28]:

progress = widgets.IntProgress(value=0, )
progress_label = widgets.Label(value=f"0/0 processed")
log_area = widgets.Textarea(
    value="",
    placeholder="Logs will appear here...",
    description="Logs",
    layout=widgets.Layout(width='100%', height='300px'),
    disabled=True,
)
display(progress_label, progress, log_area)

log_area.value += "Fetching top subreddits...\n"
subs_to_fetch = get_top_subs()
log_area.value += f"Fetched {len(subs_to_fetch)} subreddits\n\n"
progress.max = len(subs_to_fetch)
progress.value = 0
progress_label.value = f"0/{len(subs_to_fetch)} processed"


for sub in subs_to_fetch:
    progress.value += 1 
    progress_label.value = f"{progress.value}/{len(subs_to_fetch)} processed"
    
    # Update logs
    log_message = f"Processing '{sub}'...\n"
    titles = fetch_hot_and_top_posts(sub)
    log_message += f"Fetched {len(titles)} titles for {sub}\n"
    
    results = add_embeddings_for_titles(sub, titles)
    log_message += f"Added {results['fetched']} new embeddings, skipped {results['skipped']} existing\n\n"
    
    log_area.value += log_message  # Append to the textarea
    
    log_area_lines = log_area.value.split("\n")
    if len(log_area_lines) > 500:
        log_area.value = "\n".join(log_area_lines[-500:])

# Final step
df = get_combined_embeddings(subs_to_fetch)
df.to_pickle("embeddings.pickle")
log_area.value += "Processing complete. Embeddings saved to 'embeddings.pickle'.\n"

Label(value='0/0 processed')

IntProgress(value=0)

Textarea(value='', description='Logs', disabled=True, layout=Layout(height='300px', width='100%'), placeholder…

In [42]:
df = get_combined_embeddings([])
df.to_pickle("embeddings.pickle")

# Print numeber of embeddings
print(f"Total embeddings: {len(df)}")

# Limit to first 1000 embeddings
len(df)

Total embeddings: 48000


48000

In [None]:


# Configuration Parameters
API_URL = f"{os.environ.get("ZILLIZ_HOST")}/v2/vectordb/entities/insert"
HEADERS = {
    "Authorization": f"Bearer {os.environ.get('ZILLIZ_API_KEY')}",
    "Accept": "application/json",
    "Content-Type": "application/json"
}
COLLECTION_NAME = "FindTheSub"
CHUNK_SIZE = 250  # Set your desired chunk size here

def send_data_to_api(df, url, headers, collection_name, chunk_size=100):
    """
    Sends data from a DataFrame to the specified API in chunks.

    Parameters:
    - df (pd.DataFrame): DataFrame containing 'Sub', 'Title', and 'Embedding' columns.
    - url (str): The API endpoint URL.
    - headers (dict): HTTP headers for the request.
    - collection_name (str): Name of the collection in the API.
    - chunk_size (int): Number of records to send per API request.
    """
    
    total_records = len(df)
    total_chunks = math.ceil(total_records / chunk_size)
    
    print(f"Total records to send: {total_records}")
    print(f"Sending in {total_chunks} chunk(s) of up to {chunk_size} records each.\n")
    
    for chunk_num in range(total_chunks):
        start_idx = chunk_num * chunk_size
        end_idx = min(start_idx + chunk_size, total_records)
        chunk_df = df.iloc[start_idx:end_idx]
        
        # Prepare the data payload
        data_payload = []
        for idx, row in chunk_df.iterrows():
            entity = {
                # Calculate integer primary key based on sub + title
                "primary_key": hash(f"{row['Sub']}_{row['Title']}"),
                "sub": row['Sub'],
                "title": row['Title'],
                "vector": row['Embedding']
            }
            data_payload.append(entity)
        
        payload = {
            "collectionName": collection_name,
            "data": data_payload
        }
        
        try:
            response = requests.post(url, data=json.dumps(payload), headers=headers)
            response.raise_for_status()  # Raise an exception for HTTP errors
            response_data = response.json()
            print(f"Chunk {chunk_num + 1}/{total_chunks} (Records {start_idx} to {end_idx - 1}) sent successfully.")
            print(f"Response: {json.dumps(response_data, indent=2)[0:64]}\n")
        except requests.exceptions.HTTPError as http_err:
            print(f"HTTP error occurred for chunk {chunk_num + 1}: {http_err}")
            print(f"Response: {response.text}\n")
        except Exception as err:
            print(f"An error occurred for chunk {chunk_num + 1}: {err}\n")

# Example Usage:
# Assuming 'df' is your DataFrame with 'Sub', 'Title', and 'Embedding' columns
# Make sure 'Embedding' is a list of numerical values for each row

send_data_to_api(df, API_URL, HEADERS, COLLECTION_NAME, CHUNK_SIZE)

Total records to send: 48000
Sending in 192 chunk(s) of up to 250 records each.

Chunk 1/192 (Records 0 to 249) sent successfully.
Response: {
  "code": 0,
  "cost": 380,
  "data": {
    "insertCount": 250

Chunk 2/192 (Records 250 to 499) sent successfully.
Response: {
  "code": 0,
  "cost": 380,
  "data": {
    "insertCount": 250

Chunk 3/192 (Records 500 to 749) sent successfully.
Response: {
  "code": 0,
  "cost": 380,
  "data": {
    "insertCount": 250

Chunk 4/192 (Records 750 to 999) sent successfully.
Response: {
  "code": 0,
  "cost": 381,
  "data": {
    "insertCount": 250

Chunk 5/192 (Records 1000 to 1249) sent successfully.
Response: {
  "code": 0,
  "cost": 380,
  "data": {
    "insertCount": 250

Chunk 6/192 (Records 1250 to 1499) sent successfully.
Response: {
  "code": 0,
  "cost": 381,
  "data": {
    "insertCount": 250

Chunk 7/192 (Records 1500 to 1749) sent successfully.
Response: {
  "code": 0,
  "cost": 382,
  "data": {
    "insertCount": 250

Chunk 8/192 (Record

In [16]:
# Read the CSV file
df = pd.read_pickle('embeddings.pickle')

# Replace "Embedding" with "Embedding_np"
# df["Embedding_np"] = df["Embedding"].apply(lambda x: np.array(ast.literal_eval(x)))
# df = df.drop(columns=["Embedding"])  # Drop the original "Embedding" column
# df.rename(columns={"Embedding_np": "Embedding"}, inplace=True)  # Rename "Embedding_np" to "Embedding"

# # Write back to a Parquet file
# df.to_pickle('server/embeddings.pickle')



def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def get_closest_subs(df, txt, n=10):
    embedding = np.array(get_embedding([txt])[0].embedding)
    df['similarities'] = df.Embedding.apply(lambda x: cosine_similarity(x, embedding))
    # Aggregate similarities by sub, then return top n subs
    grouped = df.groupby('Sub')['similarities'].mean().reset_index()
    return grouped.sort_values('similarities', ascending=False).head(n)

df.loc[0]

Sub                                                 artificial
Title                                        AI has hit a wall
Embedding    [-0.01548602432012558, -0.010631220415234566, ...
Added                               2024-12-24T17:35:03.596191
Name: 0, dtype: object

In [None]:
# Notebook Cell

SEARCH_API_URL = f"{os.environ.get("ZILLIZ_HOST")}/v2/vectordb/entities/search"
SEARCH_HEADERS = {
    "Authorization": f"Bearer {os.environ.get('ZILLIZ_API_KEY')}",
    "Accept": "application/json",
    "Content-Type": "application/json"
}
COLLECTION_NAME = "FindTheSub"
LIMIT = 10
TOP_N = 5

def query_closest_subs(txt, limit=LIMIT, top_n=TOP_N):
    embeddings = get_embedding([txt])
    if not embeddings: raise ValueError("Failed to obtain embedding for the input text.")
    payload = {
        "collectionName": COLLECTION_NAME,
        "data": [embeddings[0].embedding],
        "limit": limit,
        "outputFields": ["sub"]
    }
    r = requests.post(SEARCH_API_URL, json.dumps(payload), headers=SEARCH_HEADERS)
    r.raise_for_status()
    df = pd.DataFrame(r.json().get('data', []))
    if df.empty: 
        return df
    grouped = df.groupby('sub')['distance'].mean().reset_index()
    return grouped.sort_values('distance', ascending=False).head(top_n)

result = query_closest_subs("Why did I get laid off before the holidays?", limit=1000)
result

Unnamed: 0,sub,distance
12,askSingapore,0.163058
17,mtg,0.148387
1,Adulting,0.14742
19,nyc,0.143336
3,LeopardsAteMyFace,0.142655
