In [21]:

import ast
import datetime
import glob
import hashlib
import json
import math
import os
import time

import ipywidgets as widgets
import numpy as np
import pandas as pd
import praw
import requests
import tqdm
from dotenv import load_dotenv
from IPython.display import display
from openai import OpenAI

# Populate os.environ from a local .env file (python-dotenv) before any key is read.
load_dotenv()

# OpenAI client shared by get_embedding(); the key comes from OPENAI_API_KEY.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"), 
)

def get_embedding(txts):
    """Embed one or more texts with the ``text-embedding-3-small`` model.

    `txts` is forwarded unchanged as the ``input`` of the embeddings request.
    Returns ``response.data``; callers in this notebook read ``.embedding``
    from each item of it.
    """
    embedding_response = client.embeddings.create(
        model="text-embedding-3-small",
        input=txts,
    )
    return embedding_response.data

# Example of a raw embeddings call, kept for reference:
# response = client.embeddings.create(
#     input="Your text string goes here",
#     model="text-embedding-3-small"
# )
# print(response.data[0].embedding)

# Path prefix of the per-subreddit pickle files; used as f"{embeddings_dir}{sub}.pickle".
embeddings_dir = "embeddings/"

# Default crawl sizes: number of subreddits requested by get_top_subs(), and
# number of posts requested per subreddit from the all-time "top" and the "hot"
# listings by fetch_hot_and_top_posts().
reddit_subs_limit = 2000
reddit_top_all_post_limit = 150
reddit_hot_post_limit = 50


In [22]:
def add_embeddings_for_posts(sub, posts, chunk_size=25):
    """Embed the titles of posts not yet stored for `sub` and persist them.

    Embeddings live in one pickle per subreddit at
    ``f"{embeddings_dir}{sub}.pickle"`` with the columns Id, Sub, Title,
    Embedding and Added. Titles are sent to `get_embedding` in batches of
    `chunk_size`. The file is (re)written even when nothing new was added.

    Args:
        sub: Subreddit name; used for the file name and the "Sub" column.
        posts: Iterable of objects exposing ``.id`` and ``.title``.
        chunk_size: Number of titles per embedding request.

    Returns:
        dict with "fetched" (rows newly embedded) and "skipped" (entries of
        `posts` that were not embedded because they were already stored or
        were repeated within `posts`).
    """
    sub_filename = f"{embeddings_dir}{sub}.pickle"
    if os.path.exists(sub_filename):
        # NOTE(review): unpickling can execute code; only load files written by this notebook.
        existing_df = pd.read_pickle(sub_filename)
    else:
        existing_df = pd.DataFrame(columns=["Id", "Sub", "Title", "Embedding", "Added"])

    posts = list(posts)

    # The same post can be handed in twice (e.g. once from the hot and once from
    # the top listing), so de-duplicate within the batch as well as against disk.
    seen_ids = set(existing_df["Id"])
    new_posts = []
    for post in posts:
        if post.id not in seen_ids:
            seen_ids.add(post.id)
            new_posts.append(post)

    # Collect rows first and append them in one go; enlarging the DataFrame
    # with .loc for every row re-allocates on each insert.
    new_rows = []
    for start in range(0, len(new_posts), chunk_size):
        chunk = new_posts[start:start + chunk_size]
        results = get_embedding([post.title for post in chunk])
        for post, res in zip(chunk, results):
            new_rows.append([
                post.id,
                sub,
                post.title,
                res.embedding,
                datetime.datetime.now().isoformat(),
            ])

    if new_rows:
        new_df = pd.DataFrame(new_rows, columns=list(existing_df.columns))
        if existing_df.empty:
            existing_df = new_df
        else:
            existing_df = pd.concat([existing_df, new_df], ignore_index=True)

    # Make sure the target directory exists on a first run.
    target_dir = os.path.dirname(sub_filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    existing_df.to_pickle(sub_filename)

    return {
        "fetched": len(new_posts),
        "skipped": len(posts) - len(new_posts),
    }



def get_combined_embeddings(only_subs, directory=None):
    """Load per-subreddit embedding pickles and concatenate them into one DataFrame.

    Args:
        only_subs: Subreddit names to include. Empty (or None) means every file.
        directory: Path prefix that holds the ``<sub>.pickle`` files, including
            its trailing separator. Defaults to the module-level `embeddings_dir`.

    Returns:
        A single DataFrame with a fresh RangeIndex. When no file matches, an
        empty DataFrame with the Id/Sub/Title/Embedding/Added columns.
    """
    prefix = embeddings_dir if directory is None else directory
    wanted = set(only_subs or [])
    frames = []
    for file_path in glob.glob(f"{prefix}*.pickle"):
        # Compare the whole file stem: a suffix test would let "nyc" also
        # select "asknyc.pickle".
        stem = os.path.splitext(os.path.basename(file_path))[0]
        if wanted and stem not in wanted:
            continue
        frames.append(pd.read_pickle(file_path))

    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame(columns=["Id", "Sub", "Title", "Embedding", "Added"])
    return pd.concat(frames, ignore_index=True)

def for_each_sub_embeddings():
    """Lazily yield the DataFrame stored in each ``*.pickle`` file under `embeddings_dir`.

    Files are visited in the order `glob.glob` returns them; each file is
    loaded only when the consumer asks for the next item.
    """
    pickle_paths = glob.glob(f"{embeddings_dir}*.pickle")
    yield from map(pd.read_pickle, pickle_paths)


In [23]:
# Reddit API client used by fetch_hot_and_top_posts() and get_top_subs().
# Only the app credentials are supplied (no username/password); both are read
# from the environment.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
    user_agent='Python:com.findthesub:fetch-script'
)

def fetch_hot_and_top_posts(sub, config=None):
    """Collect posts from a subreddit's "hot" listing followed by its all-time "top" listing.

    Args:
        sub: Subreddit name, passed to ``reddit.subreddit``.
        config: Optional dict with 'hot_count' and/or 'top_all_count'. Missing
            keys (or ``config=None``) fall back to `reddit_hot_post_limit` and
            `reddit_top_all_post_limit`.

    Returns:
        List of submissions: hot posts first, then top posts. A post present in
        both listings appears twice.
    """
    # Build the defaults at call time rather than sharing one dict object as
    # the default argument; this also lets a caller override a single key.
    settings = {
        'hot_count': reddit_hot_post_limit,
        'top_all_count': reddit_top_all_post_limit,
    }
    if config:
        settings.update(config)

    max_chunk = 200
    subreddit = reddit.subreddit(sub)
    collected = []
    listings = [
        ('hot', settings['hot_count'], {}),
        ('top', settings['top_all_count'], {'time_filter': 'all'}),
    ]
    for sort, limit, kwargs in listings:
        after = None
        fetched = 0
        while fetched < limit:
            chunk_size = min(max_chunk, limit - fetched)
            listing = getattr(subreddit, sort)(limit=chunk_size, params={'after': after}, **kwargs)
            items = list(listing)
            if not items:
                break
            collected.extend(items)
            fetched += len(items)
            # Continue after the last item seen; `fullname` is the same cursor
            # attribute get_top_subs() uses.
            after = items[-1].fullname
            if len(items) < chunk_size:
                # A short page means the listing is exhausted.
                break
    return collected

def get_top_subs(limit=reddit_subs_limit):
    """Return the display names of popular subreddits, paging until `limit` are collected.

    At most 100 subreddits are requested per listing call. Paging stops early
    when a page comes back empty or shorter than requested.
    """
    page_cap = 100
    subreddits = []
    cursor = None

    while len(subreddits) < limit:
        wanted = min(page_cap, limit - len(subreddits))
        page = list(reddit.subreddits.popular(limit=wanted, params={'after': cursor}))
        if not page:
            break
        subreddits.extend(page)
        # The last item's fullname is passed as 'after' on the next request.
        cursor = page[-1].fullname
        if len(page) < wanted:
            break

    return [entry.display_name for entry in subreddits]


# Smoke test: request 2 hot and 2 all-time-top posts from r/python and
# display their (title, id) pairs.
result = fetch_hot_and_top_posts('python', config = {
    'hot_count': 2,
    'top_all_count': 2
})
[(x.title, x.id) for x in result]


[("Sunday Daily Thread: What's everyone working on this week?", '1hjmlmy'),
 ('Thursday Daily Thread: Python Careers, Courses, and Furthering Education!',
  '1hmc71n'),
 ("Lad wrote a Python script to download Alexa voice recordings, he didn't expect this email.",
  'g53lxf'),
 ('This post has:', 'hoolsm')]

In [25]:

# Position in the popular-subreddit list at which this crawl starts; earlier
# entries are skipped (presumably already crawled by a previous run — set to 0
# for a full crawl).
SUBS_START_INDEX = 714
# Keep at most this many lines in the on-screen log.
MAX_LOG_LINES = 500

progress = widgets.IntProgress(value=0)
progress_label = widgets.Label(value="0/0 processed")
log_area = widgets.Textarea(
    value="",
    placeholder="Logs will appear here...",
    description="Logs",
    layout=widgets.Layout(width='100%', height='300px'),
    disabled=True,
)
display(progress_label, progress, log_area)

log_area.value += "Fetching top subreddits...\n"
subs_to_fetch = get_top_subs()[SUBS_START_INDEX:]
log_area.value += f"Fetched {len(subs_to_fetch)} subreddits\n\n"
progress.max = len(subs_to_fetch)
progress.value = 0
progress_label.value = f"0/{len(subs_to_fetch)} processed"

failed_subs = []
for sub in subs_to_fetch:
    log_message = f"Processing '{sub}'...\n"
    try:
        posts = fetch_hot_and_top_posts(sub)
        log_message += f"Fetched {len(posts)} titles for {sub}\n"

        results = add_embeddings_for_posts(sub, posts)
        log_message += f"Added {results['fetched']} new embeddings, skipped {results['skipped']} existing\n\n"
    except Exception as err:
        # One unreachable subreddit or failed request should not abort a crawl
        # over hundreds of subreddits; record it and move on. KeyboardInterrupt
        # is not an Exception subclass, so interrupting the cell still works.
        failed_subs.append(sub)
        log_message += f"FAILED '{sub}': {err!r}\n\n"

    # Count the subreddit only once its work is done, so the label stays truthful.
    progress.value += 1
    progress_label.value = f"{progress.value}/{len(subs_to_fetch)} processed"

    # Append to the textarea, keeping only the most recent lines.
    log_lines = (log_area.value + log_message).split("\n")
    log_area.value = "\n".join(log_lines[-MAX_LOG_LINES:])

# Final step
# df = get_combined_embeddings(subs_to_fetch)
# df.to_pickle("embeddings.pickle")
if failed_subs:
    log_area.value += f"{len(failed_subs)} subreddit(s) failed: {', '.join(failed_subs)}\n"
log_area.value += "Processing complete. Saved embeddings to 'embeddings/' directory\n"

Label(value='0/0 processed')

IntProgress(value=0)

Textarea(value='', description='Logs', disabled=True, layout=Layout(height='300px', width='100%'), placeholder…

In [26]:
# df = get_combined_embeddings([])
# df.to_pickle("embeddings.pickle")

# Print number of embeddings
# print(f"Total embeddings: {len(df)}")

# Limit to first 1000 embeddings
# len(df)

# Print "<subreddit> <row count>" for every stored embeddings file.
for sub_df in for_each_sub_embeddings():
    if sub_df.empty:
        # add_embeddings_for_posts() writes the frame even when a subreddit
        # yielded no posts; such a file has no "Sub" value and .iloc[0] would
        # raise IndexError.
        continue
    print(sub_df["Sub"].iloc[0], len(sub_df))

artificial 200
CasualPH 200
askgaybros 200
nyc 200
granturismo 200
RocketLeague 200
LogitechG 200
distressingmemes 200
anime 200
minnesota 200
keyboards 200
TIHI 200
Tekken 200
LooksmaxingAdvice 200
StreetFighter 200
cycling 200
holdmycatnip 200
hiphopheads 200
wow 200
nutrition 200
Sneakers 200
starterpacks 200
90DayFiance 200
EASportsFC 200
mtg 200
OnePieceSpoilers 200
skeptic 200
pyrocynical 200
EDM 200
LeopardsAteMyFace 200
singapore 200
reddevils 200
askSingapore 200
rva 200
AskPhotography 200
MonsterHunterWorld 200
hvacadvice 200
Adulting 200
Pathfinder_RPG 200
FinancialPlanning 200
indonesia 200
comics 200
QuebecLibre 200
EmulationOnAndroid 200
Anxiety 200
ClashRoyale 200
Netherlands 200
CatAdvice 200
food 200
truespotify 200
wholesomeyuri 200
unrealengine 200
jailbreak 200
analog 200
GenX 200
webdev 200
Catholicism 200
FifaCareers 200
Jujutsushi 200
ProductManagement 200
touhou 200
OculusQuest 200
ik_ihe 200
teslamotors 200
BudgetAudiophile 200
SquaredCircle 200
LinkedInLunatic

KeyboardInterrupt: 

In [27]:
import os, math, json, requests
import ipywidgets as widgets
from IPython.display import display

# Create and display widgets
# NOTE: this rebinds `progress`, `progress_label` and `log_area`, which the
# crawl cell earlier in the notebook also defines.
progress = widgets.IntProgress(value=0, description='Chunks')
progress_label = widgets.Label(value="0/0 processed")
log_area = widgets.Textarea(
    value="",
    placeholder="Logs will appear here...",
    description="Logs",
    layout=widgets.Layout(width='100%', height='300px'),
    disabled=True
)
display(progress_label, progress, log_area)

# Configuration Parameters
# NOTE(review): os.environ.get returns None for an unset variable, which would
# be formatted into the URL / header as the text "None" — confirm ZILLIZ_HOST
# and ZILLIZ_API_KEY are set before running the upload.
API_URL = f"{os.environ.get('ZILLIZ_HOST')}/v2/vectordb/entities/insert"
HEADERS = {
    "Authorization": f"Bearer {os.environ.get('ZILLIZ_API_KEY')}",
    "Accept": "application/json",
    "Content-Type": "application/json"
}
# Target collection for the inserted entities.
COLLECTION_NAME = "FindTheSub"
# Rows per insert request (passed as chunk_size to send_data_to_api).
CHUNK_SIZE = 250

def _stable_primary_key(sub, title):
    """Derive a signed 64-bit key from sub + title that is identical in every Python process."""
    digest = hashlib.sha256(f"{sub}_{title}".encode("utf-8")).digest()
    return int.from_bytes(digest[:8], byteorder="big", signed=True)


def _emit_log(msg, log_area=None):
    """Append `msg` to the log widget when one is given, otherwise print it."""
    if log_area is not None:
        log_area.value += msg
    else:
        print(msg, end="")


def send_data_to_api(df, url, headers, collection_name, chunk_size=100,
                     progress=None, progress_label=None, log_area=None, timeout=60):
    """
    Sends data from a DataFrame to the specified API in chunks.

    Each row becomes one entity with the fields primary_key, post_id, sub,
    title and vector. The primary key is a SHA-256 based hash of
    "<Sub>_<Title>": the built-in hash() is randomized per interpreter process
    for strings, so it produced different keys for the same row after every
    kernel restart.

    A failed chunk is logged and the remaining chunks are still sent.

    - df (pd.DataFrame): DataFrame containing 'Id', 'Sub', 'Title', and 'Embedding' columns.
    - url (str): The API endpoint URL.
    - headers (dict): HTTP headers for the request.
    - collection_name (str): Name of the collection in the API.
    - chunk_size (int): Number of records to send per API request (must be > 0).
    - progress: optional widget; its `max`/`value` track chunks sent.
    - progress_label: optional widget; its `value` shows "<done>/<total> processed".
    - log_area: optional widget; messages are appended to its `value`
      (without it they are printed).
    - timeout (float): seconds to wait for each HTTP request.

    Raises ValueError if chunk_size is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    total_records = len(df)
    total_chunks = math.ceil(total_records / chunk_size)

    _emit_log(
        f"Total records to send: {total_records}\n"
        f"Sending in {total_chunks} chunk(s) of up to {chunk_size} records each.\n\n",
        log_area,
    )
    if progress is not None:
        progress.max = total_chunks
        progress.value = 0
    if progress_label is not None:
        progress_label.value = f"0/{total_chunks} processed"

    for chunk_num in range(total_chunks):
        start_idx = chunk_num * chunk_size
        end_idx = min(start_idx + chunk_size, total_records)
        chunk_df = df.iloc[start_idx:end_idx]

        data_payload = []
        for row in chunk_df.to_dict("records"):
            vector = row['Embedding']
            if hasattr(vector, "tolist"):
                # numpy arrays are not JSON serializable; plain lists are.
                vector = vector.tolist()
            data_payload.append({
                "primary_key": _stable_primary_key(row['Sub'], row['Title']),
                "post_id": row['Id'],
                "sub": row['Sub'],
                "title": row['Title'],
                "vector": vector,
            })

        payload = {
            "collectionName": collection_name,
            "data": data_payload
        }

        try:
            response = requests.post(url, data=json.dumps(payload), headers=headers, timeout=timeout)
            response.raise_for_status()
            response_data = response.json()

            # NOTE(review): the service can answer HTTP 200 with an error
            # reported in the JSON body's "code" field; 0 (and 200, used by the
            # older API version) is treated as success — confirm against the
            # API reference.
            api_code = response_data.get("code", 0) if isinstance(response_data, dict) else 0
            if api_code in (0, 200):
                msg = (
                    f"Chunk {chunk_num + 1}/{total_chunks} (Records {start_idx} to {end_idx - 1}) sent successfully.\n"
                    f"Response: {json.dumps(response_data, indent=2)[0:64]}\n\n"
                )
            else:
                msg = (
                    f"API rejected chunk {chunk_num + 1}/{total_chunks} (code {api_code}): "
                    f"{response_data.get('message')}\n\n"
                )
        except requests.exceptions.HTTPError as http_err:
            error_body = http_err.response.text if http_err.response is not None else ""
            msg = (f"HTTP error occurred for chunk {chunk_num + 1}: {http_err}\n"
                   f"Response: {error_body}\n\n")
        except Exception as err:
            # Best effort: report the failure (timeouts included) and keep going.
            msg = f"An error occurred for chunk {chunk_num + 1}: {err}\n\n"

        _emit_log(msg, log_area)
        if progress is not None:
            progress.value = chunk_num + 1
        if progress_label is not None:
            progress_label.value = f"{chunk_num + 1}/{total_chunks} processed"


# Push every stored per-subreddit frame to the vector DB, one frame at a time.
# The progress bar is sized from the pickle files actually on disk rather than
# from `subs_to_fetch`, which belongs to the crawl cell and need not match them.
embedding_file_count = len(glob.glob(f"{embeddings_dir}*.pickle"))
progress.value = 0
progress.max = embedding_file_count
progress_label.value = f"0/{embedding_file_count} processed"
for sub_df in for_each_sub_embeddings():
    if not sub_df.empty:
        # An empty frame has no "Sub" value to read (.iloc[0] would raise
        # IndexError) and nothing to upload.
        log_area.value += f"Sending data for '{sub_df['Sub'].iloc[0]}' to the API...\n"
        send_data_to_api(sub_df, API_URL, HEADERS, COLLECTION_NAME, CHUNK_SIZE, log_area=log_area)
    progress.value += 1
    progress_label.value = f"{progress.value}/{progress.max} processed"

Label(value='0/0 processed')

IntProgress(value=0, description='Chunks')

Textarea(value='', description='Logs', disabled=True, layout=Layout(height='300px', width='100%'), placeholder…

In [16]:
# Load the combined embeddings DataFrame from its pickle file
df = pd.read_pickle('embeddings.pickle')

# Replace "Embedding" with "Embedding_np"
# df["Embedding_np"] = df["Embedding"].apply(lambda x: np.array(ast.literal_eval(x)))
# df = df.drop(columns=["Embedding"])  # Drop the original "Embedding" column
# df.rename(columns={"Embedding_np": "Embedding"}, inplace=True)  # Rename "Embedding_np" to "Embedding"

# # Write the converted frame back out as a pickle for the server
# df.to_pickle('server/embeddings.pickle')



def cosine_similarity(a, b):
    """Return the dot product of `a` and `b` divided by the product of their L2 norms."""
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

def get_closest_subs(df, txt, n=10):
    """Rank subreddits by the mean cosine similarity of their stored embeddings to `txt`.

    Args:
        df: DataFrame with 'Sub' and 'Embedding' columns. It is not modified.
        txt: Query text; embedded with `get_embedding`.
        n: Maximum number of subreddits to return.

    Returns:
        DataFrame with the columns 'Sub' and 'similarities', best match first.
    """
    query_embedding = np.array(get_embedding([txt])[0].embedding)
    # Keep the scores in their own Series so the caller's frame does not gain
    # (or have overwritten) a 'similarities' column as a side effect.
    similarities = df['Embedding'].apply(lambda emb: cosine_similarity(emb, query_embedding))
    # Aggregate similarities by sub, then return top n subs
    grouped = similarities.groupby(df['Sub']).mean().rename('similarities').reset_index()
    return grouped.sort_values('similarities', ascending=False).head(n)

# Display the row labelled 0 to check which columns the loaded frame has.
df.loc[0]

Sub                                                 artificial
Title                                        AI has hit a wall
Embedding    [-0.01548602432012558, -0.010631220415234566, ...
Added                               2024-12-24T17:35:03.596191
Name: 0, dtype: object

In [None]:
# Notebook Cell

# Search endpoint and headers for the vector DB.
# The environment lookup uses single quotes inside the f-string: reusing the
# enclosing double quote is a SyntaxError on Python versions before 3.12.
SEARCH_API_URL = f"{os.environ.get('ZILLIZ_HOST')}/v2/vectordb/entities/search"
SEARCH_HEADERS = {
    "Authorization": f"Bearer {os.environ.get('ZILLIZ_API_KEY')}",
    "Accept": "application/json",
    "Content-Type": "application/json"
}
COLLECTION_NAME = "FindTheSub"
# Default number of hits requested per search, and number of subreddits returned.
LIMIT = 10
TOP_N = 5

def query_closest_subs(txt, limit=LIMIT, top_n=TOP_N, timeout=30):
    """Embed `txt`, search the vector DB, and rank subreddits by their mean hit score.

    Args:
        txt: Query text; embedded with `get_embedding`.
        limit: Number of nearest entities requested from the search endpoint.
        top_n: Maximum number of subreddits to return.
        timeout: Seconds to wait for the search request.

    Returns:
        DataFrame with the columns 'sub' and 'distance' (mean over that sub's
        hits), sorted descending; an empty DataFrame when there are no hits.

    Raises:
        ValueError: if no embedding could be obtained for `txt`.
        requests.exceptions.HTTPError: on a non-2xx HTTP status.
        RuntimeError: if the response body reports an error code.
    """
    embeddings = get_embedding([txt])
    if not embeddings:
        raise ValueError("Failed to obtain embedding for the input text.")

    payload = {
        "collectionName": COLLECTION_NAME,
        "data": [embeddings[0].embedding],
        "limit": limit,
        "outputFields": ["sub"]
    }
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.post(SEARCH_API_URL, data=json.dumps(payload), headers=SEARCH_HEADERS, timeout=timeout)
    r.raise_for_status()
    body = r.json()

    # NOTE(review): the service can answer HTTP 200 with an error reported in
    # the body's "code" field; 0 (and 200, used by the older API version) is
    # treated as success — confirm against the API reference. Without this
    # check an API error looked exactly like "no results".
    api_code = body.get('code', 0)
    if api_code not in (0, 200):
        raise RuntimeError(f"Vector search failed (code {api_code}): {body.get('message')}")

    df = pd.DataFrame(body.get('data', []))
    if df.empty:
        return df
    grouped = df.groupby('sub')['distance'].mean().reset_index()
    # NOTE(review): descending order assumes a larger 'distance' means a closer
    # match (similarity-style metric) — confirm against the collection's metric type.
    return grouped.sort_values('distance', ascending=False).head(top_n)

# Request up to 1000 nearest posts, then display the top subreddits by mean score.
result = query_closest_subs("Why did I get laid off before the holidays?", limit=1000)
result

Unnamed: 0,sub,distance
12,askSingapore,0.163058
17,mtg,0.148387
1,Adulting,0.14742
19,nyc,0.143336
3,LeopardsAteMyFace,0.142655
