In [24]:

from openai import OpenAI
import numpy as np
from dotenv import load_dotenv
import ast
import pandas as pd
import datetime
import os
import glob
import praw
import tqdm
import ipywidgets as widgets
from IPython.display import display
import time

# Load variables from a local .env file (if any) before credentials are read.
load_dotenv()

# Shared OpenAI client used by get_embedding(); the API key is read from the
# OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

def get_embedding(txts):
    """Request embeddings for one or more texts with ``text-embedding-3-small``.

    Args:
        txts: A string or a list of strings, forwarded as ``input`` to
            ``client.embeddings.create``.

    Returns:
        ``response.data`` from the API call; callers in this notebook read the
        ``.embedding`` attribute of each entry. An empty list or tuple returns
        ``[]`` immediately without contacting the API.
    """
    # Nothing to embed: avoid sending an empty request over the network.
    if isinstance(txts, (list, tuple)) and len(txts) == 0:
        return []

    response = client.embeddings.create(
        input=txts,
        model="text-embedding-3-small",
    )
    return response.data

# Example of calling the embeddings API directly:
# response = client.embeddings.create(
#     input="Your text string goes here",
#     model="text-embedding-3-small"
# )
#
# print(response.data[0].embedding)
# Directory prefix for the per-subreddit pickle files ("<embeddings_dir><sub>.pickle").
embeddings_dir = "embeddings/"

# Default number of popular subreddits requested by get_top_subs().
reddit_subs_limit = 1200
# Default number of all-time top posts fetched per subreddit.
reddit_top_all_post_limit = 20
# Default number of hot posts fetched per subreddit.
reddit_hot_post_limit = 20


In [25]:
def add_embeddings_for_titles(sub, titles, chunk_size=25):
    """Embed the titles not yet stored for ``sub`` and persist them to its pickle.

    The per-subreddit file lives at ``f"{embeddings_dir}{sub}.pickle"`` and holds
    a DataFrame with columns ``Sub``, ``Title``, ``Embedding`` and ``Added``.

    Args:
        sub: Subreddit name; used both as the ``Sub`` value and in the file name.
        titles: Iterable of post titles. Titles already stored for ``sub``, and
            repeats within ``titles`` itself, are not embedded again.
        chunk_size: Number of titles sent to ``get_embedding`` per request.

    Returns:
        Dict with ``"fetched"`` (number of titles newly embedded) and
        ``"skipped"`` (``len(titles)`` minus that number).
    """
    columns = ["Sub", "Title", "Embedding", "Added"]
    sub_pickle_path = f"{embeddings_dir}{sub}.pickle"

    if os.path.exists(sub_pickle_path):
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this notebook produced itself.
        existing_df = pd.read_pickle(sub_pickle_path)
    else:
        existing_df = pd.DataFrame(columns=columns)

    existing = set(zip(existing_df["Sub"], existing_df["Title"]))
    # dict.fromkeys de-duplicates while keeping first-seen order, so a title
    # that appears twice in `titles` is embedded and stored only once.
    new_titles = list(dict.fromkeys(t for t in titles if (sub, t) not in existing))

    # Collect rows as dicts keyed by column name and append them in one go:
    # this avoids growing the frame row by row and does not depend on the
    # stored frame's column order or index.
    records = []
    for start in range(0, len(new_titles), chunk_size):
        chunk = new_titles[start:start + chunk_size]
        for title, res in zip(chunk, get_embedding(chunk)):
            records.append({
                "Sub": sub,
                "Title": title,
                "Embedding": res.embedding,
                "Added": datetime.datetime.now().isoformat(),
            })

    if records:
        new_rows = pd.DataFrame(records, columns=columns)
        if existing_df.empty:
            existing_df = new_rows
        else:
            existing_df = pd.concat([existing_df, new_rows], ignore_index=True)

    # Make sure the target directory exists before writing.
    out_dir = os.path.dirname(sub_pickle_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    existing_df.to_pickle(sub_pickle_path)

    return {
        "fetched": len(new_titles),
        "skipped": len(titles) - len(new_titles),
    }



def get_combined_embeddings(only_subs):
    """Concatenate the per-subreddit embedding pickles into one DataFrame.

    Args:
        only_subs: Collection of subreddit names to include. When empty (or
            ``None``), every ``*.pickle`` file under ``embeddings_dir`` is read.

    Returns:
        A DataFrame with a fresh 0..n-1 index containing the rows of all
        selected files. If no file is selected (or all are empty), an empty
        DataFrame with columns ``Sub``, ``Title``, ``Embedding``, ``Added``.
    """
    suffix = ".pickle"
    has_filter = only_subs is not None and len(only_subs) > 0
    wanted = set(only_subs) if has_filter else None

    frames = []
    # Sorted so the row order of the result does not depend on glob ordering.
    for file_path in sorted(glob.glob(f"{embeddings_dir}*{suffix}")):
        # Compare the exact file stem: a suffix match would let "pics" also
        # select "epics.pickle".
        sub_name = os.path.basename(file_path)[:-len(suffix)]
        if wanted is not None and sub_name not in wanted:
            continue
        # NOTE: unpickling can execute arbitrary code; only load files that
        # this notebook produced itself.
        frame = pd.read_pickle(file_path)
        if not frame.empty:
            frames.append(frame)

    if not frames:
        # pd.concat raises on an empty list; return an empty, typed-by-name frame.
        return pd.DataFrame(columns=["Sub", "Title", "Embedding", "Added"])
    return pd.concat(frames, ignore_index=True)


In [27]:
# Reddit API client shared by the fetch helpers in this notebook; the
# credentials come from REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET in the environment.
reddit = praw.Reddit(
    user_agent="Python:com.findthesub:fetch-script",
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET"),
    client_id=os.environ.get("REDDIT_CLIENT_ID"),
)

def fetch_hot_and_top_posts(sub, config=None):
    """Collect post titles from a subreddit's "hot" and all-time "top" listings.

    Args:
        sub: Subreddit name passed to ``reddit.subreddit``.
        config: Optional dict with the keys ``'hot_count'`` and
            ``'top_all_count'``. ``None`` or a missing key falls back to
            ``reddit_hot_post_limit`` / ``reddit_top_all_post_limit``.

    Returns:
        List of post titles, hot posts first, then top posts. A post present in
        both listings appears twice.
    """
    # Resolve defaults at call time instead of sharing a mutable default dict.
    config = config or {}
    hot_count = config.get('hot_count', reddit_hot_post_limit)
    top_all_count = config.get('top_all_count', reddit_top_all_post_limit)

    max_chunk = 100  # largest page requested per listing call
    subreddit = reddit.subreddit(sub)
    collected = []
    for sort, limit, kwargs in [
        ('hot', hot_count, {}),
        ('top', top_all_count, {'time_filter': 'all'}),
    ]:
        after = None
        fetched = 0
        while fetched < limit:
            chunk_size = min(max_chunk, limit - fetched)
            listing = getattr(subreddit, sort)(
                limit=chunk_size, params={'after': after}, **kwargs
            )
            items = list(listing)
            if not items:
                break
            collected.extend(items)
            fetched += len(items)
            # Continue the next page after the last item seen on this one.
            after = items[-1].name
            if len(items) < chunk_size:
                # A short page means the listing is exhausted.
                break
    return [p.title for p in collected]

def get_top_subs(limit=reddit_subs_limit):
    """Return display names of up to ``limit`` subreddits from the popular listing.

    Pages through ``reddit.subreddits.popular`` in requests of at most 100
    entries, stopping when ``limit`` is reached, a page comes back empty, or a
    page is shorter than requested.
    """
    page_cap = 100
    subs = []
    cursor = None
    remaining = limit

    while remaining > 0:
        wanted = min(page_cap, remaining)
        page = list(reddit.subreddits.popular(limit=wanted, params={'after': cursor}))
        if not page:
            break
        subs += page
        remaining -= len(page)
        # The last entry's fullname is used as the 'after' cursor for the next
        # page (if that ever stops working, .name or .id are the alternatives to try).
        cursor = page[-1].fullname
        if len(page) < wanted:
            break

    return [entry.display_name for entry in subs]


# Example usage:
# titles = fetch_hot_and_top_posts('python')
# print(titles)


In [28]:

# Progress bar, counter label and a read-only log box for the long-running fetch.
progress = widgets.IntProgress(value=0)
progress_label = widgets.Label(value="0/0 processed")
log_area = widgets.Textarea(
    value="",
    placeholder="Logs will appear here...",
    description="Logs",
    disabled=True,
    layout=widgets.Layout(width='100%', height='300px'),
)
display(progress_label, progress, log_area)

# Step 1: list the subreddits to process.
log_area.value += "Fetching top subreddits...\n"
subs_to_fetch = get_top_subs()
total_subs = len(subs_to_fetch)
log_area.value += f"Fetched {total_subs} subreddits\n\n"
progress.max = total_subs
progress.value = 0
progress_label.value = f"0/{total_subs} processed"

# Step 2: fetch titles and store embeddings, one subreddit at a time.
for position, sub in enumerate(subs_to_fetch, start=1):
    progress.value = position
    progress_label.value = f"{position}/{total_subs} processed"

    titles = fetch_hot_and_top_posts(sub)
    results = add_embeddings_for_titles(sub, titles)

    # The log entry for a subreddit is appended only after both steps finished.
    log_message = (
        f"Processing '{sub}'...\n"
        f"Fetched {len(titles)} titles for {sub}\n"
        f"Added {results['fetched']} new embeddings, skipped {results['skipped']} existing\n\n"
    )
    log_area.value += log_message

    # Keep only the most recent 500 lines so the widget does not grow unbounded.
    recent_lines = log_area.value.split("\n")
    if len(recent_lines) > 500:
        log_area.value = "\n".join(recent_lines[-500:])

# Step 3: merge the per-subreddit files into a single pickle.
# NOTE(review): a later cell loads 'server/embeddings.pickle', not this path —
# confirm how the file gets there.
df = get_combined_embeddings(subs_to_fetch)
df.to_pickle("embeddings.pickle")
log_area.value += "Processing complete. Embeddings saved to 'embeddings.pickle'.\n"

Label(value='0/0 processed')

IntProgress(value=0)

Textarea(value='', description='Logs', disabled=True, layout=Layout(height='300px', width='100%'), placeholder…

In [29]:
# Load the embeddings DataFrame from its pickle file.
# NOTE(review): the preceding cell writes "embeddings.pickle" in the working
# directory, while this reads "server/embeddings.pickle" — confirm the file is
# copied there, or align the two paths.
df = pd.read_pickle('server/embeddings.pickle')

# Disabled one-off conversion: parse string-encoded "Embedding" values into
# NumPy arrays and replace the original column with the parsed one.
# df["Embedding_np"] = df["Embedding"].apply(lambda x: np.array(ast.literal_eval(x)))
# df = df.drop(columns=["Embedding"])  # Drop the original "Embedding" column
# df.rename(columns={"Embedding_np": "Embedding"}, inplace=True)  # Rename "Embedding_np" to "Embedding"

# # Write the converted frame back to the pickle file
# df.to_pickle('server/embeddings.pickle')



def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: First vector (array-like of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of dividing by
        zero (which would yield NaN and a runtime warning).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

def get_closest_subs(df, txt, n=10):
    """Rank subreddits by the mean similarity of their stored titles to ``txt``.

    Args:
        df: DataFrame with a ``Sub`` column and an ``Embedding`` column holding
            one embedding vector per row. It is not modified.
        txt: Query text; embedded via ``get_embedding``.
        n: Number of subreddits to return.

    Returns:
        DataFrame with columns ``Sub`` and ``similarities`` (mean cosine
        similarity per subreddit), sorted descending, at most ``n`` rows.
    """
    embedding = np.array(get_embedding([txt])[0].embedding)
    similarities = df["Embedding"].apply(lambda x: cosine_similarity(x, embedding))
    # Score in a separate frame so the caller's DataFrame does not silently
    # gain a "similarities" column on every call.
    scored = pd.DataFrame({"Sub": df["Sub"], "similarities": similarities})
    # Aggregate similarities by sub, then return the top n subs.
    grouped = scored.groupby("Sub")["similarities"].mean().reset_index()
    return grouped.sort_values("similarities", ascending=False).head(n)

# Show the row labelled 0 as a quick check of the loaded columns.
df.loc[0]

Sub                                                 grandorder
Title           [Help and Question Thread] - December 22, 2024
Added                               2024-12-22T15:08:53.756286
Embedding    [0.028386393561959267, 0.004223015159368515, -...
Name: 0, dtype: object

In [34]:
# Rank subreddits against a sample query and display the result table.
subs = get_closest_subs(df, 'Latest leaks show crazy realism')
subs
# subs.to_json(orient='records')
# subs.to_dict(orient='records')


Unnamed: 0,Sub,similarities
190,GamingLeaksAndRumours,0.303896
116,CrackWatch,0.280385
498,anime_irl,0.271308
460,Unity3D,0.259348
188,Games,0.253797
169,FiftyFifty,0.253734
337,PS5,0.252305
485,XboxSeriesX,0.246258
972,virtualreality,0.24482
224,HonkaiStarRail_leaks,0.243416
