# interests extraction

In [None]:
from openai import AsyncOpenAI
import httpx

# Upper bound on simultaneous LLM requests; also used below to size the HTTP
# connection pool.
# NOTE(review): `x` is not defined in the visible code — presumably a
# placeholder that must be replaced with an integer before running.
MAX_CONCURRENCY = x

# Async OpenAI-compatible client that talks to the endpoint in `base_url`
# through an httpx client whose pool is capped at MAX_CONCURRENCY connections.
custom_client = AsyncOpenAI(
    base_url="https://wao06rxq3acms1-8000.proxy.runpod.net/v1",
    http_client=httpx.AsyncClient(
        timeout=10 * 60,  # httpx timeouts are given in seconds: 10 minutes
        limits=httpx.Limits(
            max_keepalive_connections=MAX_CONCURRENCY,
            max_connections=MAX_CONCURRENCY,
        ),
    ),
)

# Running tallies updated by summarize_interests:
# `errors` counts calls that raised, `no_match` counts replies without a
# bracketed list.
errors = 0
no_match = 0

async def summarize_interests(data, prompt_1, prompt_2):
    """Ask the LLM to summarize ``data`` and return the interests it lists.

    Two chat completions are sent through the module-level ``custom_client``:
    the first sends ``prompt_1 + data``; the second replays that exchange and
    adds ``prompt_2`` as a follow-up user turn. The first ``[...]`` span in
    the second reply is then split on commas.

    Args:
        data: Raw text appended to ``prompt_1``.
        prompt_1: Summarization prompt, sent as the first user turn.
        prompt_2: Parsing prompt, sent as the follow-up user turn.

    Returns:
        A list of strings with all single and double quotes removed,
        surrounding whitespace stripped and empty entries dropped. Returns
        ``[]`` when the reply has no bracketed list (``no_match`` is
        incremented) or when anything raises (``errors`` is incremented).
    """
    # Imported locally so the function does not depend on another cell
    # having imported `re`.
    import re

    global no_match, errors

    model_name = "mistralai/Mistral-7B-Instruct-v0.2"
    first_turn = {"role": "user", "content": prompt_1 + data}

    try:
        answer1 = await custom_client.chat.completions.create(
            model=model_name,
            messages=[first_turn],
        )

        answer2 = await custom_client.chat.completions.create(
            model=model_name,
            messages=[
                first_turn,
                {"role": "assistant", "content": answer1.choices[0].message.content},
                {"role": "user", "content": prompt_2},
            ],
        )

        raw = answer2.choices[0].message.content

        # Extract the content between the first pair of square brackets.
        # DOTALL lets the list span several lines, which the model may emit.
        match = re.search(r"\[(.*?)\]", raw, re.DOTALL)
        if not match:
            no_match += 1
            return []

        unquoted = match.group(1).replace("\"", "").replace("'", "")
        # Strip each item and drop empties so "[]" yields [] rather than [""]
        # and '["a", "b"]' yields ["a", "b"] rather than ["a", " b"].
        return [item.strip() for item in unquoted.split(",") if item.strip()]

    except Exception:
        # Deliberate best-effort: one failed request must not abort the whole
        # batch, so the failure is only counted.
        errors += 1
        return []
  
# For each day, split the raw titles into chunks of at most 35 records; each chunk becomes one LLM call

from collections import defaultdict
from tqdm.asyncio import tqdm_asyncio


# Number of titles joined into a single LLM request.
chunk_size = 35

# NOTE(review): `interests` is not used anywhere in the visible code.
interests = defaultdict(list)
# Maps each date to the summarize_interests coroutines created for that date;
# they are created here but awaited later.
tasks_dict = defaultdict(list)

for filename in get_filenames():
    df = pd.read_csv(filename)
    # Base name of the file up to its first dot — presumably the date; confirm
    # against the naming scheme of get_filenames().
    date = filename.split("/")[-1].split(".")[0]

    inputs = df["title"].tolist()

    for i in range(0, len(inputs), chunk_size):
        # TODO: change this code to use the two prompt sets from the Google doc
        # NOTE(review): summarize_interests takes (data, prompt_1, prompt_2) but
        # only `data` is passed here, so this call raises TypeError until the
        # prompts are supplied.
        tasks_dict[date].append(summarize_interests("\n".join(inputs[i:i+chunk_size])))


# Code to run the tasks in parallel, while keeping the date information for each task

from asyncio import Semaphore

# Flat list of all coroutines across dates, each wrapped by wrap_task_with_date
# so it is semaphore-gated and returns its date alongside its result.
wrapped_tasks = []

async def wrap_task_with_date(sem, date, t):
    """Await ``t`` while holding ``sem`` and pair its result with ``date``.

    Args:
        sem: Semaphore limiting how many wrapped tasks run at once.
        date: Label returned unchanged next to the result.
        t: Awaitable to run once the semaphore has been acquired.

    Returns:
        The tuple ``(date, result_of_t)``.
    """
    await sem.acquire()
    try:
        outcome = await t
    finally:
        # Always give the slot back, even if the awaited task raises.
        sem.release()
    return (date, outcome)

# A single shared semaphore caps how many wrapped tasks run at the same time.
sem = Semaphore(MAX_CONCURRENCY)
for date, tasks in tasks_dict.items():
    # Tag every coroutine of this date with the date and gate it on `sem`.
    wrapped_tasks.extend(
        wrap_task_with_date(sem, date, task) for task in tasks
    )

import json

results_dict = defaultdict(list)
results = await tqdm_asyncio.gather(*wrapped_tasks, smoothing=0)
    
for date, result in results:
    results_dict[date].extend(result)
    json.dump(results_dict[date], open(f"../_data/interests/{date}.json", "w"))

# TODO: separate the results from prompt set A and B

# NOTE(review): interests_a and interests_b are not defined in the visible
# code, and `df` here appears to be whatever DataFrame the last iteration of
# the file loop left behind — confirm both before running this cell.
df["interests_a"] = interests_a
df["interests_b"] = interests_b

# embeddings

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the sentence-embedding model once; it is reused for every row below.
model = SentenceTransformer("Salesforce/SFR-Embedding-Mistral")

# TODO: run this separately for the results of each of the two prompt sets

# One model.encode() call per entry of the "interests_a" column, in row order.
embeddings = [model.encode(interest) for interest in df["interests_a"]]

df["embeddings_a"] = embeddings
# Ellipsis placeholder until the prompt-set-B embeddings are computed.
df["embeddings_b"] = ...

# clustering

In [None]:
from cuml.metrics import pairwise_distances
from hdbscan import HDBSCAN
import numpy as np
import cupy as cp  
import cuml

# Collect the per-row embeddings from the "embeddings_a" column into a single
# CuPy array.
# NOTE(review): assumes every row holds a vector of the same length — confirm.
embeddings_gpu = cp.asarray(df["embeddings_a"].to_list())

# Reduce the embeddings to 100 components with cuML's UMAP (cosine metric)
# before clustering.
umap_model = cuml.UMAP(n_neighbors=15,
                       n_components=100, 
                       min_dist=0.1, 
                       metric='cosine')
reduced_data_gpu = umap_model.fit_transform(embeddings_gpu)

# Pairwise cosine distances between the reduced points; this matrix is what
# the clusterer below consumes, since it is built with metric="precomputed".
cosine_dist = pairwise_distances(reduced_data_gpu, metric='cosine')

clusterer = HDBSCAN(min_cluster_size=5, 
                    gen_min_span_tree=True,
                    metric="precomputed",
                    cluster_selection_epsilon=0.02) 
# The distances are cast to float64 and passed through .get() before fitting.
# NOTE(review): presumably cosine_dist is a CuPy array and .get() copies it to
# a host NumPy array for the `hdbscan` package — confirm.
cluster_labels = clusterer.fit_predict(cosine_dist.astype(np.float64).get())

# Store the cluster labels returned by fit_predict as a new column.
df["clusters_a"] = cluster_labels
# Ellipsis placeholder until the prompt-set-B clustering is implemented.
df["clusters_b"] = ...

In [None]:
# get top clusters 

In [None]:
TODO