In [19]:
import os
import re
from datetime import datetime

# Per-provider path fragments, keyed first by provider name and then by data
# kind ("raw", "parsed", "summary", "context").
PROVIDERS = {
    "google": dict(
        raw="Takeout/My Activity/Search",
        parsed="google/search_history",
        summary="google/search_history_summary",
        context="",
    ),
}


def get_filenames(
    kind="parsed", start_date=None, end_date=None, provider="google"
):
    """Return the paths of the daily ``YYYY-MM-DD.csv`` files of a provider.

    The files are searched recursively under
    ``../_data/<kind>/<PROVIDERS[provider][kind]>`` (relative to the current
    working directory). Only files whose whole name matches
    ``YYYY-MM-DD.csv`` are considered.

    Args:
        kind: Data kind; used both as a directory level and as the key into
            ``PROVIDERS[provider]``.
        start_date: Optional inclusive lower bound, formatted ``%Y-%m-%d``.
        end_date: Optional inclusive upper bound, formatted ``%Y-%m-%d``.
        provider: Key into ``PROVIDERS``.

    Returns:
        A sorted list of matching file paths (empty when the directory does
        not exist or nothing matches).

    Raises:
        KeyError: Unknown ``provider`` or ``kind``.
        ValueError: ``start_date`` / ``end_date`` is not ``%Y-%m-%d``, or a
            bound is set and a matching file name is not a real calendar date.
    """
    directory = os.path.join("..", "_data", kind, PROVIDERS[provider][kind])
    lower = None if start_date is None else datetime.strptime(start_date, "%Y-%m-%d")
    upper = None if end_date is None else datetime.strptime(end_date, "%Y-%m-%d")
    file_pattern = re.compile(r"^(\d{4}-\d{2}-\d{2})\.csv$")

    def is_date_in_range(file_date):
        # Each bound is optional on its own: comparing against a missing
        # bound would raise TypeError (datetime vs. None).
        if lower is None and upper is None:
            return True
        day = datetime.strptime(file_date, "%Y-%m-%d")
        if lower is not None and day < lower:
            return False
        if upper is not None and day > upper:
            return False
        return True

    filenames = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            match = file_pattern.match(file)
            if match and is_date_in_range(match.group(1)):
                filenames.append(os.path.join(root, file))

    # os.walk yields entries in arbitrary order; sort for reproducible runs.
    return sorted(filenames)

In [20]:
from openai import OpenAI
import os
import pandas as pd

In [21]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [22]:
# Synchronous OpenAI client used by create_embeddings. The key is read from
# the OPENAI_API_KEY environment variable (KeyError if it is not set);
# presumably it is populated by the %dotenv magic run earlier — confirm.
openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def create_embeddings(inputs):
    """Embed ``inputs`` with the ``text-embedding-3-small`` model.

    Args:
        inputs: Value forwarded unchanged as the ``input`` of the embeddings
            request made through ``openai_client``.

    Returns:
        A list holding the ``embedding`` of every item in the response, in
        response order. If anything raises, the exception is printed and an
        empty list is returned instead (best effort, never raises).
    """
    try:
        response = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input=inputs,
            encoding_format="float",
        )
        return [item.embedding for item in response.data]
    except Exception as error:
        # Deliberately swallow the failure: callers treat [] as "no embeddings".
        print(error)
        return []


In [23]:
from openai import AsyncOpenAI
import httpx

# Async client for the OpenAI-compatible endpoint that summarize_interests
# sends its chat-completion requests to.
#
# * api_key is passed explicitly. When it is omitted the OpenAI SDK falls back
#   to the OPENAI_API_KEY environment variable, which would send the real
#   OpenAI secret (used by openai_client) to this third-party host. Set
#   INTERESTS_LLM_API_KEY if the endpoint requires a key; otherwise the
#   placeholder "EMPTY" is sent.
# * base_url can be overridden through INTERESTS_LLM_BASE_URL because the
#   default points at one specific proxy host; the default is unchanged.
# * The httpx limits allow up to 256 simultaneous (and kept-alive) connections.
custom_client = AsyncOpenAI(
    api_key=os.environ.get("INTERESTS_LLM_API_KEY", "EMPTY"),
    base_url=os.environ.get(
        "INTERESTS_LLM_BASE_URL",
        "https://miqeokznd89bv1-8000.proxy.runpod.net/v1",
    ),
    http_client=httpx.AsyncClient(
        limits=httpx.Limits(
            max_connections=256,
            max_keepalive_connections=256,
        )
    ),
)

# Counts model replies in which summarize_interests found no bracketed list.
no_match = 0

def _parse_interest_list(raw):
    """Extract the interest strings from the first ``[...]`` span of ``raw``.

    Returns ``None`` when ``raw`` is empty or holds no bracketed span;
    otherwise a list of stripped, non-empty strings (possibly empty).
    """
    import json  # local import: this cell must not depend on later cells

    if not raw:
        return None

    # DOTALL lets the span cross line breaks, so an array printed one item
    # per line is still found.
    match = re.search(r"\[(.*?)\]", raw, re.DOTALL)
    if match is None:
        return None

    # Prefer real JSON parsing: it keeps commas and apostrophes that are part
    # of an item. Fall back to the naive quote-strip-and-split for replies
    # that are not valid JSON (e.g. single-quoted items).
    try:
        parsed = json.loads(match.group(0))
    except ValueError:
        parsed = None

    if isinstance(parsed, list):
        candidates = [str(item) for item in parsed if item is not None]
    else:
        candidates = match.group(1).replace("\"", "").replace("'", "").split(",")

    return [item.strip() for item in candidates if item.strip()]


async def summarize_interests(prompt):
    """Ask the model for the interests visible in a block of search records.

    Two chat-completion requests are sent through ``custom_client``: the first
    asks for the interests found in ``prompt``; the second replays that
    exchange and asks for the answer as an array of strings, which is then
    parsed into a Python list.

    Args:
        prompt: Text appended to the question (the search records).

    Returns:
        A list of interest strings. An empty list is returned when the reply
        contains no bracketed list (``no_match`` is incremented in that case)
        or when any exception occurs (the exception is printed).
    """
    global no_match

    try:
        answer1 = await custom_client.chat.completions.create(
            model="mistralai/Mistral-7B-Instruct-v0.2",
            messages=[
                {"role": "user", "content": "What interests can you find in the following search records? \n"+prompt},
            ]
        )

        # NOTE(review): the replayed user turn below is worded differently
        # from the question actually asked above — confirm this is intended.
        answer2 = await custom_client.chat.completions.create(
            model="mistralai/Mistral-7B-Instruct-v0.2",
            messages=[
                {"role": "user", "content": "What are the main interests in the following search records? \n"+prompt},
                {"role": "assistant", "content": answer1.choices[0].message.content},
                {"role": "user", "content": "Summarize the previous answer as an array of strings."},
            ]
        )

        found = _parse_interest_list(answer2.choices[0].message.content)
        if found is None:
            no_match += 1
            return []
        return found

    except Exception as e:
        # Best effort: a failed request yields no interests instead of
        # aborting the whole batch.
        print(e)
        return []
  

In [24]:
from collections import defaultdict
import numpy as np
from tqdm.asyncio import tqdm_asyncio


chunk_size = 35

interests = defaultdict(list)
tasks_dict = defaultdict(list)

for filename in get_filenames():
    df = pd.read_csv(filename)
    date = filename.split("/")[-1].split(".")[0]

    if os.path.exists(f"../_data/embeddings/{date}.npy"):
        continue

    inputs = df["title"].tolist()

    for i in range(0, len(inputs), chunk_size):
        tasks_dict[date].append(summarize_interests("\n".join(inputs[i:i+chunk_size])))

all_tasks = [task for tasks in tasks_dict.values() for task in tasks]

wrapped_tasks = []

async def wrap_task_with_date(date, task):
    result = await task
    return (date, result)
    
for date, tasks in tasks_dict.items():
    wrapped_tasks.extend([wrap_task_with_date(date, task) for task in tasks])

# Await all wrapped tasks

import json

results_dict = defaultdict(list)
with await tqdm_asyncio.gather(*wrapped_tasks, smoothing=0) as results:
    results.set_postfix(no_match=no_match)
    for date, result in results:

        results_dict[date].extend(result)

        json.dump(results_dict[date], open(f"../_data/interests/{date}.json", "w"))




  0%|          | 0/4315 [00:33<?, ?it/s]


CancelledError: 

Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error code: 404
Error co

In [None]:
# for i, answer in enumerate(await tqdm_asyncio.gather(*all_tasks, smoothing=0)):


#     embeddings = create_embeddings(interests)
    
#     if embeddings == []:
#         continue

#     np.save(f"../_data/embeddings/{date}.npy", embeddings)