In [3]:
import os
import re
from datetime import datetime

# Per-provider sub-paths, keyed by data "kind". `get_filenames` joins the
# selected entry onto ../_data/<kind>/.
# NOTE(review): "context" is an empty string, so that kind resolves to
# ../_data/context/ itself — confirm this is intended.
PROVIDERS = {
    "google": {
        "raw": "Takeout/My Activity/Search",
        "parsed": "google/search_history",
        "summary": "google/search_history_summary",
        "context": ""
    }
}


def get_filenames(
    kind="parsed", start_date=None, end_date=None, provider="google"
):
    """Return paths of the per-day CSV files stored for a provider.

    Walks ``../_data/<kind>/<PROVIDERS[provider][kind]>`` recursively and
    collects every file whose name is exactly ``YYYY-MM-DD.csv``.

    Args:
        kind: Key into ``PROVIDERS[provider]`` selecting the data flavour.
        start_date: Optional inclusive lower bound, as a ``"%Y-%m-%d"`` string.
        end_date: Optional inclusive upper bound, as a ``"%Y-%m-%d"`` string.
        provider: Key into ``PROVIDERS``.

    Returns:
        A list of file paths in ``os.walk`` order. The list is empty when the
        directory does not exist or nothing matches.

    Raises:
        KeyError: If ``provider`` or ``kind`` is unknown.
        ValueError: If a bound (or, when a bound is given, a file name) is not
            a valid calendar date.
    """
    directory = os.path.join("..", "_data", kind, PROVIDERS[provider][kind])
    lower = None if start_date is None else datetime.strptime(start_date, "%Y-%m-%d")
    upper = None if end_date is None else datetime.strptime(end_date, "%Y-%m-%d")
    file_pattern = re.compile(r"^(\d{4}-\d{2}-\d{2})\.csv$")

    def is_date_in_range(file_date):
        # No bounds: accept without parsing, as before.
        if lower is None and upper is None:
            return True
        parsed = datetime.strptime(file_date, "%Y-%m-%d")
        # Each bound is checked on its own so an open-ended range works;
        # comparing a datetime against None would raise TypeError.
        if lower is not None and parsed < lower:
            return False
        if upper is not None and parsed > upper:
            return False
        return True

    filenames = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            match = file_pattern.match(file)
            if match and is_date_in_range(match.group(1)):
                filenames.append(os.path.join(root, file))

    return filenames

In [7]:
import os
import pandas as pd

In [8]:
# Load the `dotenv` IPython extension, then read variables from a .env file
# into the process environment.
# NOTE(review): presumably this supplies the API credentials for the OpenAI
# client created further down — confirm.
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [9]:
from openai import AsyncOpenAI
import httpx

# Shared async OpenAI client backed by an explicit httpx client whose pool
# allows up to 256 connections (all of which may be kept alive), so many
# embedding requests can be in flight at once.
# NOTE(review): no api_key is passed here; presumably it is read from the
# environment — confirm.
custom_client = AsyncOpenAI(
    http_client=httpx.AsyncClient(
        limits=httpx.Limits(max_connections=256, max_keepalive_connections=256)
    ),
)


async def create_embeddings(inputs):
    """Request `text-embedding-3-small` embeddings for `inputs`.

    Sends all of `inputs` in one call through `custom_client` and returns a
    list holding the `.embedding` of every item in the response's `data`.
    Any exception is printed and turned into a `None` return value, so the
    caller must check for `None`.
    """
    try:
        response = await custom_client.embeddings.create(
            model="text-embedding-3-small", input=inputs, encoding_format="float"
        )
        vectors = [item.embedding for item in response.data]
    except Exception as err:
        print(err)
        vectors = None
    return vectors

In [None]:
from collections import defaultdict
import numpy as np
from tqdm.asyncio import tqdm_asyncio

tasks_dict = defaultdict(list)

for filename in get_filenames():
    df = pd.read_csv(filename)
    date = filename.split("/")[-1].split(".")[0]

    if os.path.exists(f"../_data/embeddings/{date}.npy"):
        continue

    # remove duplicates
    df = df.drop_duplicates(subset="title")

    df = df[~df["title"].str.contains("Visited ")]
    df = df[~df["title"].str.contains("Used ")]
    df = df[~df["title"].str.contains("Defined ")]

    df["title"] = df["title"].str.replace("Searched for ", "")

    inputs = df["title"].tolist()

    tasks_dict[date].append(create_embeddings(inputs))


all_tasks = [task for tasks in tasks_dict.values() for task in tasks]

wrapped_tasks = []

async def wrap_task_with_date(date, task):
    result = await task
    return (date, result)
    
for date, tasks in tasks_dict.items():
    wrapped_tasks.extend([wrap_task_with_date(date, task) for task in tasks])


for date, embeddings in await tqdm_asyncio.gather(*wrapped_tasks, smoothing=0):
    if embeddings is None:
        continue

    np.save(f"../_data/embeddings/{date}.npy", embeddings)

In [5]:
# load all parsed data into a single dataframe
import pandas as pd
import numpy as np

leading_columns = ["date", "title", "embedding"]
frames = []

for filename in get_filenames():
    # basename/splitext rather than split("/") so the date is extracted
    # correctly whatever path separator os.path.join produced.
    date = os.path.splitext(os.path.basename(filename))[0]

    try:
        embeddings = np.load(f"../_data/embeddings/{date}.npy")

        tmp_df = pd.read_csv(filename)
        tmp_df = tmp_df.drop_duplicates(subset="title")

        # Same row filter that was applied before the embeddings were
        # computed, so row i lines up with embeddings[i].
        tmp_df = tmp_df[~tmp_df["title"].str.contains("Visited |Used |Defined ")]

        if len(embeddings) != len(tmp_df):
            print(
                f"{date}: {len(embeddings)} embeddings for {len(tmp_df)} rows, skipping"
            )
            continue

        # assign() builds a new frame instead of writing into the filtered
        # slice (which triggers SettingWithCopyWarning).
        frames.append(
            tmp_df.assign(
                title=tmp_df["title"].str.replace("Searched for ", "", regex=False),
                date=date,
                embedding=embeddings.tolist(),
            )
        )
    except Exception as e:
        # Best effort: a day without cached embeddings (or with an unreadable
        # file) is reported and skipped rather than aborting the whole load.
        print(f"{date}: {e}")

if frames:
    # Concatenate once; concatenating inside the loop copies the accumulated
    # frame on every iteration. Per-file row indices are kept.
    df = pd.concat(frames)
    df = df[leading_columns + [col for col in df.columns if col not in leading_columns]]
else:
    df = pd.DataFrame(columns=leading_columns)

In [6]:
# Show the combined frame built in the previous cell.
df

Unnamed: 0,date,title,embedding,hour
59,2018-12-19,"hotels in Bibione, Metropolitan City of Venice","[-0.039552055, -0.019501107, 0.027437177, 0.00...",15:36
14,2018-12-23,"hotels in Trieste, Province of Trieste","[-0.083720826, -0.007642641, 0.00838033, -0.01...",14:08
108,2019-01-10,"flights from Frankfurt to ? on Jan 26, 2019 re...","[-0.05184426, 0.007946541, 0.025223108, -0.014...",18:15
5,2019-01-29,"flights from Frankfurt to ? on Feb 14, 2019 re...","[-0.07242466, 0.017469313, 0.01747911, -0.0269...",19:36
19,2019-01-30,"flights from Frankfurt to ? on Feb 15, 2019 re...","[-0.06785829, 0.034601793, 0.0010219548, -0.02...",19:17
...,...,...,...,...
123,2023-08-17,dataswift,"[-0.01469968, -0.005647114, 0.014277903, -0.00...",12:02
124,2023-08-17,Viewed DataSwift Network Services Ltd.,"[-0.018999442, -0.0025215521, -0.020922575, -0...",12:02
125,2023-08-17,Dataswyft crunchbase,"[-0.019194867, -0.019770563, 0.027603038, 0.04...",12:09
126,2023-08-17,opt in advertising,"[0.012959989, 0.029413605, -0.013890605, 0.042...",12:18


In [7]:
import numpy as np
import cudf
import cuml

df_gpu = cudf.DataFrame.from_pandas(df)

# cuDF does not support `Series.tolist()` (it raises TypeError), so the
# embedding matrix is built from the host frame instead. Despite the name it
# is a NumPy array: one row per query, float32.
embeddings_gpu = np.asarray(df["embedding"].tolist(), dtype=np.float32)

# UMAP for dimensionality reduction
umap_model = cuml.UMAP(n_neighbors=15,
                       n_components=100,
                       min_dist=0.1,
                       metric='cosine')
reduced_data_gpu = umap_model.fit_transform(embeddings_gpu)

# HDBSCAN for clustering
clusterer = cuml.cluster.HDBSCAN(min_cluster_size=5, # minimum size of clusters
                                 gen_min_span_tree=True) # useful for visualization, if supported
cluster_labels_gpu = clusterer.fit_predict(reduced_data_gpu)

# The plotting cell reads `reduced_data` / `cluster_labels`. cuML returns the
# same container type it was given by default, so with NumPy input these are
# host arrays and plain aliases are enough.
reduced_data = reduced_data_gpu
cluster_labels = cluster_labels_gpu


stdout:



stderr:

Traceback (most recent call last):
  File "<string>", line 7, in <module>
  File "/home/ma9o/Desktop/data-analysis/.venv/lib64/python3.10/site-packages/numba/cuda/cudadrv/runtime.py", line 111, in get_version
    self.cudaRuntimeGetVersion(ctypes.byref(rtver))
  File "/home/ma9o/Desktop/data-analysis/.venv/lib64/python3.10/site-packages/numba/cuda/cudadrv/runtime.py", line 65, in __getattr__
    self._initialize()
  File "/home/ma9o/Desktop/data-analysis/.venv/lib64/python3.10/site-packages/numba/cuda/cudadrv/runtime.py", line 51, in _initialize
    self.lib = open_cudalib('cudart')
  File "/home/ma9o/Desktop/data-analysis/.venv/lib64/python3.10/site-packages/numba/cuda/cudadrv/libs.py", line 64, in open_cudalib
    return ctypes.CDLL(path)
  File "/usr/lib64/python3.10/ctypes/__init__.py", line 374, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: libcudart.so: cannot open shared object file: No such file or directory


Not patching Numba
Detected

OSError: libcudart.so: cannot open shared object file: No such file or directory

In [21]:
# CPU only

# import numpy as np
# import umap
# import hdbscan

# embeddings = np.array(df["embedding"].tolist())

# umap_model = umap.UMAP(n_neighbors=15,
#                        n_components=100, 
#                        min_dist=0.1, 
#                        metric='cosine') 
# reduced_data = umap_model.fit_transform(embeddings)

# clusterer = hdbscan.HDBSCAN(min_cluster_size=5, # minimum size of clusters
#                             gen_min_span_tree=True) # useful for visualization
# cluster_labels = clusterer.fit_predict(reduced_data)



In [None]:
# Visualize the clusters with a 2-D Plotly scatter. Only the first two
# columns of the reduced data are drawn.
import plotly.express as px
import plotly.graph_objects as go

fig = px.scatter(
    x=reduced_data[:, 0],
    y=reduced_data[:, 1],
    color=cluster_labels,
    labels={'color': 'Cluster', 'x': 'Component 1', 'y': 'Component 2'},
    title="Search-query clusters (first two reduced components)",
)
fig.update_traces(marker=dict(size=3))
fig.show()