In [1]:
# Install the RAPIDS 24.02 wheels built for CUDA 11 (the "-cu11" packages),
# using NVIDIA's package index as an extra index next to PyPI.
# Of the packages listed, the cells visible in this notebook import only cudf and cuml.
%pip install \
    --extra-index-url=https://pypi.nvidia.com \
    cudf-cu11==24.2.* dask-cudf-cu11==24.2.* cuml-cu11==24.2.* \
    cugraph-cu11==24.2.* cuspatial-cu11==24.2.* cuproj-cu11==24.2.* \
    cuxfilter-cu11==24.2.* cucim-cu11==24.2.* pylibraft-cu11==24.2.* \
    raft-dask-cu11==24.2.*



Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting cudf-cu11==24.2.*
  Downloading https://pypi.nvidia.com/cudf-cu11/cudf_cu11-24.2.1-cp310-cp310-manylinux_2_28_x86_64.whl (460.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m460.9/460.9 MB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting dask-cudf-cu11==24.2.*
  Downloading https://pypi.nvidia.com/dask-cudf-cu11/dask_cudf_cu11-24.2.1-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cuml-cu11==24.2.*
  Downloading https://pypi.nvidia.com/cuml-cu11/cuml_cu11-24.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1206.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 GB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[?25hCollecting cugraph-cu11==24.2.*
  Downloading https://pypi.nvidia.com/cugr

In [10]:
import os
import re
from datetime import datetime

# Per-provider sub-paths, keyed by data kind. ``get_filenames`` joins the kind
# name with the matching entry here to build the directory it scans.
PROVIDERS = {
    "google": {
        "raw": "Takeout/My Activity/Search",
        "parsed": "google/search_history",
        "summary": "google/search_history_summary",
        "context": ""
    }
}


def get_filenames(
    kind="parsed", start_date=None, end_date=None, provider="google"
):
    """Return the paths of the daily ``YYYY-MM-DD.csv`` files for a provider.

    The directory ``<kind>/<PROVIDERS[provider][kind]>`` is walked recursively
    and every file whose name is exactly ``YYYY-MM-DD.csv`` is a candidate.

    Args:
        kind: Key into ``PROVIDERS[provider]``; also the top-level directory.
        start_date: Optional inclusive lower bound as a ``"YYYY-MM-DD"`` string.
        end_date: Optional inclusive upper bound as a ``"YYYY-MM-DD"`` string.
        provider: Key into ``PROVIDERS``.

    Returns:
        A sorted list of matching file paths. Either bound may be given on
        its own; a missing bound leaves that side of the range open.

    Raises:
        KeyError: If ``provider`` or ``kind`` is not present in ``PROVIDERS``.
        ValueError: If a bound (or, when a bound is set, a matching file name)
            is not a valid ``%Y-%m-%d`` date.
    """
    directory = os.path.join(kind, PROVIDERS[provider][kind])
    lower = None if start_date is None else datetime.strptime(start_date, "%Y-%m-%d")
    upper = None if end_date is None else datetime.strptime(end_date, "%Y-%m-%d")
    file_pattern = r"^(\d{4}-\d{2}-\d{2})\.csv$"

    def is_date_in_range(file_date):
        # With no bounds the file name is not parsed at all, so every
        # pattern match is accepted.
        if lower is None and upper is None:
            return True
        parsed = datetime.strptime(file_date, "%Y-%m-%d")
        # Check each bound independently: comparing a datetime against None
        # raises TypeError, which is what happened with a one-sided range.
        if lower is not None and parsed < lower:
            return False
        if upper is not None and parsed > upper:
            return False
        return True

    filenames = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            match = re.match(file_pattern, file)
            if match and is_date_in_range(match.group(1)):
                filenames.append(os.path.join(root, file))

    # os.walk yields directory entries in arbitrary order; sort so repeated
    # runs return the files in the same order.
    return sorted(filenames)

In [None]:
# load all parsed data into a single dataframe
import pandas as pd
import numpy as np

# Columns every loaded day contributes; they are kept first in the result.
LEADING_COLUMNS = ["date", "title", "embedding"]
# Rows whose title contains any of these markers are dropped before clustering.
NON_SEARCH_MARKERS = ("Visited ", "Used ", "Defined ")

# Collect one frame per day and concatenate once after the loop; calling
# pd.concat on the growing frame inside the loop re-copies every earlier row
# on each iteration.
daily_frames = []

for filename in get_filenames():
    # The date is the file name without its extension. os.path is used because
    # get_filenames builds paths with os.path.join, whose separator is not
    # always "/".
    date = os.path.splitext(os.path.basename(filename))[0]

    try:
        embeddings = np.load(f"embeddings/{date}.npy")

        tmp_df = pd.read_csv(filename).drop_duplicates(subset="title")

        for marker in NON_SEARCH_MARKERS:
            tmp_df = tmp_df[~tmp_df["title"].str.contains(marker, regex=False)]

        # assign() returns a new frame, so no column is written onto a filtered
        # slice (avoids SettingWithCopyWarning).
        # NOTE(review): assumes embeddings/<date>.npy holds exactly one row per
        # title left after the de-duplication and filtering above; a length
        # mismatch raises here and the whole day is skipped - confirm against
        # the code that produced the embeddings.
        tmp_df = tmp_df.assign(
            title=tmp_df["title"].str.replace("Searched for ", "", regex=False),
            date=date,
            embedding=embeddings.tolist(),
        )

        daily_frames.append(tmp_df)
    except Exception as e:
        # Best-effort load: a day that cannot be read is reported and skipped
        # so the remaining days still load.
        print(f"Skipping {filename}: {e}")

if daily_frames:
    df = pd.concat(daily_frames)
    # Keep date/title/embedding as the leading columns, followed by whatever
    # other columns the CSV files carry.
    df = df[LEADING_COLUMNS + [col for col in df.columns if col not in LEADING_COLUMNS]]
else:
    # Nothing loaded: still expose an empty frame with the expected columns.
    df = pd.DataFrame(columns=LEADING_COLUMNS)

In [None]:
# NOTE(review): this install is not version-pinned, and no cell visible here
# imports hdbscan - the clustering cell uses cuml.cluster.HDBSCAN instead.
# Confirm whether this dependency is still needed.
%pip install hdbscan

In [21]:
import numpy as np
import cudf
import cuml
import cupy as cp  # Import cuPy

# Seed for UMAP, which is stochastic: without one, the reduced coordinates (and
# so the clusters derived from them) differ between runs. cuML documents that a
# fixed random_state may make training slower and use more memory.
UMAP_SEED = 42

# GPU copy of the loaded frame; the results are attached to it in a later cell.
df_gpu = cudf.DataFrame.from_pandas(df)

# Stack the per-row embedding lists into one NumPy array on the host ...
embeddings_np = np.array(df["embedding"].to_list())

# ... then copy it to the GPU as a CuPy array.
embeddings_gpu = cp.asarray(embeddings_np)

# UMAP for dimensionality reduction
umap_model = cuml.UMAP(
    n_neighbors=15,
    n_components=100,
    min_dist=0.1,
    metric='cosine',
    random_state=UMAP_SEED,
)
reduced_data_gpu = umap_model.fit_transform(embeddings_gpu)

# HDBSCAN for clustering, run on the UMAP-reduced coordinates
clusterer = cuml.cluster.HDBSCAN(
    min_cluster_size=5,       # minimum size of clusters
    gen_min_span_tree=True,   # useful for visualization, if supported
)
cluster_labels_gpu = clusterer.fit_predict(reduced_data_gpu)

In [None]:
# Attach the UMAP coordinates and the cluster labels as list-valued columns.
# `.get()` presumably copies each CuPy result back to host memory first -
# confirm the return type of the cuML calls.
for column_name, device_values in (
    ("reduced_data", reduced_data_gpu),
    ("cluster_labels", cluster_labels_gpu),
):
    df_gpu[column_name] = device_values.get().tolist()

# Replace the index with 0..n-1 before serialising.
df_gpu.index = range(len(df_gpu))

# NOTE(review): this writes "clustered.json" relative to the working directory,
# while a later cell reads "../_data/clustered.json" - confirm how the file
# gets from one location to the other.
df_gpu.to_json("clustered.json")

In [None]:
import pandas as pd
# Reload the clustering results from JSON; this rebinds `df`.
# NOTE(review): an earlier cell writes "clustered.json" to the working
# directory, whereas this reads "../_data/clustered.json" - confirm the file
# is moved there between the two steps.
df = pd.read_json("../_data/clustered.json")

In [None]:
# Raise the row display limit above the frame's length so the table below is
# not truncated.
pd.set_option('display.max_rows', len(df) + 1)

# Drop rows labelled -1, keep the three columns of interest, and order the rows
# by cluster label and then by date.
filtered_df = (
    df.loc[df["cluster_labels"] != -1, ["date", "title", "cluster_labels"]]
    .sort_values(by=["cluster_labels", "date"])
)
filtered_df