In [1]:
%pip install \
    --extra-index-url=https://pypi.nvidia.com \
    cudf-cu11==24.2.* dask-cudf-cu11==24.2.* cuml-cu11==24.2.* hdbscan pandas numpy tqdm seaborn matplotlib

Looking in indexes: https://pypi.org/simple, https://pypi.nvidia.com
Collecting tqdm
  Downloading tqdm-4.66.2-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Downloading tqdm-4.66.2-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.66.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [43]:
import os
import re
from datetime import datetime

# Per-provider directory layout. For a given `kind`, the value is the sub-path
# that get_filenames() appends to the top-level `<kind>/` directory.
PROVIDERS = {
    "google": {
        "raw": "Takeout/My Activity/Search",
        "parsed": "google/search_history",
        "summary": "google/search_history_summary",
        "context": "",
        "interests": ""
    }
}


def get_filenames(
    kind="parsed", start_date=None, end_date=None, provider="google"
):
    """Return the date-named data files stored for ``kind`` and ``provider``.

    The directory ``<kind>/<PROVIDERS[provider][kind]>`` is walked recursively
    and every file named ``YYYY-MM-DD.csv`` or ``YYYY-MM-DD.json`` is collected.

    Args:
        kind: Key into ``PROVIDERS[provider]``; also the top-level directory.
        start_date: Optional inclusive lower bound as a ``"YYYY-MM-DD"`` string.
        end_date: Optional inclusive upper bound as a ``"YYYY-MM-DD"`` string.
            Either bound may be given on its own for an open-ended range.
        provider: Key into ``PROVIDERS``.

    Returns:
        A sorted list of matching file paths (empty if the directory does not
        exist or nothing matches).

    Raises:
        KeyError: If ``provider`` or ``kind`` is not present in ``PROVIDERS``.
        ValueError: If a bound is given and it, or a matched file name, is not
            a valid calendar date.
    """
    date_format = "%Y-%m-%d"
    directory = os.path.join(kind, PROVIDERS[provider][kind])
    lower = None if start_date is None else datetime.strptime(start_date, date_format)
    upper = None if end_date is None else datetime.strptime(end_date, date_format)
    file_pattern = r"^(\d{4}-\d{2}-\d{2})\.(csv|json)$"

    def is_date_in_range(file_date):
        # Without any bound the file date is never parsed, so every
        # pattern-matching name is accepted as-is.
        if lower is None and upper is None:
            return True
        parsed = datetime.strptime(file_date, date_format)
        # Each bound is checked independently so that a missing bound means
        # "unbounded on that side" instead of comparing a datetime with None.
        if lower is not None and parsed < lower:
            return False
        if upper is not None and parsed > upper:
            return False
        return True

    filenames = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            match = re.match(file_pattern, file)
            if match and is_date_in_range(match.group(1)):
                filenames.append(os.path.join(root, file))

    # os.walk yields entries in arbitrary order; sort for reproducible results.
    return sorted(filenames)

In [44]:
# load all parsed data into a single dataframe
import pandas as pd
import numpy as np
import json
import os
from tqdm.notebook import tqdm

# Build `df` with one row per interest: the file's date, the interest itself and
# its embedding vector. Per-file frames are collected and concatenated once at
# the end instead of growing `df` inside the loop.
interest_columns = ["date", "interest", "embedding"]
per_day_frames = []

for filename in tqdm(get_filenames("interests")):
    # Portable replacement for splitting the path on "/" and "." by hand.
    date, extension = os.path.splitext(os.path.basename(filename))

    # get_filenames() also matches .csv files; only JSON can be loaded here,
    # and skipping the rest avoids loading the same date twice.
    if extension != ".json":
        continue

    embedding_path = f"embeddings/{date}.npy"
    if not os.path.isfile(embedding_path):
        continue

    embeddings = np.load(embedding_path)
    # Open the path that was actually matched and close the handle promptly.
    with open(filename) as interests_file:
        interests = json.load(interests_file)

    # pandas raises ValueError if the number of embeddings differs from the
    # number of interests, so misaligned files fail loudly.
    per_day_frames.append(
        pd.DataFrame(
            {
                "date": [date] * len(interests),
                "interest": interests,
                "embedding": embeddings.tolist(),
            },
            columns=interest_columns,
        )
    )

# ignore_index gives every row a unique label instead of repeating 0..n per file;
# pd.concat rejects an empty list, so fall back to an empty frame.
df = (
    pd.concat(per_day_frames, ignore_index=True)
    if per_day_frames
    else pd.DataFrame(columns=interest_columns)
)


  0%|          | 0/1506 [00:00<?, ?it/s]

In [None]:
from cuml.metrics import pairwise_distances
from hdbscan import HDBSCAN
import numpy as np
import cupy as cp  
import cuml

# Fixed seed so the UMAP projection, and the clustering built on it, can be
# reproduced from run to run.
RANDOM_SEED = 42

# Copy the embedding vectors to the GPU.
# NOTE(review): assumes every embedding has the same length — confirm.
embeddings_gpu = cp.asarray(df["embedding"].to_list())

# Reduce the embeddings to 100 dimensions with cosine-metric UMAP before clustering.
umap_model = cuml.UMAP(n_neighbors=15,
                       n_components=100,
                       min_dist=0.1,
                       metric='cosine',
                       random_state=RANDOM_SEED)
reduced_data_gpu = umap_model.fit_transform(embeddings_gpu)

# Full pairwise cosine-distance matrix of the reduced points (computed on the GPU).
cosine_dist = pairwise_distances(reduced_data_gpu, metric='cosine')

# HDBSCAN runs on the CPU over the precomputed distances.
clusterer = HDBSCAN(min_cluster_size=5,
                    gen_min_span_tree=True,
                    metric="precomputed",
                    cluster_selection_epsilon=0.02)
# .get() copies the matrix to host memory after the float64 cast.
# NOTE(review): the cast is presumably required by hdbscan's precomputed path — confirm.
cluster_labels = clusterer.fit_predict(cosine_dist.astype(np.float64).get())

In [348]:
# Number of distinct labels assigned by the clusterer.
# NOTE(review): if HDBSCAN marked any points as noise, that label is counted here too — confirm.
np.unique(cluster_labels).size

55

In [350]:
# Pair each interest with its cluster label (labels are attached by position).
res_df = df[["interest"]].assign(cluster=cluster_labels)

In [51]:
import networkx as nx
# Export both tree structures held by the fitted clusterer as GraphML files.
for tree_attr, graphml_path in (
    ("condensed_tree_", "condensed_tree.graphml"),
    ("single_linkage_tree_", "single_linkage_tree.graphml"),
):
    nx.write_graphml(getattr(clusterer, tree_attr).to_networkx(), graphml_path)
