In [57]:
from mistralai.client import MistralClient
from openai import OpenAI
import os
import pandas as pd

In [58]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [59]:
START_DATE="2023-06-01"
END_DATE="2023-06-10"

In [60]:
client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])
openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def create_embeddings(inputs, model="text-embedding-3-small"):
    if model == "mistral-embed":
        embeddings_batch_response = client.embeddings(
            model="mistral-embed",
            input=inputs,
        )
    else:
        embeddings_batch_response = openai_client.embeddings.create(
            model=model,
            input=inputs,
            encoding_format="float"
        )
    
    return list(map(lambda x: x.embedding, embeddings_batch_response.data))


In [61]:
import os
import re
from datetime import datetime

PROVIDERS = {
    "google": {
        "raw": "Takeout/My Activity/Search",
        "parsed": "google/search_history",
        "summary": "google/search_history_summary",
        "context": ""
    }
}


def get_filenames(
    kind="parsed", start_date=START_DATE, end_date=END_DATE, provider="google"
):
    directory = os.path.join("..", "_data", kind, PROVIDERS[provider][kind])
    start_date = datetime.strptime(start_date, "%Y-%m-%d")
    end_date = datetime.strptime(end_date, "%Y-%m-%d")
    file_pattern = r"^(\d{4}-\d{2}-\d{2})\.csv$"

    def is_date_in_range(file_date):
        return start_date <= datetime.strptime(file_date, "%Y-%m-%d") <= end_date

    filenames = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            match = re.match(file_pattern, file)
            if match:
                file_date = match.groups()[0]
                if is_date_in_range(file_date):
                    filenames.append(os.path.join(root, file))

    return filenames

In [62]:
from mistralai.models.chat_completion import ChatMessage


def get_completion(prompt, model="mistral-tiny"):
    messages = [ChatMessage(role="user", content=prompt)]

    chat_response = client.chat(
        model=model,
        messages=messages,
    )

    return chat_response.choices[0].message.content

In [63]:
import json

def extract_json(text):
    # Helper function to find the matching closing brace or bracket
    def find_closing(text, open_pos, open_char, close_char):
        balance = 0
        for i in range(open_pos, len(text)):
            if text[i] == open_char:
                balance += 1
            elif text[i] == close_char:
                balance -= 1
                if balance == 0:
                    return i
        return -1

    # Find the start of the JSON object/array
    obj_start = text.find("{")
    arr_start = text.find("[")

    if obj_start == -1 and arr_start == -1:
        return {}, None  # No JSON found

    # Determine which comes first or use -1 if not found
    start_index = obj_start if arr_start == -1 or (obj_start != -1 and obj_start < arr_start) else arr_start
    open_char = "{" if start_index == obj_start else "["
    close_char = "}" if open_char == "{" else "]"

    # Find the matching closing brace/bracket
    end_index = find_closing(text, start_index, open_char, close_char)

    if start_index != -1 and end_index != -1:
        json_text = text[start_index:end_index + 1]
        try:
            json_response = json.loads(json_text)
            return json_response, text[end_index + 1:]
        except json.JSONDecodeError:
            return {}, None  # Handle invalid JSON
    else:
        return {}, None


In [None]:
summary_prompt = """
    Analyze the provided list of Google search records to identify distinct topic groups. For each group, create a summary in the JSON format below. Ensure each summary includes: 

    - `time_start`: The start time of the first search in the group.
    - `time_end`: The end time of the last search in the group.
    - `description`: A detailed account of the searches and site visits, enriched with inferred user intent and additional insights into the topic.
    - `interests`: A list of keywords representing the user's interests based on the searches.

    Each `description` should not only recap the searches but also offer a deeper understanding of what the user might be seeking or the broader context of their inquiries. Group searches based on thematic relevance and timing. 

    Example of JSON output format:

    {
    "time_start": "HH:MM",
    "time_end": "HH:MM",
    "description": "Elaborate on what the user did and why, based on the search terms and visited pages.",
    "interests": ["keyword1", "keyword2"]
    }
    
    Here is a list of searches:
"""
chunk_size = 15

top_dir = os.path.join("..", "_data", "summary", PROVIDERS["google"]["summary"])

if not os.path.exists(top_dir):
    os.makedirs(top_dir)

from tqdm.notebook import tqdm

tot_chunks = 0

for filename in tqdm(get_filenames()):
    df = pd.read_csv(filename)  # daily df
    date = filename.split("/")[-1].split(".")[0]

    out_path = os.path.join(top_dir, date[:-6], date[:-3])
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    parsed_df = pd.DataFrame(
        columns=[
            "time_start",
            "time_end",
            "description",
            "interests"
        ]
    )

    for i in tqdm(range(0, len(df), chunk_size)):
        chunk = df.iloc[i : i + chunk_size]

        answer = get_completion(f"{summary_prompt}\n{chunk}")

        # Extract JSON form the answer until no more JSON is found
        parsed_results = []
        while answer:
            parsed_result, answer = extract_json(answer)

            if parsed_result:
                if isinstance(parsed_result, dict):
                    parsed_results.append(parsed_result)
                elif isinstance(parsed_result, list):
                    parsed_results.extend(parsed_result)

        parsed_df = pd.concat(
            [parsed_df, pd.DataFrame(parsed_results)], ignore_index=True
        )

        tot_chunks += 1

    parsed_df.to_csv(os.path.join(out_path, f"{date}.csv"))

tot_chunks

In [65]:
def parse_md_table(md_table):
    # Split the table into lines
    lines = md_table.replace(",", " ").strip().split('\n')

    # Remove the header separator line (usually the second line)
    lines.pop(1)

    # Replace pipe symbols with commas and trim spaces
    csv_lines = [','.join(cell.strip() for cell in line.split('|')[1:-1]) for line in lines]

    # Join the lines back into a single string
    return '\n'.join(csv_lines)

In [66]:
import requests
import pandas as pd
from io import StringIO

taxonomy_url = "https://raw.githubusercontent.com/patcg-individual-drafts/topics/main/taxonomy_v2.md"

response = requests.get(taxonomy_url)
markdown_text = response.text

# Extract the table content
table_text = parse_md_table(markdown_text)

# Parse the table using pandas
patcg_topics_df = pd.read_csv(StringIO(table_text))

# Convert the dataframe to JSON
patcg_topics = patcg_topics_df.to_json(orient="records")

patcg_topics


'[{"ID":1,"Topic":"\\/Arts & Entertainment"},{"ID":350,"Topic":"\\/Arts & Entertainment\\/Celebrities & Entertainment News"},{"ID":351,"Topic":"\\/Arts & Entertainment\\/Comics & Animation"},{"ID":352,"Topic":"\\/Arts & Entertainment\\/Events & Listings"},{"ID":353,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Bars  Clubs & Nightlife"},{"ID":4,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Concerts & Music Festivals"},{"ID":354,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Event Ticket Sales"},{"ID":355,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Expos & Conventions"},{"ID":356,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Film Festivals"},{"ID":357,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Food & Beverage Events"},{"ID":9,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Live Sporting Events"},{"ID":12,"Topic":"\\/Arts & Entertainment\\/Movies"},{"ID":13,"Topic":"\\/Arts & Entertainment\\/Movies\\/Action & Adventure Films

In [None]:
# seed taxonomy to use as root nodes
# added a description to each taxonomy to be embedded

from tqdm.notebook import tqdm

taxonomy_context_prompt = """
    Given a single taxon representing a category within a structured taxonomy, generate a descriptive paragraph that captures the potential interests and insights reflected by the taxon. The taxon is a path through a hierarchical classification system, indicating a progression from broad categories to more specific subcategories.

    To process the taxon:
    1. Break down the taxon to understand its hierarchical structure, identifying the primary category and any relevant subcategories.
    2. Reflect on what the interest in this specific taxon suggests about the user's preferences, professional or educational pursuits, or personal hobbies.
    3. Generate a descriptive paragraph that:
    - Introduces the user's engagement with the main category and subcategories as indicated by the taxon.
    - Highlights particular aspects, applications, or implications of the category that might capture the user's interest.
    - Speculates on the broader significance of this interest, potentially connecting it to current trends, technological innovations, cultural phenomena, or historical contexts, where appropriate.
    - Maintains a neutral and informative tone, aiming to elucidate insights rather than make assumptions about the user.

    For example, given this taxon: "/Computers & Electronics/Software/Multimedia Software"

    Generate a description like this: "This session unveils the user's engagement with the realm of "/Computers & Electronics/Software/Multimedia Software," highlighting an interest in technology, specifically in the tools and applications designed for creating, editing, and managing various forms of digital media. The user's focus on multimedia software suggests a keen interest in the intersection of technology and creativity, exploring how software can facilitate artistic expression, enhance digital content creation, and impact multimedia production. This curiosity may reflect a broader fascination with digital innovation and its role in shaping contemporary media landscapes."

    Here is the new taxon: 
"""

seed_taxonomy = {
    "root": "root"
}

import os.path
import json

if not os.path.isfile("seed_taxonomy.json"):
    df = patcg_topics_df

    for i, row in tqdm(df.iterrows(), total=df.shape[0]):
        seed_taxonomy[row["Topic"]] = get_completion(f"{taxonomy_context_prompt} {row['Topic']}", model="mistral-small")

    json.dump(seed_taxonomy, open("seed_taxonomy.json", "w"))

In [68]:
from pgvector.psycopg import register_vector
import psycopg
import os

conn = psycopg.connect(**psycopg.conninfo.conninfo_to_dict(os.environ["DATABASE_URL"]))
conn.autocommit = True

conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
register_vector(conn)

In [69]:
conn.execute("DROP TABLE IF EXISTS documents")
conn.execute(
    """CREATE TABLE documents (
             id bigserial PRIMARY KEY, 
             description text,
             date DATE, 
             time_start TIME,
             time_end TIME,
             interests text[],
             embedding vector(1536),
             parent_id bigint,
             is_taxonomy boolean DEFAULT FALSE
    )"""
)

<psycopg.Cursor [COMMAND_OK] [IDLE] (host=localhost port=5433 database=enclaveid) at 0x7ff67825f350>

In [70]:
import numpy as np

def store_embeddings(embeddings, metadata):
    for embedding, meta in zip(embeddings, metadata):
        interests = list(
                map(
                    lambda x: x.strip(),
                    meta["interests"].replace("[", "").replace("]", "").replace("'", "").split(",") if meta["interests"] else []
                )
            )
        
        try:
            conn.execute(
                "INSERT INTO documents (description, date, time_start, time_end, interests, embedding) VALUES (%s, %s, %s, %s, %s, %s)",
                (
                    meta["description"],
                    meta["date"],
                    meta["time_start"],
                    meta["time_end"],
                    interests,
                    np.array(embedding),
                ),
            )
        except Exception as e:
            print(e)

In [71]:
from datetime import timedelta


seed_taxonomy = json.load(open("seed_taxonomy.json"))

start_date = datetime.strptime("1970-01-01 00:00", "%Y-%m-%d %H:%M")

for taxon, embedding in zip(seed_taxonomy.keys(), create_embeddings(list(seed_taxonomy.values()))):
    
    date_str, time_str = start_date.strftime("%Y-%m-%d %H:%M").split(' ')

    conn.execute(
        "INSERT INTO documents (description, date, time_start, time_end, interests, embedding, parent_id, is_taxonomy) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
        (
            seed_taxonomy[taxon],
            date_str,
            time_str,
            time_str,
            [taxon],
            np.array(embedding),
            0 if taxon == "root" else None,
            True
        ),
    )

    #start_date += timedelta(minutes=1)

In [72]:
import json

# generate embeddings and save in db
for filename in get_filenames("summary"):
    df = pd.read_csv(filename)
    date = filename.split("/")[-1].split(".")[0]
    df["date"] = date

    embeddings = create_embeddings(
        [f"{df['description'][i]} Tags:{df['interests'][i]}" for i in range(len(df))]
    )

    store_embeddings(
        embeddings,
        json.loads(df.to_json(orient="records")),
    )

In [73]:
# Calculate the 10th percentile for time intervals (in seconds)
time_threshold = conn.execute("""
WITH LaggedDocuments AS (
    SELECT
        date,
        time_start,
        time_end,
        LAG(time_end) OVER (ORDER BY date, time_start) AS prev_time_end
    FROM
        documents
    WHERE
        NOT is_taxonomy
),
TimeDifferences AS (
    SELECT
        EXTRACT(EPOCH FROM (time_start - prev_time_end)) AS time_diff
    FROM
        LaggedDocuments
    WHERE
        time_start > prev_time_end
)
SELECT
    percentile_cont(0.10) WITHIN GROUP (ORDER BY time_diff) AS time_interval_10th
FROM
    TimeDifferences;

""").fetchone()[0]

In [74]:
# Calculate the 90th percentile for embedding similarities using cosine similarity
embedding_similarity_threshold = conn.execute("""
    WITH CosineSimilarities AS (
        SELECT
            date,
            time_start,
            1 - (embedding <=> LAG(embedding) OVER (ORDER BY date, time_start)) AS cosine_similarity
        FROM
            documents
        WHERE
            NOT is_taxonomy
    ),
    FilteredSimilarities AS (
        SELECT
            cosine_similarity
        FROM
            CosineSimilarities
        WHERE
            cosine_similarity IS NOT NULL
    )
    SELECT
        percentile_cont(0.90) WITHIN GROUP (ORDER BY cosine_similarity) AS embedding_similarity_90th
    FROM
        FilteredSimilarities;
""").fetchone()[0]

In [75]:
print(f"Time threshold: {time_threshold} seconds. Embedding similarity threshold: {embedding_similarity_threshold}")

Time threshold: 60.0 seconds. Embedding similarity threshold: 0.749131862173034


In [76]:
# merge similar documents within the time and embedding similarity thresholds
records = conn.execute(
    """
SELECT a.id, b.id, (1 - (a.embedding <=> b.embedding)) AS similarity
FROM documents a
JOIN documents b ON a.id < b.id
WHERE ABS(EXTRACT(EPOCH FROM (
    (a.date || ' ' || a.time_end)::timestamp - 
    (b.date || ' ' || b.time_start)::timestamp))/60) <= %s
    AND a.is_taxonomy = FALSE
    AND b.is_taxonomy = FALSE
""",
    ((time_threshold/60),),
).fetchall()

candidates_to_merge = []
for record in records:
    doc_id_a, doc_id_b, similarity = record
    if similarity >= embedding_similarity_threshold:
        candidates_to_merge.append((doc_id_a, doc_id_b))

len(candidates_to_merge)  # should be =< the total number of chunks

17

In [77]:
for a, b in candidates_to_merge:
    # Update time_end of document a with the maximum time_end of both documents
    conn.execute(
        """
        UPDATE documents
        SET time_end = (SELECT GREATEST(max_a.time_end, max_b.time_end)
                        FROM (SELECT time_end FROM documents WHERE id = %s) as max_a,
                             (SELECT time_end FROM documents WHERE id = %s) as max_b)
        WHERE id = %s
        """,
        (a, b, a),
    )
    # Update time_start of document a with the minimum time_start of both documents
    conn.execute(
        """
        UPDATE documents
        SET time_start = (SELECT LEAST(min_a.time_start, min_b.time_start)
                          FROM (SELECT time_start FROM documents WHERE id = %s) as min_a,
                               (SELECT time_start FROM documents WHERE id = %s) as min_b)
        WHERE id = %s
        """,
        (a, b, a),
    )
    # Delete the duplicate document
    conn.execute("DELETE FROM documents WHERE id = %s", (b,))



In [78]:
# create dag of seed taxonomy 

root_id, = conn.execute("SELECT id FROM documents WHERE is_taxonomy and description = 'root'").fetchone()

taxons = conn.execute("SELECT id, interests[1] FROM documents WHERE is_taxonomy and description <> 'root'").fetchall()
tag_to_id = {tag: taxon_id for taxon_id, tag in taxons}


for taxon_id, tag in taxons:
    # Check if the tag represents a root tag (has no '/')
    if tag.count('/') == 1:
        # This is a root tag, so set parent_id to 0
        conn.execute("UPDATE documents SET parent_id = %s WHERE id = %s", (root_id, taxon_id))
    else:
        # Determine the parent tag by removing the last segment
        parent_tag = '/'.join(tag.split('/')[:-1])
        
        # If the parent_tag is not empty and exists in tag_to_id, it means the taxon has a parent
        if parent_tag and parent_tag in tag_to_id:
            parent_id = tag_to_id[parent_tag]
            # Update the taxon's parent_id
            conn.execute("UPDATE documents SET parent_id = %s WHERE id = %s", (parent_id, taxon_id))


In [79]:
def get_closest_node(document_id):
    return conn.execute(
        """SELECT id FROM documents WHERE 
            id != %(id)s AND (
                date < (SELECT date FROM documents WHERE id = %(id)s) 
                OR (date = (SELECT date FROM documents WHERE id = %(id)s) AND time_end < (SELECT time_start FROM documents WHERE id = %(id)s))
            )
            ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = %(id)s) LIMIT 1""",
        {"id": document_id},
    ).fetchone()


def create_dag():
    for document_id in conn.execute(
        "SELECT id FROM documents WHERE parent_id IS NULL"
    ).fetchall():
        try:
            closest_node = get_closest_node(document_id[0])

            conn.execute(
                "UPDATE documents SET parent_id = %(parent_id)s WHERE id = %(id)s",
                {"parent_id": closest_node[0], "id": document_id[0]},
            )
        except:
            print(f"Error with document {document_id[0]}")

In [80]:
create_dag()

In [87]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

min_datetime = conn.execute(
    "SELECT MIN(CAST(date || ' ' || time_end AS TIMESTAMP)) FROM documents WHERE NOT is_taxonomy"
).fetchone()[0]
max_datetime = conn.execute(
    "SELECT MAX(CAST(date || ' ' || time_end AS TIMESTAMP)) FROM documents WHERE NOT is_taxonomy"
).fetchone()[0]

def get_node_color(date, time_end):
    if time_end == 'None':
        time_end = "00:00:00"   
    # convert the dates to datetime objects
    date = datetime.strptime(f"{date} {time_end}", "%Y-%m-%d %H:%M:%S").timestamp()


    cmap = plt.get_cmap('coolwarm')  # Choose a colormap
    norm = mcolors.Normalize(vmin=min_datetime.timestamp(), vmax=max_datetime.timestamp())  # Normalize the dates

    # Convert the date to a float value between 0 and 1
    date_value = norm(date)

    # Get the corresponding color from the colormap
    color = cmap(date_value)

    # Convert the color to a hex value
    return mcolors.to_hex(color)


In [88]:
import networkx as nx
import matplotlib.pyplot as plt

# Create a new directed graph
dag = nx.DiGraph()

# Add nodes to the graph
for row in conn.execute("SELECT id, description, date, time_end, is_taxonomy FROM documents"):
    if row[4]:
        dag.add_node(row[0], label=str(row[2]), title=row[1], color="#00FF00")
    else:
        dag.add_node(row[0], label=str(row[2]), title=row[1], color=get_node_color(str(row[2]), str(row[3])))

# Add edges to the graph
for row in conn.execute("SELECT id, parent_id FROM documents WHERE parent_id IS NOT NULL"):
    dag.add_edge(row[1], row[0])

In [89]:
is_dag = nx.is_directed_acyclic_graph(dag)

if not is_dag:
    print(list(nx.simple_cycles(dag)))

for layer, nodes in enumerate(nx.topological_generations(dag)):
    # `multipartite_layout` expects the layer as a node attribute, so add the
    # numeric layer value as a node attribute
    for node in nodes:
        dag.nodes[node]["layer"] = layer

In [90]:
from pyvis.network import Network

nt = Network('2048px', '2048px',  notebook=True, directed=True, layout=True)
nt.from_nx(dag)
nt.show_buttons()
nt.show('../_data/nx.html')

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
