In [25]:
from mistralai.client import MistralClient
from openai import OpenAI
import os
import pandas as pd

In [26]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [27]:
# API clients used by create_embeddings(). Both keys are read from the process
# environment; a missing variable raises KeyError right here.
client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])
openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def create_embeddings(inputs, model="text-embedding-3-small"):
    """Embed a batch of texts and return one vector per input.

    Args:
        inputs: The texts to embed (the callers in this notebook pass lists
            of strings).
        model: Embedding model name. ``"mistral-embed"`` is routed to the
            Mistral client; any other value is sent to the OpenAI client.

    Returns:
        A list with the ``embedding`` of every item in the API response, in
        response order. Empty or ``None`` input returns ``[]`` without
        calling any API.
    """
    # Nothing to embed: skip the network round trip instead of sending an
    # empty batch to the provider.
    if inputs is None or len(inputs) == 0:
        return []

    if model == "mistral-embed":
        embeddings_batch_response = client.embeddings(
            model="mistral-embed",
            input=inputs,
        )
    else:
        embeddings_batch_response = openai_client.embeddings.create(
            model=model,
            input=inputs,
            encoding_format="float"
        )

    return [item.embedding for item in embeddings_batch_response.data]


In [28]:
import os
import re
from datetime import datetime

# Per-provider sub-directories, keyed by data kind. get_filenames() joins the
# selected entry under ../_data/<kind>/.
PROVIDERS = {
    "google": {
        "raw": "Takeout/My Activity/Search",
        "parsed": "google/search_history",
        "summary": "google/search_history_summary",
        "context": ""
    }
}


def get_filenames(
    kind="parsed", start_date="2018-11-07", end_date="2023-02-01", provider="google"
):
    """List the daily ``YYYY-MM-DD.csv`` files whose date is inside a range.

    The directory ``../_data/<kind>/<PROVIDERS[provider][kind]>`` is walked
    recursively.

    Args:
        kind: Data kind; used both as a path component and as the key into
            the provider entry.
        start_date: Inclusive lower bound, formatted ``YYYY-MM-DD``.
        end_date: Inclusive upper bound, formatted ``YYYY-MM-DD``.
        provider: Key into ``PROVIDERS``.

    Returns:
        Sorted list of matching file paths. Files whose name is not a valid
        calendar date are ignored.

    Raises:
        KeyError: If ``provider`` or ``kind`` is unknown.
        ValueError: If ``start_date`` or ``end_date`` is malformed.
    """
    directory = os.path.join("..", "_data", kind, PROVIDERS[provider][kind])
    range_start = datetime.strptime(start_date, "%Y-%m-%d")
    range_end = datetime.strptime(end_date, "%Y-%m-%d")
    file_pattern = re.compile(r"^(\d{4}-\d{2}-\d{2})\.csv$")

    filenames = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            match = file_pattern.match(file)
            if not match:
                continue
            try:
                file_date = datetime.strptime(match.group(1), "%Y-%m-%d")
            except ValueError:
                # The name looks like a date but is not one (e.g. 2020-13-45).
                continue
            if range_start <= file_date <= range_end:
                filenames.append(os.path.join(root, file))

    # os.walk yields entries in arbitrary order; sort so callers process the
    # files in a reproducible order.
    return sorted(filenames)

In [29]:
from openai import AsyncOpenAI
import httpx

# Async OpenAI-compatible client pointed at a hard-coded proxy URL instead of
# the default OpenAI endpoint. The underlying httpx pool is capped at 256
# connections (and 256 keep-alive connections).
# NOTE(review): no api_key is passed here; presumably the client picks it up
# from the environment - confirm.
custom_client = AsyncOpenAI(
  http_client=httpx.AsyncClient(
    limits=httpx.Limits(
      max_connections=256,
      max_keepalive_connections=256
    )
  ),
  base_url="https://3nkk8uq95cvr44-8000.proxy.runpod.net/v1"
)

async def get_completion(prompt):
  """Send `prompt` as a single user message and return the model's reply.

  Args:
    prompt: Text of the user message.

  Returns:
    The content of the first choice. On any failure the error is logged and
    an empty string is returned, so one bad request does not abort a large
    gather() of completions.
  """
  import logging

  try:
    res = await custom_client.chat.completions.create(
      model="mistralai/Mistral-7B-Instruct-v0.2",
      messages=[
        {"role": "user", "content": prompt},
      ]
    )

    return res.choices[0].message.content
  except Exception as e:
    # Deliberately best-effort, but leave a trace: empty answers would
    # otherwise be impossible to tell apart from failed requests.
    logging.getLogger(__name__).warning("get_completion failed: %r", e)
    return ""
  

In [30]:
# Smoke test: one round trip through get_completion() before the bulk run.
await get_completion("What is the capital of France?")

' The capital city of France is Paris. Paris is the most populous city in France and is known for its iconic landmarks such as the Eiffel Tower, Louvre Museum, Notre-Dame Cathedral, and Montmartre. It is also the political, cultural, and commercial center of France and is renowned for its fashion, gastronomy, art, and culture. Paris has long been a major European political and cultural center, and it continues to be a significant global city today.'

In [31]:
contextualization_prompt = "I will provide a Google search record from my search history. What is the search term referring to? What could have been my intent with such a search? Reply with 'NOT SURE' if you're not sure."

from tqdm.asyncio import tqdm_asyncio

top_dir = os.path.join("..", "_data", "context")

input_df = pd.DataFrame(columns=["date", "time", "raw"])
output_df = pd.DataFrame(columns=["date", "time", "raw", "description"])

for filename in (get_filenames()):
    file_df = pd.read_csv(filename)

    input_df = pd.concat(
            [
                input_df,
                pd.DataFrame(
                    {
                        "date": filename.split("/")[-1].split(".")[0],
                        "time": file_df["hour"],
                        "raw": file_df["title"],
                    }
                ),
            ],
            ignore_index=True,
        )

results = []

for i in range(0, len(input_df)):
    results.append(get_completion(f"{contextualization_prompt}\n{input_df.iloc[i]["raw"]}"))

for i, answer in enumerate(await tqdm_asyncio.gather(*results, smoothing=0)):
    output_df = pd.concat(
        [
            output_df,
            pd.DataFrame(
                {   
                    "date": input_df.iloc[i]["date"],
                    "time": [input_df.iloc[i]["time"]],
                    "raw": [input_df.iloc[i]["raw"]],
                    "description": [answer],
                }
            ),
        ],
        ignore_index=True,
    )



  0%|          | 0/106930 [00:00<?, ?it/s]

In [None]:
import os

# Write one CSV per distinct date. The target folder is built from two
# prefixes of the date string (the last 6 and the last 3 characters are
# dropped), nested under top_dir.
for day in output_df["date"].unique():
    day_dir = os.path.join(top_dir, day[:-6], day[:-3])
    if not os.path.exists(day_dir):
        os.makedirs(day_dir)

    rows_for_day = output_df[output_df["date"] == day]
    rows_for_day.to_csv(os.path.join(day_dir, f"{day}.csv"), index=False)

In [None]:
def parse_md_table(md_table):
    """Convert a Markdown pipe table into CSV text.

    Commas in the input are replaced by spaces first, so that cell contents
    cannot introduce extra CSV columns. Every line with a leading and a
    trailing pipe becomes one CSV row; the header separator row (cells made
    only of dashes and optional colons) and lines that are not table rows are
    dropped.

    Args:
        md_table: Markdown text containing a pipe-delimited table.

    Returns:
        The table as newline-joined CSV rows; an empty string when the input
        contains no table rows.
    """
    csv_lines = []
    for line in md_table.replace(",", " ").strip().split("\n"):
        # Cells sit between the first and the last pipe of the line.
        cells = [cell.strip() for cell in line.strip().split("|")[1:-1]]
        if not cells:
            # Not a table row (blank line, heading, prose...).
            continue

        # Detect the separator by content instead of assuming it is the
        # second line, so text before the table does not corrupt the result.
        is_separator = all(
            cell and "-" in cell and set(cell) <= {"-", ":"} for cell in cells
        )
        if is_separator:
            continue

        csv_lines.append(",".join(cells))

    return "\n".join(csv_lines)

In [None]:
import requests
import pandas as pd
from io import StringIO

taxonomy_url = "https://raw.githubusercontent.com/patcg-individual-drafts/topics/main/taxonomy_v2.md"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the table.
response = requests.get(taxonomy_url, timeout=30)
response.raise_for_status()
markdown_text = response.text

# Extract the table content
table_text = parse_md_table(markdown_text)

# Parse the table using pandas
patcg_topics_df = pd.read_csv(StringIO(table_text))

# Convert the dataframe to JSON
patcg_topics = patcg_topics_df.to_json(orient="records")

patcg_topics


'[{"ID":1,"Topic":"\\/Arts & Entertainment"},{"ID":350,"Topic":"\\/Arts & Entertainment\\/Celebrities & Entertainment News"},{"ID":351,"Topic":"\\/Arts & Entertainment\\/Comics & Animation"},{"ID":352,"Topic":"\\/Arts & Entertainment\\/Events & Listings"},{"ID":353,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Bars  Clubs & Nightlife"},{"ID":4,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Concerts & Music Festivals"},{"ID":354,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Event Ticket Sales"},{"ID":355,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Expos & Conventions"},{"ID":356,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Film Festivals"},{"ID":357,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Food & Beverage Events"},{"ID":9,"Topic":"\\/Arts & Entertainment\\/Events & Listings\\/Live Sporting Events"},{"ID":12,"Topic":"\\/Arts & Entertainment\\/Movies"},{"ID":13,"Topic":"\\/Arts & Entertainment\\/Movies\\/Action & Adventure Films

In [None]:
# seed taxonomy to use as root nodes
# added a description to each taxonomy to be embedded

from tqdm.notebook import tqdm

# taxonomy_context_prompt = """
#     Given a single taxon representing a category within a structured taxonomy, generate a descriptive paragraph that captures the potential interests and insights reflected by the taxon. The taxon is a path through a hierarchical classification system, indicating a progression from broad categories to more specific subcategories.

#     To process the taxon:
#     1. Break down the taxon to understand its hierarchical structure, identifying the primary category and any relevant subcategories.
#     2. Reflect on what the interest in this specific taxon suggests about the user's preferences, professional or educational pursuits, or personal hobbies.
#     3. Generate a descriptive paragraph that:
#     - Introduces the user's engagement with the main category and subcategories as indicated by the taxon.
#     - Highlights particular aspects, applications, or implications of the category that might capture the user's interest.
#     - Speculates on the broader significance of this interest, potentially connecting it to current trends, technological innovations, cultural phenomena, or historical contexts, where appropriate.
#     - Maintains a neutral and informative tone, aiming to elucidate insights rather than make assumptions about the user.

#     For example, given this taxon: "/Computers & Electronics/Software/Multimedia Software"

#     Generate a description like this: "This session unveils the user's engagement with the realm of "/Computers & Electronics/Software/Multimedia Software," highlighting an interest in technology, specifically in the tools and applications designed for creating, editing, and managing various forms of digital media. The user's focus on multimedia software suggests a keen interest in the intersection of technology and creativity, exploring how software can facilitate artistic expression, enhance digital content creation, and impact multimedia production. This curiosity may reflect a broader fascination with digital innovation and its role in shaping contemporary media landscapes."

#     Here is the new taxon:
# """

taxonomy_as_search_prompt = "I will provide a Google search record from my search history. What is the search term referring to? What could have been my intent with such a search?"

seed_taxonomy = {"root": "root"}

import os.path
import json

if not os.path.isfile("seed_taxonomy_v2.json"):
    df = patcg_topics_df

    coro = [
        get_completion(f"{taxonomy_as_search_prompt} {row['Topic']}")
        for i, row in df.iterrows()
    ]

    results = await tqdm_asyncio.gather(*coro, smoothing=0)

    for (i, row), res in zip(df.iterrows(), results):
        seed_taxonomy[row["Topic"]] = res

    json.dump(seed_taxonomy, open("seed_taxonomy_v2.json", "w"))

100%|██████████| 469/469 [00:32<00:00, 14.51it/s]


In [1]:
from pgvector.psycopg import register_vector
import psycopg
import os

# Turn the DATABASE_URL connection string into keyword arguments and open a
# single connection that every later cell shares.
connection_params = psycopg.conninfo.conninfo_to_dict(os.environ["DATABASE_URL"])
conn = psycopg.connect(**connection_params)
conn.autocommit = True

# Ensure the vector extension exists, then register the vector type on this
# connection.
conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
register_vector(conn)

In [2]:
# Rebuild the documents table from scratch. WARNING: this drops every stored
# row (search records and taxonomy entries alike) each time the cell runs.
# Each row holds the generated description, the date/time and raw text of the
# record, a 1536-dimensional embedding, and a flag marking taxonomy entries.
conn.execute("DROP TABLE IF EXISTS documents")
conn.execute(
    """CREATE TABLE documents (
             id bigserial PRIMARY KEY, 
             description text,
             date DATE, 
             time TIME,
             raw text,
             embedding vector(1536),
             is_taxonomy boolean DEFAULT FALSE
    )"""
)



<psycopg.Cursor [COMMAND_OK] [IDLE] (host=localhost port=5433 database=enclaveid) at 0x7fb44279d0d0>

In [3]:
import numpy as np


def store_embeddings(embeddings, metadata):
    """Insert each (embedding, metadata) pair as one row of `documents`.

    Args:
        embeddings: Iterable of embedding vectors.
        metadata: Iterable of dicts, paired positionally with `embeddings`.
            Each needs "description", "date", "time" and "raw";
            "is_taxonomy" is optional and defaults to False.

    Rows are inserted one at a time. A failure on one row is printed and the
    loop moves on to the next pair.
    """
    insert_sql = "INSERT INTO documents (description, date, time, raw, embedding, is_taxonomy) VALUES (%s, %s, %s, %s, %s, %s)"

    for vector, record in zip(embeddings, metadata):
        try:
            row_values = (
                record["description"],
                record["date"],
                record["time"],
                record["raw"],
                np.array(vector),
                record.get("is_taxonomy", False),
            )
            conn.execute(insert_sql, row_values)
        except Exception as e:
            print(e)

In [148]:
import json
# NOTE(review): this reads "seed_taxonomy.json", while the generation cell
# writes "seed_taxonomy_v2.json" - confirm which file is intended.
# Context manager so the file handle is closed as soon as the JSON is parsed.
with open("seed_taxonomy.json") as seed_file:
    seed_taxonomy = json.load(seed_file)

# Embed the descriptions (the dict values); dict order keeps them aligned
# with the metadata built from the same dict below.
embeddings = create_embeddings(list(seed_taxonomy.values()))
metadatas = [
    {
        "description": description,
        # Fixed placeholder date/time shared by all taxonomy rows.
        "date": "1970-01-01",
        "time": "00:00:00",
        "raw": taxon,
        "is_taxonomy": True
    }
    for taxon, description in seed_taxonomy.items()
]

store_embeddings(embeddings, metadatas)

In [11]:
import json
from tqdm.notebook import tqdm

# generate embeddings and save in db
for filename in tqdm(get_filenames("context")):
    df = pd.read_csv(filename)
    # The file name (without extension) is the date of every row in it.
    date = os.path.splitext(os.path.basename(filename))[0]
    df["date"] = date

    # A failed completion is stored as an empty string, which comes back from
    # the CSV as NaN. Drop those rows: there is no text to embed.
    df = df.dropna(subset=["description"])
    if df.empty:
        continue

    # .tolist() is positional, so it stays correct after rows were dropped.
    embeddings = create_embeddings(df["description"].tolist())

    store_embeddings(
        embeddings,
        json.loads(df.to_json(orient="records")),
    )

  0%|          | 0/178 [00:00<?, ?it/s]

In [150]:
# move ambiguous documents to a separate table
# A document is ambiguous when its description contains the text 'NOT SURE'.
# The table is recreated on every run, so earlier contents are lost.

conn.execute("DROP TABLE IF EXISTS ambiguous_documents")

# Same columns as the documents table.
conn.execute("""
CREATE TABLE ambiguous_documents (
    id bigserial PRIMARY KEY,
    description text,
    date DATE,
    time TIME,
    raw text,
    embedding vector(1536),
    is_taxonomy boolean DEFAULT FALSE
)
""")


# Copy the matching rows, keeping their original ids...
conn.execute("""
INSERT INTO ambiguous_documents (id, description, date, time, raw, embedding, is_taxonomy)
SELECT id, description, date, time, raw, embedding, is_taxonomy
FROM documents
WHERE description LIKE '%NOT SURE%'
""")

# ...then remove them from documents. The connection is in autocommit mode,
# so the copy and the delete are two separate statements, not one transaction.
conn.execute("""
DELETE FROM documents
WHERE description LIKE '%NOT SURE%'
""")


<psycopg.Cursor [COMMAND_OK] [IDLE] (host=localhost port=5433 database=enclaveid) at 0x7f6401c80b90>

In [151]:
# Calculate the 10th percentile for time intervals (in seconds)
# Gaps are measured between full timestamps (date + time), not between
# times of day, so two consecutive records on different days no longer show
# up as a spurious short (or negative) interval.
time_threshold = conn.execute("""
WITH LaggedDocuments AS (
    SELECT
        (date + time) AS ts,
        LAG(date + time) OVER (ORDER BY date, time) AS prev_ts
    FROM
        documents
    WHERE
        is_taxonomy = FALSE
),
TimeDifferences AS (
    SELECT
        EXTRACT(EPOCH FROM (ts - prev_ts)) AS time_diff
    FROM
        LaggedDocuments
    WHERE
        ts > prev_ts
)
SELECT
    percentile_cont(0.10) WITHIN GROUP (ORDER BY time_diff) AS time_interval_10th
FROM
    TimeDifferences;

""").fetchone()[0]

In [152]:
# Calculate the 90th percentile for embedding similarities using cosine similarity
# Each non-taxonomy document is compared with the one right before it in
# (date, time) order. The first row has no predecessor, so its similarity is
# NULL and is filtered out before the percentile is taken.
embedding_similarity_threshold = conn.execute("""
    WITH CosineSimilarities AS (
        SELECT
            date,
            time,
            1 - (embedding <=> LAG(embedding) OVER (ORDER BY date, time)) AS cosine_similarity
        FROM
            documents
        WHERE
            is_taxonomy = FALSE
    ),
    FilteredSimilarities AS (
        SELECT
            cosine_similarity
        FROM
            CosineSimilarities
        WHERE
            cosine_similarity IS NOT NULL
    )
    SELECT
        percentile_cont(0.90) WITHIN GROUP (ORDER BY cosine_similarity) AS embedding_similarity_90th
    FROM
        FilteredSimilarities;
""").fetchone()[0]

In [153]:
# Show both thresholds computed in the two previous cells.
print(f"Time threshold: {time_threshold} seconds. Embedding similarity threshold: {embedding_similarity_threshold}")

Time threshold: 60.0 seconds. Embedding similarity threshold: 0.8296323921172489


In [154]:
# merge similar documents within the time and embedding similarity thresholds
# The query pairs every two non-taxonomy documents (a.id < b.id) whose
# timestamps differ by at most the given number of minutes.
pairs_within_window_sql = """
SELECT a.id, b.id, (1 - (a.embedding <=> b.embedding)) AS similarity
FROM documents a
JOIN documents b ON a.id < b.id
WHERE ABS(EXTRACT(EPOCH FROM (
    (a.date || ' ' || a.time)::timestamp - 
    (b.date || ' ' || b.time)::timestamp))/60) <= %s
    AND NOT (a.is_taxonomy = TRUE OR b.is_taxonomy = TRUE )
"""

# time_threshold is in seconds; the query compares minutes.
window_minutes = time_threshold / 60
records = conn.execute(pairs_within_window_sql, (window_minutes,)).fetchall()

# Keep only the pairs that are also similar enough.
candidates_to_merge = [
    (doc_id_a, doc_id_b)
    for doc_id_a, doc_id_b, similarity in records
    if similarity >= embedding_similarity_threshold
]

print(len(candidates_to_merge),candidates_to_merge)  # should be =< the total number of chunks

# Delete the oldest duplicate document
# for a, b in candidates_to_merge:
#     conn.execute("DELETE FROM documents WHERE id = %s", (a,))


516 [(503, 504), (485, 487), (520, 521), (747, 748), (525, 526), (472, 473), (479, 480), (491, 493), (498, 499), (500, 501), (500, 502), (501, 502), (510, 511), (532, 533), (532, 534), (533, 534), (535, 536), (536, 537), (536, 538), (537, 538), (539, 540), (539, 541), (540, 541), (544, 545), (548, 549), (558, 559), (561, 562), (578, 579), (587, 588), (589, 590), (590, 591), (619, 620), (650, 651), (651, 652), (675, 676), (745, 746), (789, 790), (791, 792), (793, 794), (795, 796), (795, 798), (795, 800), (796, 797), (796, 798), (796, 800), (797, 798), (797, 800), (798, 800), (800, 801), (810, 811), (810, 812), (887, 888), (889, 890), (856, 858), (858, 859), (870, 871), (870, 872), (870, 873), (871, 872), (871, 873), (877, 878), (885, 886), (907, 909), (1043, 1044), (933, 934), (941, 942), (955, 956), (968, 969), (970, 971), (974, 975), (988, 989), (994, 995), (1000, 1001), (1007, 1008), (1016, 1017), (1042, 1043), (1042, 1044), (1049, 1050), (1053, 1056), (1054, 1056), (1078, 1079), (10

In [198]:
# Rebuild the edges table: one row per directed, weighted link between two
# document ids. Dropping it first discards any edges from an earlier run.
conn.execute("DROP TABLE IF EXISTS edges")
conn.execute(
    """CREATE TABLE edges (
             id bigserial PRIMARY KEY, 
             parent_id bigint,
             child_id bigint,
             weight float
    )"""
)

<psycopg.Cursor [COMMAND_OK] [IDLE] (host=localhost port=5433 database=enclaveid) at 0x7f63fbfbcdd0>

In [199]:
# Build an HNSW index with cosine operators on documents.embedding.
# NOTE(review): there is no IF NOT EXISTS and no index name, so re-running
# this cell on the same table presumably adds another index - confirm.
conn.execute(
    """
CREATE INDEX ON documents
USING hnsw(embedding vector_cosine_ops)
WITH (m = 24, ef_construction = 100);
"""
)

<psycopg.Cursor [COMMAND_OK] [IDLE] (host=localhost port=5433 database=enclaveid) at 0x7f63fbfbcd10>

In [200]:
# create dag of seed taxonomy

root_row = conn.execute("SELECT id FROM documents WHERE is_taxonomy and description = 'root'").fetchone()
if root_row is None:
    # Fail with a clear message instead of an unpacking TypeError.
    raise RuntimeError("No taxonomy root found in documents; store the seed taxonomy first")
root_id, = root_row

taxons = conn.execute("SELECT id, raw FROM documents WHERE is_taxonomy and description <> 'root'").fetchall()
tag_to_id = {tag: taxon_id for taxon_id, tag in taxons}

orphan_tags = []
for taxon_id, tag in taxons:
    if tag.count('/') == 1:
        # Exactly one '/' means a top-level tag: attach it to the root.
        parent_id = root_id
    else:
        # The parent tag is the tag without its last path segment. An empty
        # or unknown parent tag means there is nothing to attach to.
        parent_tag = '/'.join(tag.split('/')[:-1])
        parent_id = tag_to_id.get(parent_tag) if parent_tag else None

    if parent_id is None:
        orphan_tags.append(tag)
        continue

    conn.execute("INSERT INTO edges (parent_id, child_id, weight) VALUES (%s, %s, %s)", (parent_id, taxon_id, 1.0))

if orphan_tags:
    # These taxons get no incoming edge; report them rather than dropping
    # them silently.
    print(f"{len(orphan_tags)} taxon(s) without a known parent: {orphan_tags}")
            

In [201]:
def create_dag():
    """Insert one edge per document into `edges`, based on embedding similarity.

    Steps performed by the single SQL statement:
      1. DocumentPairs: pair every document `a` with every document `b` that
         is strictly earlier by (date, time), and compute their similarity as
         1 minus the `<=>` distance of the embeddings.
      2. RankedPairs: rank the pairs of each document by descending
         similarity, separately for taxonomy and non-taxonomy partners.
      3. FilteredPairs1: keep the top-ranked partner of each kind; a
         non-taxonomy partner is kept only if its similarity is above 0.6.
      4. FilteredPairs2: collapse to one row per document.
      5. Insert (parent_id=doc_id, child_id=compared_doc_id,
         weight=1-similarity).

    Returns:
        The cursor returned by `conn.execute`.

    NOTE(review): step 4 takes MAX(compared_doc_id) and MAX(similarity)
    independently, so when both a taxonomy and a non-taxonomy partner survive
    the stored weight may belong to a different partner than the stored id -
    confirm this is intended.
    NOTE(review): here the newer document is written as parent_id and the
    earlier/taxonomy one as child_id - confirm this direction matches how the
    taxonomy edges are stored.
    """
    return conn.execute(
        """
    WITH DocumentPairs AS (
        SELECT
            a.id AS doc_id,
            b.id AS compared_doc_id,
            b.is_taxonomy AS compared_is_taxonomy,
            (1 - (a.embedding <=> b.embedding)) AS similarity,
            a.date AS doc_date,
            a.time AS doc_time,
            b.date AS compared_doc_date,
            b.time AS compared_doc_time
        FROM
            documents a
        JOIN
            documents b ON a.id != b.id AND 
                        (a.date > b.date OR (a.date = b.date AND a.time > b.time))
    ),
    RankedPairs AS (
        SELECT
            *,
            ROW_NUMBER() OVER(PARTITION BY doc_id, compared_is_taxonomy ORDER BY similarity DESC) AS rank
        FROM
            DocumentPairs
    ), FilteredPairs1 as (
        SELECT
            doc_id,
            compared_doc_id,
            compared_is_taxonomy,
            similarity
        FROM
            RankedPairs
        WHERE
            rank = 1 AND
            ((compared_is_taxonomy = FALSE AND similarity > 0.6) OR compared_is_taxonomy = TRUE)
    ), FilteredPairs2 as (
        SELECT
            doc_id,
            MAX(compared_doc_id) AS compared_doc_id,
            MAX(similarity) AS similarity
        FROM
            FilteredPairs1
        group by doc_id
    )
    INSERT INTO edges (parent_id, child_id, weight)
    SELECT
        doc_id,
        compared_doc_id,
        1-similarity
    FROM
        FilteredPairs2;
""")

In [202]:
# Populate the edges table with the similarity-based links.
create_dag()

<psycopg.Cursor [COMMAND_OK] [IDLE] (host=localhost port=5433 database=enclaveid) at 0x7f63fbfbca10>

In [203]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from datetime import datetime

# Earliest and latest timestamp among non-taxonomy documents; get_node_color()
# uses them as the two ends of its colour scale.
min_datetime = conn.execute(
    "SELECT MIN(CAST(date || ' ' || time AS TIMESTAMP)) FROM documents WHERE NOT is_taxonomy"
).fetchone()[0]
max_datetime = conn.execute(
    "SELECT MAX(CAST(date || ' ' || time AS TIMESTAMP)) FROM documents WHERE NOT is_taxonomy"
).fetchone()[0]

def get_node_color(date, time_end):
    """Return the hex colour for a point in time on the 'coolwarm' colormap.

    Args:
        date: Date string formatted as YYYY-MM-DD.
        time_end: Time string formatted as HH:MM:SS, or the literal string
            'None', which is treated as midnight.

    Returns:
        Hex colour string. The scale runs from `min_datetime` to
        `max_datetime`.
    """
    # Callers pass str(value), so a missing time arrives as the text 'None'.
    clock = "00:00:00" if time_end == 'None' else time_end

    moment = datetime.strptime(f"{date} {clock}", "%Y-%m-%d %H:%M:%S").timestamp()

    palette = plt.get_cmap('coolwarm')
    scale = mcolors.Normalize(vmin=min_datetime.timestamp(), vmax=max_datetime.timestamp())

    # Normalise the timestamp, look it up in the colormap, convert to hex.
    return mcolors.to_hex(palette(scale(moment)))


In [210]:
import networkx as nx
import matplotlib.pyplot as plt

# Build a fresh directed graph with one node per stored document.
dag = nx.DiGraph()

# The node label is the raw text and the title is the description. Taxonomy
# nodes are green; every other node is coloured by its date and time.
for doc_id, description, doc_date, doc_time, is_taxonomy, raw in conn.execute(
    "SELECT id, description, date, time, is_taxonomy, raw FROM documents"
):
    if is_taxonomy:
        node_color = "#00FF00"
    else:
        node_color = get_node_color(str(doc_date), str(doc_time))
    dag.add_node(doc_id, label=str(raw), title=description, color=node_color)

In [211]:
# Add edges to the graph
# Every row of the edges table becomes a directed edge parent_id -> child_id
# carrying its weight. add_edge() also creates a node for any id that was not
# added by the node-loading cell.
for row in conn.execute("""
                        SELECT parent_id, child_id, weight FROM edges
                        """): #WHERE parent_id IS NOT NULL
    parent_id, child_id, weight = row
    dag.add_edge(parent_id, child_id, weight=weight)

In [212]:
import itertools

is_dag = nx.is_directed_acyclic_graph(dag)

if is_dag:
    for layer, nodes in enumerate(nx.topological_generations(dag)):
        # `multipartite_layout` expects the layer as a node attribute, so add the
        # numeric layer value as a node attribute
        for node in nodes:
            dag.nodes[node]["layer"] = layer
else:
    # Topological generations are undefined for a cyclic graph, so skip the
    # layering instead of raising. simple_cycles() is a generator that can
    # yield a huge number of cycles; a handful is enough to debug.
    print("Graph has cycles, no layers assigned. Sample cycles:")
    print(list(itertools.islice(nx.simple_cycles(dag), 10)))

In [213]:
# Export the graph, with its node and edge attributes, as GraphML.
nx.write_graphml(dag, "../_data/dag_multi.graphml") # for graphia