In [24]:
from openai import OpenAI
import os
import pandas as pd

In [25]:
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [26]:
# Inclusive date window (YYYY-MM-DD strings). Used as the default range by
# get_filenames and as the default color-scale bounds by get_node_color.
START_DATE="2023-08-01"
END_DATE="2023-08-03"

In [27]:
# Shared OpenAI client used by create_embeddings; the API key is read from the
# environment (a missing OPENAI_API_KEY raises KeyError here).
openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def create_embeddings(inputs):
    """Embed a batch of texts with the ``text-embedding-3-small`` model.

    Args:
        inputs: Sequence of strings, forwarded as the ``input`` field of a
            single embeddings request.

    Returns:
        A list with the ``embedding`` of every item in the response's
        ``data``, in response order. An empty ``inputs`` returns ``[]``
        without issuing a request.
    """
    # Nothing to embed: return early instead of sending an empty batch.
    if len(inputs) == 0:
        return []

    embeddings_batch_response = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=inputs,
        encoding_format="float",
    )

    return [item.embedding for item in embeddings_batch_response.data]


In [45]:
import os
import re
from datetime import datetime

# Per-provider sub-directories, keyed by data kind. get_filenames joins
# "../_data/<kind>/" with the entry for that kind to find the daily CSV files.
PROVIDERS = {
    "google": {
        "raw": "Takeout/My Activity/Search",
        "parsed": "google/search_history",
        "summary": "google/search_history_summary",
        # Empty: context files are searched directly under ../_data/context
        "context": "",
    }
}


def get_filenames(
    kind="parsed", start_date=START_DATE, end_date=END_DATE, provider="google"
):
    """List the daily ``YYYY-MM-DD.csv`` files of one kind within a date range.

    The search root is ``../_data/<kind>/<PROVIDERS[provider][kind]>`` and is
    walked recursively; only file names of the exact form ``YYYY-MM-DD.csv``
    are considered.

    Args:
        kind: Data kind; both a directory name under ``../_data`` and a key of
            ``PROVIDERS[provider]``.
        start_date: First day to include, as ``YYYY-MM-DD`` (inclusive).
        end_date: Last day to include, as ``YYYY-MM-DD`` (inclusive).
        provider: Key of ``PROVIDERS``.

    Returns:
        The matching file paths, sorted by path so the order is reproducible.

    Raises:
        KeyError: If ``provider`` or ``kind`` is not present in ``PROVIDERS``.
        ValueError: If a date string (argument or file name) is not a valid
            ``YYYY-MM-DD`` date.
    """
    directory = os.path.join("..", "_data", kind, PROVIDERS[provider][kind])
    range_start = datetime.strptime(start_date, "%Y-%m-%d")
    range_end = datetime.strptime(end_date, "%Y-%m-%d")
    file_pattern = re.compile(r"^(\d{4}-\d{2}-\d{2})\.csv$")

    filenames = []
    # The sub-directory list is unused: os.walk descends into it on its own.
    for root, _dirs, files in os.walk(directory):
        for file in files:
            match = file_pattern.match(file)
            if match is None:
                continue
            file_date = datetime.strptime(match.group(1), "%Y-%m-%d")
            if range_start <= file_date <= range_end:
                filenames.append(os.path.join(root, file))

    # os.walk yields directory entries in arbitrary order; sort for determinism.
    return sorted(filenames)

In [29]:
from mistralai.models.chat_completion import ChatMessage
from mistralai.client import MistralClient
import asyncio

# Shared Mistral client used by get_completion; the API key is read from the
# environment (a missing MISTRAL_API_KEY raises KeyError here).
client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])


def get_completion(prompt, model="mistral-tiny"):
    """Ask the Mistral chat client a single question and return its answer.

    Args:
        prompt: Text sent as the content of one ``user`` message.
        model: Model name forwarded to ``client.chat``.

    Returns:
        The ``message.content`` of the first choice in the chat response.
    """
    user_message = ChatMessage(role="user", content=prompt)
    reply = client.chat(model=model, messages=[user_message])
    first_choice = reply.choices[0]
    return first_choice.message.content

In [30]:
import json_repair


def extract_json(text, start_char="[", end_char="]"):
    """Parse the outermost ``start_char ... end_char`` span found in ``text``.

    Newlines are removed first. The span runs from the first occurrence of
    ``start_char`` to the last occurrence of ``end_char`` (both included) and
    is handed to ``json_repair.loads``.

    Args:
        text: Raw text that may contain a JSON fragment.
        start_char: Opening delimiter of the fragment.
        end_char: Closing delimiter of the fragment.

    Returns:
        The value produced by ``json_repair.loads``, or ``{}`` when no span is
        found (a delimiter is missing, or the last ``end_char`` does not come
        after the first ``start_char``) or when parsing raises.
    """
    flattened = text.replace("\n", "")
    start_index = flattened.find(start_char)
    end_index = flattened.rfind(end_char)

    if start_index == -1 or end_index == -1 or start_index >= end_index:
        return {}

    try:
        return json_repair.loads(flattened[start_index : end_index + 1])
    except Exception:
        # Parsing stays best-effort, but unlike a bare `except:` this lets
        # KeyboardInterrupt / SystemExit propagate so the cell can be stopped.
        return {}

In [31]:
context_enrichment_prompt = """
Here is a Google search record from my search history. What is the search term referring to? Explain in detail what is the object of the search. Reply "NOT SURE" if you are not sure.
"""

top_dir = os.path.join("..", "_data", "context")

# exist_ok replaces the separate existence check before creating the directory.
os.makedirs(top_dir, exist_ok=True)

from tqdm.auto import tqdm


# For every daily file returned by get_filenames(), ask the model to describe
# each search record and write the answers to one CSV per day.
for filename in tqdm(get_filenames()):
    df = pd.read_csv(filename)  # daily df
    # File names look like YYYY-MM-DD.csv; basename/splitext does not depend on
    # the path separator the way splitting on "/" does.
    date = os.path.splitext(os.path.basename(filename))[0]

    # Output layout: <top_dir>/YYYY/YYYY-MM/YYYY-MM-DD.csv
    out_path = os.path.join(top_dir, date[:-6], date[:-3])
    os.makedirs(out_path, exist_ok=True)

    # Collect plain records and build the frame once, instead of growing a
    # DataFrame with pd.concat on every iteration.
    records = []
    for i in tqdm(range(0, len(df))):
        record = df.iloc[i]["title"]
        hour = df.iloc[i]["hour"]

        answer = get_completion(f"{context_enrichment_prompt}\n{record}")
        records.append({"time": hour, "description": answer})

    contexts_df = pd.DataFrame(records, columns=["time", "description"])
    contexts_df.to_csv(os.path.join(out_path, f"{date}.csv"))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/97 [00:00<?, ?it/s]

  0%|          | 0/75 [00:00<?, ?it/s]

  0%|          | 0/71 [00:00<?, ?it/s]

  0%|          | 0/67 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [66]:
from pgvector.psycopg import register_vector
import psycopg
import os

# Open the shared connection from the DATABASE_URL connection string, with
# autocommit enabled so statements take effect without an explicit commit.
conn = psycopg.connect(os.environ["DATABASE_URL"], autocommit=True)

In [67]:
# Ensure the "vector" extension exists in the database, then apply pgvector's
# register_vector to this connection (store_embeddings later passes numpy
# arrays for the embedding column).
conn.execute("CREATE EXTENSION IF NOT EXISTS vector")
register_vector(conn)

In [68]:
# Recreate the documents table from scratch: re-running this cell drops any
# rows that were stored before.
conn.execute("DROP TABLE IF EXISTS documents")
# One row per stored text: description, date/time, optional tag, the id of its
# parent row (create_dag fills it in for rows where it is NULL) and the embedding.
# NOTE(review): 1536 presumably matches the vector size returned by the model
# used in create_embeddings - confirm if the model changes.
conn.execute(
    """CREATE TABLE documents (
             id bigserial PRIMARY KEY, 
             description text,
             date DATE, 
             time TIME,
             tag TEXT,
             parent_id BIGINT,
             embedding vector(1536)
    )"""
)

<psycopg.Cursor [COMMAND_OK] [IDLE] (host=localhost port=5433 database=enclaveid) at 0x7fddd042ab10>

In [69]:
import numpy as np

def store_embeddings(embeddings, metadata):
    """Insert one ``documents`` row per (embedding, metadata) pair.

    Args:
        embeddings: Iterable of embedding vectors; each is wrapped in a numpy
            array before being passed to the database.
        metadata: Iterable of dicts paired positionally with ``embeddings``.
            ``description``, ``date`` and ``time`` are required keys; ``tag``
            and ``parent_id`` are optional and stored as NULL when absent.

    Returns:
        None. Any exception raised while inserting a pair is printed and the
        loop moves on to the next pair.
    """
    insert_sql = "INSERT INTO documents (description, date, time, tag, parent_id, embedding) VALUES (%s, %s, %s, %s, %s, %s)"

    for vector, fields in zip(embeddings, metadata):
        try:
            conn.execute(
                insert_sql,
                (
                    fields["description"],
                    fields["date"],
                    fields["time"],
                    fields.get("tag"),
                    fields.get("parent_id"),
                    np.array(vector),
                ),
            )
        except Exception as e:
            print(e)

In [70]:
# Seed nodes to be used as centroids: each key is a tag and each value is the
# reference text that gets embedded and stored under that tag, so that other
# documents can later be attached to it by embedding similarity.

seed_taxonomy = {
    "Lack of context": "Without more information about the specific context of your search, it's difficult to determine which definition applies. If you were looking for information related to any of these definitions, please provide more details so I can give a more accurate answer.",
    "Intellect": """
    Sciences

        Physics: Studying the fundamental principles of the universe, including quantum mechanics and relativity.
        Chemistry: Exploring the properties, composition, and reactions of matter.
        Biology: Understanding living organisms, genetics, and evolution.
        Mathematics: Engaging with abstract concepts, theories, and mathematical proofs.
        Computer Science: Learning about algorithms, data structures, and coding.

    Humanities

        History: Investigating the events, societies, and cultures of the past.
        Philosophy: Delving into the nature of existence, reality, and ethics.
        Literature: Analyzing texts and exploring themes, narratives, and literary techniques.
        Languages: Studying the structure, evolution, and use of languages.
        Art History: Examining visual arts and their historical development.

    Social Sciences

        Psychology: Understanding human behavior, cognition, and emotion.
        Sociology: Studying societies, social behaviors, and cultures.
        Economics: Analyzing the production, distribution, and consumption of goods and services.
        Political Science: Investigating systems of governance, political activities, and theories.
        Anthropology: Exploring human societies, cultures, and their development.

    Applied Sciences and Engineering

        Engineering: Solving practical problems through the application of scientific principles.
        Medicine and Healthcare: Studying the science of healing and the maintenance of health.
        Environmental Science: Understanding the natural environment and how to protect it.
        Data Science: Analyzing complex datasets to extract insights and inform decisions.

    Arts and Creative Pursuits

        Creative Writing: Crafting original works of fiction, poetry, and non-fiction.
        Music Theory: Understanding the structure, progression, and composition of music.
        Theater and Performance Arts: Exploring the art of performance, direction, and production.
        Visual Arts: Engaging with painting, sculpture, and other visual media from a critical perspective.

    Miscellaneous

        Philosophy of Science: Examining the foundations, methods, and implications of science.
        Cognitive Science: Studying the mind and its processes through an interdisciplinary lens.
        Logic and Critical Thinking: Learning to reason clearly and critically evaluate arguments.
        Astronomy and Astrophysics: Exploring celestial phenomena and the universe.
    """,
    "Openness": """
        Artistic Pursuits:

        Painting and Drawing: Engaging in visual arts as a means of expression and creativity.
        Sculpting: Working with clay, metal, or other materials to create three-dimensional art.
        Photography: Exploring different perspectives and capturing moments through the lens.

    Creative Writing:

        Poetry: Writing poems to express emotions, experiences, or vivid imagery.
        Short Stories: Crafting fictional narratives with unique characters and plots.
        Blogging: Sharing personal insights, experiences, or creative writing pieces online.

    Performing Arts:

        Theater: Acting in plays, musicals, or improv groups.
        Dance: Participating in various styles of dance, from ballet to contemporary.
        Music: Playing instruments, singing, songwriting, or composing music.

    Cultural Exploration:

        Traveling: Visiting new countries or regions to experience different cultures firsthand.
        Learning Languages: Studying foreign languages to better understand and communicate with people from other cultures.
        Culinary Exploration: Trying and cooking diverse cuisines from around the world.

    Nature and Outdoor Activities:

        Hiking and Camping: Exploring natural landscapes and enjoying the outdoors.
        Gardening: Cultivating plants, flowers, or vegetables for enjoyment and sustainability.
        Birdwatching: Observing and learning about different species of birds in their natural habitats.

    Spiritual and Philosophical Exploration:

        Meditation and Mindfulness: Practicing techniques to foster a sense of presence and inner peace.
        Philosophical Discussions: Engaging in conversations about life's big questions and various philosophical perspectives.
        Yoga: Combining physical postures, breathing techniques, and meditation for health and relaxation.

    Experimental Hobbies:

        DIY Projects: Creating or building things by hand, from home decor to electronic gadgets.
        Experimental Cooking: Trying out new recipes or inventing dishes with unique ingredients.
        Collecting: Gathering items of personal interest, such as vinyl records, vintage clothing, or antiques.
    """,
}


# Embed every seed text, then store each one under its tag with a fixed
# 1970-01-01 00:00 timestamp and parent_id 0.
seed_embeddings = create_embeddings(list(seed_taxonomy.values()))
seed_metadata = [
    {
        "description": seed_text,
        "date": "1970-01-01",
        "time": "00:00",
        "tag": seed_tag,
        "parent_id": 0,
    }
    for seed_tag, seed_text in seed_taxonomy.items()
]
store_embeddings(seed_embeddings, seed_metadata)

In [71]:
import json

# Embed the descriptions of every daily context file and store them together
# with the rest of each row's columns.
for filename in get_filenames("context"):
    df = pd.read_csv(filename)
    # An empty day has nothing to embed or store; skip it rather than issuing
    # an embeddings request with no input.
    if df.empty:
        continue

    # File names look like YYYY-MM-DD.csv; basename/splitext does not depend on
    # the path separator the way splitting on "/" does.
    date = os.path.splitext(os.path.basename(filename))[0]
    df["date"] = date

    embeddings = create_embeddings(df["description"].tolist())

    store_embeddings(
        embeddings,
        # Round-trip through JSON so every row becomes a plain dict of
        # JSON-native values.
        json.loads(df.to_json(orient="records")),
    )

In [78]:
def get_closest_node(document_id):
    """Find the most similar document among those that precede ``document_id``.

    Candidates are all other rows whose date is earlier than the document's
    date, or whose date is equal and whose time is earlier. They are ordered by
    the ``<=>`` distance (cosine distance in pgvector) between their embedding
    and the document's embedding, and only the nearest one is kept.

    Args:
        document_id: ``id`` of the row to find a predecessor for.

    Returns:
        The result of ``fetchone()``: a row whose first element is the
        candidate's id, or ``None`` when the query yields no row.
    """
    params = {"id": document_id}
    cursor = conn.execute(
        """SELECT id FROM documents WHERE 
            id != %(id)s AND (
                date < (SELECT date FROM documents WHERE id = %(id)s) 
                OR (date = (SELECT date FROM documents WHERE id = %(id)s) AND time < (SELECT time FROM documents WHERE id = %(id)s))
            )
            ORDER BY embedding <=> (SELECT embedding FROM documents WHERE id = %(id)s) LIMIT 1""",
        params,
    )
    return cursor.fetchone()


def create_dag():
    """Give every parentless document its closest earlier document as parent.

    Selects all rows whose ``parent_id`` is NULL, looks up each one's nearest
    predecessor with ``get_closest_node`` and writes that id into
    ``parent_id``. Rows for which no predecessor is found are left untouched
    (``parent_id`` stays NULL).

    Returns:
        None. The ``documents`` table is updated in place.
    """
    orphan_rows = conn.execute(
        "SELECT id FROM documents WHERE parent_id IS NULL"
    ).fetchall()

    for row in orphan_rows:
        document_id = row[0]
        closest_node = get_closest_node(document_id)
        if closest_node is None:
            # get_closest_node returns None when its query finds no earlier
            # document; indexing it would raise TypeError, so skip this row.
            continue

        conn.execute(
            "UPDATE documents SET parent_id = %(parent_id)s WHERE id = %(id)s",
            {"parent_id": closest_node[0], "id": document_id},
        )

# Link every document that has no parent yet; this updates the documents table in place.
create_dag()

In [79]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

def get_node_color(date, max_date=END_DATE, min_date=START_DATE):
    """Map a ``YYYY-MM-DD`` date to a hex color on the ``coolwarm`` colormap.

    All three dates are converted to timestamps; ``date`` is normalized
    against the ``min_date``..``max_date`` span and looked up in the colormap.

    Args:
        date: Date to color, as ``YYYY-MM-DD``.
        max_date: Date used as the upper bound (``vmax``) of the normalization.
        min_date: Date used as the lower bound (``vmin``) of the normalization.

    Returns:
        The color as a hex string produced by ``mcolors.to_hex``.
    """

    def to_timestamp(day):
        # Parse a YYYY-MM-DD string and return its timestamp.
        return datetime.strptime(day, "%Y-%m-%d").timestamp()

    date_ts = to_timestamp(date)
    upper_ts = to_timestamp(max_date)
    lower_ts = to_timestamp(min_date)

    scale = mcolors.Normalize(vmin=lower_ts, vmax=upper_ts)
    palette = plt.get_cmap('coolwarm')

    return mcolors.to_hex(palette(scale(date_ts)))


In [80]:
import networkx as nx
import matplotlib.pyplot as plt

# Create a new directed graph
dag = nx.DiGraph()

# Add one node per stored document: labelled with its date, titled with its
# description and colored by its date.
for row in conn.execute("SELECT id, description, date FROM documents"):
    node_date = str(row[2])
    dag.add_node(
        row[0],
        label=node_date,
        title=row[1],
        # Only the date is passed, so the default START_DATE..END_DATE bounds
        # are used (the previous call had a stray, syntactically invalid
        # second argument).
        color=get_node_color(node_date),
    )

# Add one edge per document that has a parent, directed parent -> child.
# NOTE(review): seed rows are stored with parent_id 0, so they also match this
# query; add_edge will create a node 0 that has no label/title/color - confirm
# that this implicit root is intended.
for row in conn.execute("SELECT id, parent_id FROM documents WHERE parent_id IS NOT NULL"):
    dag.add_edge(row[1], row[0])

In [81]:
# Sanity check: report any cycles if the graph is not acyclic.
is_dag = nx.is_directed_acyclic_graph(dag)

if not is_dag:
    cycles = list(nx.simple_cycles(dag))
    print(cycles)

# Record each node's topological generation as a numeric "layer" node
# attribute (the form `multipartite_layout` expects).
generations = nx.topological_generations(dag)
for depth, members in enumerate(generations):
    for member in members:
        dag.nodes[member]["layer"] = depth

In [82]:
from pyvis.network import Network

# Render the graph as a directed pyvis network with the settings panel
# enabled, and write it to an HTML file under ../_data.
nt = Network(
    height="2048px",
    width="2048px",
    directed=True,
    notebook=True,
    layout=True,
)
nt.from_nx(dag)
nt.show_buttons()
nt.show("../_data/nx_context.html")

Local cdn resources have problems on chrome/safari when used in jupyter-notebook. 
