In [1]:
# I manually created the "docs" section of docexplore.json from https://classics.mit.edu/Tzu/artwar.html
# Let's load that

import json

# Read the hand-assembled input file. The encoding is passed explicitly so the
# result does not depend on the platform's locale default (JSON text is UTF-8).
with open("docexplore.json", encoding="utf-8") as handle:
    data = json.load(handle)

In [2]:
# Set up langchain embeddings
import os
from langchain.storage.file_system import LocalFileStore
from langchain_openai import OpenAIEmbeddings
from langchain.embeddings.cache import CacheBackedEmbeddings

# Directory under the user's home that backs the on-disk embedding store.
cache_dir = os.path.expanduser('~/.langchain-embeddings')
file_store = LocalFileStore(cache_dir)
base = OpenAIEmbeddings()
# Wrap the OpenAI embedder with the file-backed store. The namespace is the
# embedding model's name, so entries from different models are kept apart.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    base,
    file_store,
    namespace=base.model,
)

In [3]:
# Define classify() and cluster() functions
import numpy as np
from typing import List
from sklearn.cluster import BisectingKMeans
from sklearn.metrics import silhouette_score, silhouette_samples


def classify(docs: List[str], topics: List[str], **kwargs):
    """Score every document against every topic by embedding dot product.

    Both lists are embedded with the module-level ``cached_embeddings`` and the
    result is the matrix product of the document and topic embeddings.

    Args:
        docs: Document texts, one row of the result each.
        topics: Topic texts, one column of the result each.
        **kwargs: Accepted for call-site compatibility; not used.

    Returns:
        A 2-D numpy array of shape ``(len(docs), len(topics))`` where entry
        ``[i, j]`` is the dot product of document ``i`` and topic ``j``.
        NOTE(review): this equals cosine similarity only if the embeddings are
        unit-length -- presumably true for OpenAI embeddings; confirm.
    """
    # An empty list would embed to a 1-D empty array and make the product fail
    # with a shape error, so return the correctly shaped empty matrix instead.
    if not docs or not topics:
        return np.zeros((len(docs), len(topics)))
    doc_embed = np.array(cached_embeddings.embed_documents(docs))
    topic_embed = np.array(cached_embeddings.embed_documents(topics))
    return np.dot(doc_embed, topic_embed.T)


def cluster(docs: List[str], n: int = 20, random_state=None, **kwargs):
    """Cluster documents by their embeddings with bisecting k-means.

    Args:
        docs: Document texts; embedded with the module-level ``cached_embeddings``.
        n: Number of clusters to produce.
        random_state: Seed forwarded to ``BisectingKMeans``. The default
            ``None`` keeps the unseeded behaviour; pass an int to make the
            clustering repeatable between runs.
        **kwargs: Accepted for call-site compatibility; not used.

    Returns:
        A dict with:
            "label": the fitted model's cluster label for each document.
            "score": ``silhouette_score`` of the embeddings under those labels.
            "scores": ``silhouette_samples`` value for each document.
            "centroid": for each cluster centre, the index of the document
                whose embedding is closest to it (Euclidean distance).
    """
    cluster_model = BisectingKMeans(
        init='k-means++',
        n_clusters=n,
        n_init=10,
        max_iter=1000,
        random_state=random_state,
    )
    doc_embed = np.array(cached_embeddings.embed_documents(docs))
    cluster_model.fit(doc_embed)
    labels = cluster_model.labels_
    # Find the nearest document one centre at a time. This avoids allocating a
    # (n_docs, n_clusters, embedding_dim) array of pairwise differences.
    nearest_doc = np.array([
        np.linalg.norm(doc_embed - center, axis=1).argmin()
        for center in cluster_model.cluster_centers_
    ])
    return {
        "label": labels,
        "score": silhouette_score(doc_embed, labels),
        "scores": silhouette_samples(doc_embed, labels),
        "centroid": nearest_doc,
    }

In [4]:
# Create topics by clustering
import pandas as pd
docs = pd.DataFrame(data['docs'])
result = cluster(docs['para'].tolist(), n=25)
docs['cluster'] = result['label']
docs['score'] = result['scores']
# For each cluster, keep the 3 paragraphs with the highest per-document
# silhouette score as that cluster's sample. The columns are selected explicitly
# so `apply` does not operate on the grouping column, which newer pandas
# versions deprecate.
clusters = (
    docs.groupby('cluster')[['para', 'score']]
    .apply(lambda group: group.nlargest(3, 'score')['para'].tolist())
    .tolist()
)



In [6]:
import json
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

# Chat model used for all naming/grouping requests; temperature 0 is requested
# to keep sampling randomness to a minimum.
chat_model = ChatOpenAI(model='gpt-4-1106-preview', temperature=0)
# Prompt: instructions followed by each cluster's sample paragraphs as JSON.
subtopic_prompt = f'''Here are clusters of paragraphs from The Art of War.
Suggest 2-4 word topic names for each cluster.
Return a JSON string array of length {len(clusters)}.

{json.dumps(clusters, indent=2)}'''
messages = [HumanMessage(content=subtopic_prompt)]

In [7]:
# Send the cluster-naming prompt held in `messages` to the chat model and keep
# the reply object; its `.content` text is parsed in a later cell.
subtopic_response = chat_model.invoke(messages)

In [11]:
# Extract the JSON payload from the reply: prefer a ```json ... ``` fence, then
# any plain ``` ... ``` fence, and finally treat the whole reply as JSON.
import re

match = (
    re.search(r'```json(.*?)```', subtopic_response.content, re.DOTALL)
    or re.search(r'```(.*?)```', subtopic_response.content, re.DOTALL)
)
subtopics = json.loads(match.group(1) if match else subtopic_response.content)
subtopics

['Uncelebrated Victory',
 'Strategic Calculations',
 'Effortless Conquest',
 'Military Foundations',
 "General's Conduct",
 'Leadership Failures',
 'Cost of Warfare',
 'Discipline and Punishment',
 'Deception and Intelligence',
 'Espionage Tactics',
 'Infinite Combinations',
 'Rejecting Superstition',
 'Terrain Strategies',
 'Types of Ground',
 'Fire Attack Timing',
 'Stealth and Speed',
 'River Warfare',
 'Logistics and Strategy',
 'Desperate Troops',
 'Marching Dynamics',
 'Territorial Strategy',
 'Momentum of Force',
 'Psychological Warfare',
 'Offense and Defense',
 'Partial Knowledge']

In [12]:
# Ask the model to merge the subtopic names into higher-level groups.
# The requested number of groups is the square root of the subtopic count,
# truncated to an integer.
size = int(len(subtopics) ** 0.5)
topic_prompt = f'''Cluster these topics into {size} groups.
Return a JSON object with keys as a 2-4 word group name and values as arrays of topics.

{json.dumps(subtopics, indent=2)}'''
messages = [HumanMessage(content=topic_prompt)]

In [13]:
# Send the topic-grouping prompt held in `messages` to the chat model and keep
# the reply object; its `.content` text is parsed in a later cell.
topic_response = chat_model.invoke(messages)

In [14]:
# Extract the JSON payload from the reply: prefer a ```json ... ``` fence, then
# any plain ``` ... ``` fence, and finally treat the whole reply as JSON.
match = (
    re.search(r'```json(.*?)```', topic_response.content, re.DOTALL)
    or re.search(r'```(.*?)```', topic_response.content, re.DOTALL)
)
topics = json.loads(match.group(1) if match else topic_response.content)
topics

{'Warfare Strategies': ['Strategic Calculations',
  'Effortless Conquest',
  'Terrain Strategies',
  'Types of Ground',
  'Fire Attack Timing',
  'River Warfare',
  'Territorial Strategy',
  'Momentum of Force',
  'Offense and Defense'],
 'Military Tactics': ['Deception and Intelligence',
  'Espionage Tactics',
  'Stealth and Speed',
  'Psychological Warfare'],
 'Leadership and Command': ["General's Conduct",
  'Leadership Failures',
  'Discipline and Punishment',
  'Marching Dynamics'],
 'Logistics and Preparation': ['Military Foundations',
  'Logistics and Strategy',
  'Desperate Troops'],
 'Intellectual Approaches': ['Uncelebrated Victory',
  'Infinite Combinations',
  'Rejecting Superstition',
  'Partial Knowledge',
  'Cost of Warfare']}

In [15]:
# Flatten the {group name: [subtopic, ...]} mapping into one
# {'topic', 'subtopic'} record per subtopic, in the mapping's own order.
data['topics'] = [
    dict(topic=group_name, subtopic=member)
    for group_name, members in topics.items()
    for member in members
]

In [16]:
# Score every paragraph against every subtopic, then record each pair whose
# similarity exceeds the threshold as a {doc, topic, similarity} entry.
# 'doc' and 'topic' are positions in data['docs'] and data['topics'].
min_similarity = 0.75
matches = data['matches'] = []
similarity = classify(
    [entry['para'] for entry in data['docs']],
    [entry['subtopic'] for entry in data['topics']],
)
# argwhere walks the matrix in row-major order, matching a nested row/column
# scan; .tolist() turns the indices into plain Python ints.
for row, col in np.argwhere(similarity > min_similarity).tolist():
    matches.append({'doc': row, 'topic': col, 'similarity': similarity[row][col]})

In [17]:
# Save as docexplore.json
# Serialize before opening: mode "w" truncates the file as soon as it is opened,
# so a serialization error raised inside the `with` block would leave the file
# (including its hand-written "docs" section) empty.
payload = json.dumps(data, indent=2)
with open("docexplore.json", "w") as handle:
    handle.write(payload)