In [1]:
# The "docs" section of data.json was created by hand from
# https://classics.mit.edu/Tzu/artwar.html. Load it as the starting point.

import json

# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's locale encoding (which can mis-read non-ASCII text).
with open("data.json", encoding="utf-8") as handle:
    data = json.load(handle)

In [3]:
# Set up langchain embeddings
import os
from langchain.embeddings.cache import CacheBackedEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.storage.file_system import LocalFileStore
from gramex.config import variables

# Wrap the OpenAI embedder in a file-backed cache stored in the
# "langchain-embeddings" folder under GRAMEXDATA, namespaced by the model name.
file_store = LocalFileStore(
    os.path.join(variables['GRAMEXDATA'], 'langchain-embeddings')
)
base = OpenAIEmbeddings()
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    base,
    file_store,
    namespace=base.model,
)

In [4]:
# Define classify() and cluster() functions
import numpy as np
from typing import List
from sklearn.cluster import BisectingKMeans
from sklearn.metrics import silhouette_score, silhouette_samples


def classify(docs: List[str], topics: List[str], **kwargs):
    """Score every document against every topic using embedding dot products.

    Args:
        docs: Texts to score.
        topics: Topic labels to score the texts against.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        A NumPy array in which element ``[i, j]`` is the dot product of the
        embedding of ``docs[i]`` with the embedding of ``topics[j]``.
    """
    embed = cached_embeddings.embed_documents
    # Documents are embedded first, then topics.
    doc_vectors = np.array(embed(docs))
    topic_vectors = np.array(embed(topics))
    return doc_vectors.dot(topic_vectors.T)


def cluster(docs: List[str], n: int = 20, random_state=None, **kwargs):
    """Group documents into ``n`` clusters based on their embeddings.

    Args:
        docs: Texts to cluster.
        n: Number of clusters to create.
        random_state: Optional seed forwarded to ``BisectingKMeans``. Pass an
            integer to make the cluster assignment repeatable across runs;
            the default ``None`` keeps the previous unseeded behaviour.
        **kwargs: Accepted for call compatibility; not used.

    Returns:
        A dict with:
            ``label``: cluster label for each document.
            ``score``: overall silhouette score of the clustering.
            ``scores``: per-document silhouette scores.
            ``centroid``: for each cluster, the index of the document whose
                embedding is closest to that cluster's center.

    Notes:
        The silhouette metrics are computed by scikit-learn, which rejects
        clusterings with fewer than 2 distinct labels or with as many labels
        as documents.
    """
    # Cluster the documents
    cluster_model = BisectingKMeans(
        init='k-means++',
        n_clusters=n,
        n_init=10,
        max_iter=1000,
        random_state=random_state,
    )
    doc_embed = np.array(cached_embeddings.embed_documents(docs))
    cluster_model.fit(doc_embed)
    labels = cluster_model.labels_
    # Find the document nearest to each centroid. Working one centroid at a
    # time avoids materialising a (docs x clusters x dimensions) temporary.
    nearest_doc = np.array([
        np.argmin(np.linalg.norm(doc_embed - center, axis=1))
        for center in cluster_model.cluster_centers_
    ])
    return {
        "label": labels,
        "score": silhouette_score(doc_embed, labels),
        "scores": silhouette_samples(doc_embed, labels),
        "centroid": nearest_doc,
    }

In [10]:
# Create topics by clustering
import pandas as pd

docs = pd.DataFrame(data['docs'])
result = cluster(docs['para'].tolist(), n=25)
docs['cluster'] = result['label']
docs['score'] = result['scores']
# For every cluster (in ascending label order) keep the 3 paragraphs with the
# highest silhouette score as that cluster's representative examples.
# The groups are iterated directly rather than via DataFrameGroupBy.apply,
# whose handling of the grouping column is deprecated in recent pandas.
clusters = [
    group.nlargest(3, 'score')['para'].tolist()
    for _, group in docs.groupby('cluster')
]



In [12]:
import json
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

chat_model = ChatOpenAI(model='gpt-4-1106-preview', temperature=0)

# Prompt asking for one short topic name per cluster; the representative
# paragraphs of every cluster are embedded in the prompt as JSON.
cluster_naming_prompt = f'''Here are clusters of paragraphs from The Art of War.
Suggest 2-4 word topic names for each cluster.
Return a JSON string array of length {len(clusters)}.

{json.dumps(clusters, indent=2)}'''
messages = [HumanMessage(content=cluster_naming_prompt)]

In [13]:
# Send the cluster-naming prompt held in `messages` to the chat model and keep
# the raw reply; its `.content` is parsed as JSON in the following cell.
subtopic_response = chat_model.invoke(messages)

In [14]:
# Extract the JSON payload from the reply.
# The model may wrap it in a Markdown code fence, so accept ```json ... ```
# (any letter case) as well as an untagged ``` ... ``` fence. When there is no
# fence at all, parse the whole reply as JSON.
import re

match = re.search(
    r'```(?:json)?\s*(.*?)```',
    subtopic_response.content,
    re.DOTALL | re.IGNORECASE,
)
subtopics = json.loads(match.group(1) if match else subtopic_response.content)
subtopics

['Art of Maneuvering',
 'Using the Conquered',
 'Excellence in Victory',
 'Decisive Combat Strategy',
 'Knowledge of Enemy',
 'Importance of War',
 'Art of Command',
 'Ruler and General',
 "General's Responsibility",
 'Costs of War',
 'Utilizing Spies',
 'Infinite Combinations',
 'Principles of Warfare',
 'Leadership Qualities',
 'Adapting to Circumstances',
 'Foraging and Spoils',
 'Marching and Speed',
 'Signs of Distress',
 'Strategic Positions',
 'Engagement Strategies',
 'Invasion Principles',
 'Types of Ground',
 'Weather and Warfare',
 'River Crossings',
 'Attacking with Fire']

In [15]:
# Create higher-level topic groups.
# The number of groups requested is the square root of the subtopic count,
# rounded down.
size = int(len(subtopics) ** 0.5)
grouping_prompt = f'''Cluster these topics into {size} groups.
Return a JSON object with keys as a 2-4 word group name and values as arrays of topics.

{json.dumps(subtopics, indent=2)}'''
messages = [HumanMessage(content=grouping_prompt)]

In [16]:
# Send the topic-grouping prompt (`messages` was reassigned in the previous
# cell) to the chat model and keep the raw reply; its `.content` is parsed as
# JSON in the following cell.
topic_response = chat_model.invoke(messages)

In [17]:
# Extract the JSON payload from the reply.
# Accept ```json ... ``` (any letter case) as well as an untagged ``` ... ```
# fence; when there is no fence at all, parse the whole reply as JSON.
match = re.search(
    r'```(?:json)?\s*(.*?)```',
    topic_response.content,
    re.DOTALL | re.IGNORECASE,
)
topics = json.loads(match.group(1) if match else topic_response.content)
topics

{'Military Strategy': ['Art of Maneuvering',
  'Decisive Combat Strategy',
  'Engagement Strategies',
  'Invasion Principles',
  'Strategic Positions'],
 'Leadership and Command': ['Art of Command',
  'Ruler and General',
  "General's Responsibility",
  'Leadership Qualities',
  'Principles of Warfare'],
 'Intelligence and Espionage': ['Using the Conquered',
  'Knowledge of Enemy',
  'Utilizing Spies',
  'Signs of Distress',
  'Infinite Combinations'],
 'Logistics and Environment': ['Foraging and Spoils',
  'Marching and Speed',
  'Types of Ground',
  'Weather and Warfare',
  'River Crossings'],
 'Tactics and Adaptation': ['Excellence in Victory',
  'Importance of War',
  'Adapting to Circumstances',
  'Costs of War',
  'Attacking with Fire']}

In [18]:
# Flatten the {group name: [subtopic, ...]} mapping into one record per
# subtopic, keeping the mapping's order.
data['topics'] = [
    dict(topic=group_name, subtopic=member)
    for group_name, members in topics.items()
    for member in members
]

In [21]:
# Walk every (document, subtopic) cell of the similarity matrix and record a
# {doc, topic, similarity} entry for each score above the threshold.
min_similarity = 0.75
matches = data['matches'] = []
similarity = classify(
    [doc['para'] for doc in data['docs']],
    [entry['subtopic'] for entry in data['topics']],
)
# 'doc' and 'topic' are positions in data['docs'] and data['topics'].
matches.extend(
    {'doc': doc_index, 'topic': topic_index, 'similarity': score}
    for doc_index, doc_scores in enumerate(similarity)
    for topic_index, score in enumerate(doc_scores)
    if score > min_similarity
)

In [23]:
# Save as data.json
# Serialize before opening the file: mode "w" truncates data.json as soon as it
# is opened, so a serialization error raised afterwards would wipe the existing
# file (including its hand-made "docs" section).
serialized = json.dumps(data, indent=2)
with open("data.json", "w") as handle:
    handle.write(serialized)