In [24]:
import pandas as pd
import ast

In [25]:
# Load the documents together with their generated questions.
documents = pd.read_csv('data/satoshi_with_questions.csv')

documents.head()

Unnamed: 0,content,chunk_index,collection,content_type,filename,sequence_number,source,source_file,title,total_chunks,useless_to_recall,questions
0,"Emails\nBitcoin P2P e-cash paperOct 31, 2008, ...",0,emails,email,email_002_Cryptography_Mailing_List,2,nakamotoinstitute_files\emails\email_002_Crypt...,nakamotoinstitute_files\emails\email_002_Crypt...,Cryptography Mailing List,1,False,"[""What is mentioned with regards to Bitcoin's ..."
1,Emails\n[p2p-research] Bitcoin open source imp...,0,emails,email,email_004_P2P_Research_List,4,nakamotoinstitute_files\emails\email_004_P2P_R...,nakamotoinstitute_files\emails\email_004_P2P_R...,P2P Research List,1,False,"[""What was the release date of Bitcoin's open ..."
2,From: | Satoshi Nakamoto | Subject: | Bitcoin ...,0,emails,email,email_010_Bitcoin_P2P_e-cash_paper,10,nakamotoinstitute_files\emails\email_010_Bitco...,nakamotoinstitute_files\emails\email_010_Bitco...,Bitcoin P2P e-cash paper,1,False,"[""What is Satoshi Nakamoto's view on using cry..."
3,From: | Satoshi Nakamoto | Subject: | [bitcoin...,0,emails,email,email_022_[bitcoin-list]_Welcome,22,nakamotoinstitute_files\emails\email_022_[bitc...,nakamotoinstitute_files\emails\email_022_[bitc...,[bitcoin-list] Welcome,1,False,['Was Bitcoin introduced as a form of payment?...
4,From: | Satoshi Nakamoto | Subject: | [bitcoin...,0,emails,email,email_024_[bitcoin-list]_Bitcoin_v0.1.2_now_av...,24,nakamotoinstitute_files\emails\email_024_[bitc...,nakamotoinstitute_files\emails\email_024_[bitc...,[bitcoin-list] Bitcoin v0.1.2 now available,1,False,"[""Was the development of Bitcoin affected by t..."


In [9]:
documents.questions.iloc[0][0]

'['

In [None]:
# Common problem I had in the GraphViewer project as well: once a list of strings is stored in a CSV, it is read back as one plain string.
# So we need to parse it back into a list of strings.

ast.literal_eval(documents.questions.iloc[0])

["What is mentioned with regards to Bitcoin's version number in these emails?",
 'Was Bitcoin v0.1 released on Jan 8, 2009?',
 'When was the v0.1 release of Bitcoin?']

In [26]:
# Helper function from Grok 
# there were some malformed strings that cause regular literal_eval to fail.
def safe_literal_eval(s):
    """Parse a stringified list (as read back from a CSV) into a real list.

    Parsing is attempted in this order, stopping at the first success:

    1. ``ast.literal_eval`` on the raw text.
    2. ``ast.literal_eval`` after replacing double quotes with single quotes.
    3. ``json.loads`` on the raw text.

    If every attempt fails, a warning is printed and the raw text is returned
    as a single-item list.

    Args:
        s: Normally a string such as ``"['q1', 'q2']"``. A list or tuple is
            passed through unchanged (as a list). ``None`` and float NaN
            yield an empty list instead of raising. Any other non-string
            value is converted with ``str()`` before parsing.

    Returns:
        list: Always a list, so callers can safely ``extend`` with the result.
        A parsed value that is not a list/tuple (for example a bare quoted
        string) is wrapped in a one-element list.
    """
    import json  # local import, as in the original helper

    # Missing values carry no questions. NaN is the only float that is != itself.
    if s is None or (isinstance(s, float) and s != s):
        return []
    # Already-parsed containers are returned as-is instead of being re-parsed.
    if isinstance(s, (list, tuple)):
        return list(s)
    if not isinstance(s, str):
        s = str(s)

    def _as_list(value):
        # Callers extend() with the result; a bare string would be split into characters.
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    # Everything ast.literal_eval is documented to raise on bad or oversized input.
    recoverable = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    try:
        return _as_list(ast.literal_eval(s))
    except recoverable:
        pass

    try:
        # Replace double quotes with single quotes for consistency
        fixed_s = s.replace('""', "'").replace('"', "'")
        return _as_list(ast.literal_eval(fixed_s))
    except recoverable:
        pass

    try:
        # json.JSONDecodeError is a subclass of ValueError.
        return _as_list(json.loads(s))
    except (ValueError, RecursionError):
        pass

    # Last resort: return the string as a single-item list
    print(f"Warning: Could not parse: {s[:50]}...")
    return [str(s)]

documents['questions'] = documents['questions'].apply(safe_literal_eval)

  Invali...
  Invali...


## Manually inspect exact duplicates

In [30]:
# Flatten every document's questions into one list and remember, for each
# question, the positional row number of the document it came from.
all_questions = []
all_questions_index = []  # positional row index of the source document, one entry per question
for i, question_entry in enumerate(documents['questions']):
    # After the column has been parsed, each entry is already a list and is used
    # directly. Round-tripping it through str() and the parser again would only
    # repeat the work and re-print the parse warnings.
    # Anything that is still not a list is parsed here; otherwise `extend()`
    # would iterate over the characters of a string.
    if isinstance(question_entry, list):
        parsed_questions = question_entry
    else:
        parsed_questions = safe_literal_eval(str(question_entry))
    all_questions.extend(parsed_questions)
    # For each question added, record its original row index
    all_questions_index.extend([i] * len(parsed_questions))


# Calculate statistics
total_questions = len(all_questions)
unique_questions = len(set(all_questions))
duplicate_count = total_questions - unique_questions

print(f"Total questions: {total_questions}")
print(f"Unique questions: {unique_questions}")
print(f"Duplicate questions: {duplicate_count}") # "Hardcore" duplicates 


  Invali...
  Invali...
Total questions: 3601
Unique questions: 3573
Duplicate questions: 28


In [32]:
# Build a dataframe of all questions and their originating row index
df_q = pd.DataFrame({
    'question': all_questions,
    'doc_index': all_questions_index
})

# Keep all rows whose question appears more than once
duplicate_questions_df = df_q[df_q.duplicated('question', keep=False)].copy()

# Fetch the corresponding content using positional index (matches enumerate)
duplicate_questions_df['content'] = duplicate_questions_df['doc_index'].apply(lambda i: documents.iloc[i]['content'])

# Optional: sort for easier review
duplicate_questions_df = duplicate_questions_df.sort_values(['question', 'doc_index']).reset_index(drop=True)

duplicate_questions_df

Unnamed: 0,question,doc_index,content
0,,206,"Right, that is quite a bit better.Can you give..."
1,,835,"CPU power gangs up on one.Â Instead, all netw..."
2,Did Satoshi Nakamoto discuss peer-to-peer netw...,115,"davidonpda, were you also running laszlo's bui..."
3,Did Satoshi Nakamoto discuss peer-to-peer netw...,797,Code:diff -u old\main.cpp new\main.cpp--- old\...
4,What did Satoshi Nakamoto say about micropayme...,545,". We have to trust them with\nour privacy, tr..."
5,What did Satoshi Nakamoto say about micropayme...,727,Forgot to add the good part about micropayment...
6,What did Satoshi Nakamoto say about peer-to-pe...,17,Could be. They're talking about the old Chaumi...
7,What did Satoshi Nakamoto say about peer-to-pe...,156,The design outlines a lightweight client that ...
8,What did Satoshi Nakamoto say about peer-to-pe...,212,Gavin's changes look good.Â I think everythin...
9,What did Satoshi Nakamoto say about peer-to-pe...,439,Bitcoin Design\nWe have proposed a system for ...


## Using Embeddings and Cosine Similarity

In [2]:
from dotenv import load_dotenv
load_dotenv()

# from langfuse.openai import AsyncOpenAI
from openai import AsyncOpenAI
from langfuse import observe

from diskcache_decorator import cached

In [3]:
client = AsyncOpenAI(
    base_url="http://localhost:11434/v1",  # Ollama's default local endpoint
    api_key="ollama"  # Dummy key, required but not used by Ollama
)

EMBED_MODEL = "mxbai-embed-large:latest"

In [4]:
# test call

# note that the main difference is that we provide "input" instead of "message"
embedding = await client.embeddings.create(
    input="Hello, world!",
    model=EMBED_MODEL,
)

embedding.data[0].embedding[0]  # first number only, to avoid spamming the screen

0.017286297

In [5]:
# @observe(as_type="generation")
@cached()
async def make_cached_traced_embedding(input):
    """Return the embedding vector for `input`.

    Sends `input` to `client.embeddings.create` with `EMBED_MODEL` and returns
    the `embedding` of the first item in the response. The coroutine is
    wrapped by `@cached()`.
    """
    response = await client.embeddings.create(model=EMBED_MODEL, input=input)
    first_item = response.data[0]
    return first_item.embedding

In [6]:
embedding = await make_cached_traced_embedding("blubb")
embedding[0]

0.010233218

### Embed entire dataset

In [29]:
def make_dataset(id_document, id_question, question, embedding=None):
    """Build the record kept for one (document, question) pair.

    Args:
        id_document: Identifier of the document the question belongs to.
        id_question: Position of the question within that document's list.
        question: The question text.
        embedding: Embedding of the question, or None while not yet computed.

    Returns:
        dict: Keys ``id_document``, ``id_question``, ``question`` and
        ``embedding``, in that order.
    """
    return dict(
        id_document=id_document,
        id_question=id_question,
        question=question,
        embedding=embedding,
    )

questions_with_embeddings = []
index_for_questions_with_embeddings = {}

# Reminder: within each document, questions are a list of strings, we can index into it.
# Only the 'questions' column is needed, so iterate over it directly instead of
# building a full row Series with iterrows(). enumerate() yields the positional
# row number, the same document id the duplicate inspection uses with iloc.
for i, doc_questions in enumerate(documents['questions']):
    index_for_questions_with_embeddings[i] = {} # create a nested dict so we can access document/question pairs with double index.
    for q_idx, question in enumerate(doc_questions):
        _ds = make_dataset(i, q_idx, question)
        # The same dict object goes into both structures, so filling in the
        # embedding later through one of them is visible through the other.
        index_for_questions_with_embeddings[i][q_idx] = _ds
        questions_with_embeddings.append(_ds)

In [None]:
# Now we have two data structures:
# 1. questions_with_embeddings: a list of dicts, each dict contains the question and its embedding. easy to iterate over.
# 2. index_for_questions_with_embeddings: full information about a document. double indexing to access doc/question pair easily.

print(questions_with_embeddings[0])
print(index_for_questions_with_embeddings[0])

{'id_document': 0, 'id_question': 0, 'question': "What is mentioned with regards to Bitcoin's version number in these emails?", 'embedding': None}
{0: {'id_document': 0, 'id_question': 0, 'question': "What is mentioned with regards to Bitcoin's version number in these emails?", 'embedding': None}, 1: {'id_document': 0, 'id_question': 1, 'question': 'Was Bitcoin v0.1 released on Jan 8, 2009?', 'embedding': None}, 2: {'id_document': 0, 'id_question': 2, 'question': 'When was the v0.1 release of Bitcoin?', 'embedding': None}}


In [44]:
# See also notebook 5 where we used batching.
import traceback
import asyncio

concurrency_limit = 20  # tune as needed
sem = asyncio.Semaphore(concurrency_limit)

async def do_task(ds):
    try:
        async with sem:
            # mutating this slice will update the original questions_with_embeddings list.
            # The function then just returns none, which means it worked.
            ds['embedding'] = await make_cached_traced_embedding(ds['question'])
    except Exception as e:
        return (ds, e, traceback.format_exc())


tasks = [do_task(ds) for ds in questions_with_embeddings] # test with [:50] or so first
results = await asyncio.gather(*tasks)



In [45]:
import pickle

# Bundle both structures into one payload so a single dump writes them together.
data_to_save = dict(
    questions_with_embeddings=questions_with_embeddings,
    index_for_questions_with_embeddings=index_for_questions_with_embeddings,
)

with open('embeddings_data.pkl', mode='wb') as pickle_file:
    pickle.dump(data_to_save, pickle_file)

saved_count = len(questions_with_embeddings)
print("Both variables saved to single pickle file!")
print(f"Saved {saved_count} question embeddings")

Both variables saved to single pickle file!
Saved 3601 question embeddings


#### Reload data here:

In [None]:
import pickle

# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# file that this notebook wrote itself.
with open('embeddings_data.pkl', mode='rb') as pickle_file:
    loaded_data = pickle.load(pickle_file)

# Unpack the two structures stored under their original variable names.
questions_with_embeddings = loaded_data['questions_with_embeddings']
index_for_questions_with_embeddings = loaded_data['index_for_questions_with_embeddings']

loaded_count = len(questions_with_embeddings)
print("Both variables loaded from pickle file!")
print(f"Loaded {loaded_count} question embeddings")

{'id_document': 0,
 'id_question': 2,
 'question': 'When was the v0.1 release of Bitcoin?',
 'embedding': [0.009810594,
  -0.010785704,
  -0.010530103,
  0.02182247,
  -0.016318895,
  -0.052621063,
  0.032103185,
  -0.026895385,
  0.0055493205,
  0.022246564,
  0.0067950236,
  -0.024393613,
  0.019864906,
  -0.004051955,
  -0.02365089,
  0.0295254,
  -0.03307775,
  -0.011640217,
  -0.029709673,
  0.02107156,
  -0.0017250095,
  -0.0068630925,
  0.008539968,
  -0.014604566,
  -0.0042902855,
  0.056358755,
  0.0040189386,
  0.00027186752,
  0.069573246,
  0.07695853,
  0.0008748637,
  -0.042743254,
  0.022090467,
  -0.023286674,
  -0.023128554,
  -0.014663315,
  -0.006586722,
  -0.028760886,
  0.0029978275,
  -0.0393452,
  0.021167805,
  -0.017185377,
  0.019262424,
  -0.0064069727,
  -0.09313191,
  -0.004496941,
  0.0015789095,
  -0.010053242,
  -0.0030614445,
  -0.020777797,
  -0.031840846,
  -0.009610427,
  0.03971802,
  -0.007276359,
  -0.0058126245,
  -0.025518332,
  -0.022315238,
  

### Calculate Cosine Similarity

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [21]:
def get_cosine_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding vectors as a single number.

    scikit-learn functions usually expect a 2D array of shape (1, n) rather
    than (n,), so each embedding is converted to a numpy array and reshaped
    with ``reshape(1, -1)``: one row, with the number of columns inferred from
    the array length, e.g. ``[1, 2, 3] -> [[1, 2, 3]]``.

    ``cosine_similarity`` then returns a matrix of pairwise scores; with one
    row on each side, its only entry is the value returned here.
    """
    row1 = np.array(embedding1).reshape(1, -1)
    row2 = np.array(embedding2).reshape(1, -1)
    similarity_matrix = cosine_similarity(row1, row2)
    return similarity_matrix[0][0]

In [22]:
get_cosine_similarity(embedding, embedding)

np.float64(0.9999999999999993)

We already checked for exact duplicates when parsing the questions. This is just another way of doing it...

In [None]:
from operator import itemgetter

def find_exact_duplicates(questions_with_embeddings):
    """Return the set of question texts that occur more than once.

    Args:
        questions_with_embeddings: Iterable of records that each have a
            'question' key.

    Returns:
        set: Every question text seen at least twice (each text listed once).
    """
    question_of = itemgetter('question')
    first_seen, repeated = set(), set()
    for text in map(question_of, questions_with_embeddings):
        # A text goes to `first_seen` on its first occurrence and to `repeated` afterwards.
        bucket = repeated if text in first_seen else first_seen
        bucket.add(text)
    return repeated

"""
version above with itemgetter is just faster and fancier way of doing this:

def find_exact_duplicates(questions_with_embeddings):
    seen, dups = set(), set()
    for x in questions_with_embeddings:
        q = x['question']
        if q in seen:
            dups.add(q)
        else:
            seen.add(q)
    return dups
"""

exact_duplicates = find_exact_duplicates(questions_with_embeddings)
exact_duplicates

{'',
 'Did Satoshi Nakamoto discuss peer-to-peer networks in this email?',
 'What did Satoshi Nakamoto say about micropayments?',
 'What did Satoshi Nakamoto say about peer-to-peer networks?',
 "What is Satoshi Nakamoto's view on peer-to-peer networks?",
 "What is Satoshi's view on peer-to-peer networks?",
 "What was Satoshi Nakamoto's view on peer-to-peer networks?",
 "What was Satoshi's view on peer-to-peer networks?",
 ']'}

### Next: Within each document, find questions that are roughly the same.

In [None]:
# Just an experiment, not used, but to illustrate we want these combinations.
for doc_id in index_for_questions_with_embeddings:
    for question_id in index_for_questions_with_embeddings[doc_id]:
        print(doc_id, question_id)
    break

0 0
0 1
0 2


The loops below may look a bit confusing at first. Try printing individual elements of these data structures; it then becomes clear what we are iterating over.

In [None]:
from itertools import combinations

question_combinations_per_document = {}
question_combinations_per_document_similarity = {}

for doc_id, questions_by_id in index_for_questions_with_embeddings.items():
    # combinations(..., 2) yields every unordered pair of question ids in this
    # document (iterating a dict iterates its keys).
    id_pairs = list(combinations(questions_by_id, 2))
    question_combinations_per_document[doc_id] = id_pairs

    # Same order as id_pairs, so the two lists can be zipped together later.
    question_combinations_per_document_similarity[doc_id] = [
        get_cosine_similarity(
            questions_by_id[first_id]['embedding'],
            questions_by_id[second_id]['embedding'],
        )
        for first_id, second_id in id_pairs
    ]


In [60]:
for (q1, q2), sim in zip(question_combinations_per_document[0], question_combinations_per_document_similarity[0]):
    print(q1, q2, sim)

0 1 0.6864653957328704
0 2 0.7297580536314496
1 2 0.8621906991184569


In [None]:
from pprint import pprint  # simply puts each input on a new line. (and manages indentation or line breaks)

# Has some problems when there is only one or no questions...?
# for doc_id in question_combinations_per_document_similarity:
#     for (q1, q2), similarity_score in zip(question_combinations_per_document[0], question_combinations_per_document_similarity[0]):
#         # Above gives e.g.: 0 1 0.6864653957328704 \n  0 2 0.7297580536314496  \n  1 2 0.8621906991184569
#         # Now also get the corresponding questions for review.
#         q1_question = index_for_questions_with_embeddings[doc_id][q1]['question']
#         q2_question = index_for_questions_with_embeddings[doc_id][q2]['question']
#         if similarity_score > 0.8:
#             # print(similarity_score, q1_question, q2_question)
#             pprint((f"{similarity_score:.2f}", q1_question, q2_question)) # pprint expects a single argument. thus the double parentheses.
#             print() # to add a line break after each entry.



SIMILARITY_THRESHOLD = 0.85  # pairs scoring above this are printed for manual review

for doc_id, sims in question_combinations_per_document_similarity.items():
    combos = question_combinations_per_document[doc_id]
    # The per-document lookup does not change between pairs, so do it once per document.
    doc_idx = index_for_questions_with_embeddings[doc_id]

    for (q1, q2), similarity_score in zip(combos, sims):
        # Above gives e.g.: 0 1 0.6864653957328704 \n  0 2 0.7297580536314496  \n  1 2 0.8621906991184569
        # Now also get the corresponding questions for review.
        # optional guard if keys might be missing
        if q1 not in doc_idx or q2 not in doc_idx:
            continue
        q1_question = doc_idx[q1]['question']
        q2_question = doc_idx[q2]['question']
        # we already looked at exact duplicates. to keep it shorter here.
        if similarity_score > SIMILARITY_THRESHOLD and q1_question != q2_question:
            pprint((f"{similarity_score:.2f}", q1_question, q2_question)) # pprint expects a single argument. thus the double parentheses.
            print() # to add a line break after each entry.


('0.86',
 'Was Bitcoin v0.1 released on Jan 8, 2009?',
 'When was the v0.1 release of Bitcoin?')

('0.85',
 "How did Satoshi Nakamoto protect users' IP addresses when using the Tor "
 'proxy in Bitcoin version 0.2?',
 "When was Satoshi's TOR proxy feature incorporated into Bitcoin version 0.2?")

('0.87',
 'Who asked Satoshi Nakamoto for help compiling and installing Bitcoin on '
 'Linux?',
 "How did the community respond to Satoshi's request for instructions on "
 'compiling and installing Bitcoin on Linux?')

('0.86',
 'What is the Linux version of Bitcoin scheduled to be released in?',
 'Is version 0.2 of Bitcoin the next release with a functional Linux port?')

('0.90',
 'What options did users of wxWidgets use to build or configure the software?',
 'Can you explain some of the options used for building or configuring '
 'wxWidgets?')

('0.86',
 'Was malloc.h necessary to include before committing changes to headers.h for '
 'Satoshi?',
 'At what point did Satoshi commit the change

In [57]:
question_combinations_per_document_similarity

{0: [np.float64(0.6864653957328704),
  np.float64(0.7297580536314496),
  np.float64(0.8621906991184569)],
 1: [np.float64(0.7372945700109843),
  np.float64(0.7226673041572994),
  np.float64(0.6691826795416889)],
 2: [np.float64(0.6252358060466829),
  np.float64(0.7535434769399083),
  np.float64(0.6033944987985799)],
 3: [np.float64(0.5412407430729286),
  np.float64(0.6323084044868046),
  np.float64(0.7095599369844926),
  np.float64(0.4661963160794194),
  np.float64(0.5927520720184083),
  np.float64(0.6115515713998052),
  np.float64(0.6771574094335887),
  np.float64(0.6965797505867608),
  np.float64(0.4848398514987524),
  np.float64(0.5736278494777782)],
 4: [np.float64(0.7530179182813821),
  np.float64(0.5949991175785313),
  np.float64(0.5752895572678625),
  np.float64(0.6373704021917608),
  np.float64(0.677849364565931),
  np.float64(0.604115347625484),
  np.float64(0.7091644505872232),
  np.float64(0.6774494789736804),
  np.float64(0.5780927371619429),
  np.float64(0.5555108873281875