In [6]:
import pandas as pd
import ast

In [17]:
# Load the documents table (including its `questions` column) from CSV.
documents = pd.read_csv('data/satoshi_with_questions.csv')

documents.head()

Unnamed: 0,content,chunk_index,collection,content_type,filename,sequence_number,source,source_file,title,total_chunks,useless_to_recall,questions
0,"Emails\nBitcoin P2P e-cash paperOct 31, 2008, ...",0,emails,email,email_002_Cryptography_Mailing_List,2,nakamotoinstitute_files\emails\email_002_Crypt...,nakamotoinstitute_files\emails\email_002_Crypt...,Cryptography Mailing List,1,False,"[""What is mentioned with regards to Bitcoin's ..."
1,Emails\n[p2p-research] Bitcoin open source imp...,0,emails,email,email_004_P2P_Research_List,4,nakamotoinstitute_files\emails\email_004_P2P_R...,nakamotoinstitute_files\emails\email_004_P2P_R...,P2P Research List,1,False,"[""What was the release date of Bitcoin's open ..."
2,From: | Satoshi Nakamoto | Subject: | Bitcoin ...,0,emails,email,email_010_Bitcoin_P2P_e-cash_paper,10,nakamotoinstitute_files\emails\email_010_Bitco...,nakamotoinstitute_files\emails\email_010_Bitco...,Bitcoin P2P e-cash paper,1,False,"[""What is Satoshi Nakamoto's view on using cry..."
3,From: | Satoshi Nakamoto | Subject: | [bitcoin...,0,emails,email,email_022_[bitcoin-list]_Welcome,22,nakamotoinstitute_files\emails\email_022_[bitc...,nakamotoinstitute_files\emails\email_022_[bitc...,[bitcoin-list] Welcome,1,False,['Was Bitcoin introduced as a form of payment?...
4,From: | Satoshi Nakamoto | Subject: | [bitcoin...,0,emails,email,email_024_[bitcoin-list]_Bitcoin_v0.1.2_now_av...,24,nakamotoinstitute_files\emails\email_024_[bitc...,nakamotoinstitute_files\emails\email_024_[bitc...,[bitcoin-list] Bitcoin v0.1.2 now available,1,False,"[""Was the development of Bitcoin affected by t..."


In [9]:
# Peek at the first character of the first row's `questions` value.
# Getting '[' back means the cell holds the text of a list, not a list object.
documents.questions.iloc[0][0]

'['

In [None]:
# Common problem I had in the GraphViewer project as well: once the list of strings
# was stored as CSV, it came back as a single string.
# So we need to parse it back into a list of strings.

ast.literal_eval(documents.questions.iloc[0])

["What is mentioned with regards to Bitcoin's version number in these emails?",
 'Was Bitcoin v0.1 released on Jan 8, 2009?',
 'When was the v0.1 release of Bitcoin?']

In [None]:
# Helper function from Grok 
# there were some malformed strings that cause regular literal_eval to fail.
def safe_literal_eval(s):
    """Parse the string form of a Python (or JSON) literal, with fallbacks.

    Attempts, in order:
      1. ``ast.literal_eval`` on the string as given.
      2. ``ast.literal_eval`` after normalising double quotes to single quotes.
      3. ``json.loads`` on the string as given.
      4. Give up: print a warning and wrap the raw text in a one-item list.

    Non-string inputs are handled without raising:
      * ``None`` and float NaN (a missing cell) yield an empty list;
      * a ``list`` is returned unchanged, so re-applying the function to an
        already-parsed column is a no-op;
      * anything else is reported and wrapped as ``[str(s)]``.

    Args:
        s: The value to parse, normally the text of a list literal.

    Returns:
        The parsed literal (whatever type the text denotes), or a list as
        described above when parsing is impossible.
    """
    import json  # local import keeps the helper self-contained in this cell

    # Missing values: None, or NaN (the only float that is not equal to itself).
    if s is None or (isinstance(s, float) and s != s):
        return []
    # Already parsed, e.g. the cell was executed a second time.
    if isinstance(s, list):
        return s
    if not isinstance(s, str):
        print(f"Warning: Could not parse: {str(s)[:50]}...")
        return [str(s)]

    # Everything ast.literal_eval is documented to raise on malformed input.
    literal_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    try:
        return ast.literal_eval(s)
    except literal_errors:
        pass

    # Try to fix common issues: replace double quotes with single quotes
    # for consistency (this rescues entries with mismatched quote styles).
    try:
        fixed_s = s.replace('""', "'").replace('"', "'")
        return ast.literal_eval(fixed_s)
    except literal_errors:
        pass

    # The text may be valid JSON (null/true/false are not Python literals).
    try:
        return json.loads(s)
    except (ValueError, RecursionError):  # JSONDecodeError subclasses ValueError
        pass

    # Last resort: return the string as a single-item list.
    print(f"Warning: Could not parse: {s[:50]}...")
    return [str(s)]

# Overwrite the `questions` column in place with the parsed value of each entry.
documents['questions'] = documents['questions'].apply(safe_literal_eval)

  Invali...
  Invali...


## Manually inspect exact duplicates

In [30]:
# documents['questions'] = documents['questions'].apply(ast.literal_eval)


# Flatten the per-document question lists into two parallel lists:
#   all_questions[k]       -> the question text
#   all_questions_index[k] -> positional row index of the document it came from
all_questions = []
all_questions_index = []
for row_pos, raw_entry in enumerate(documents['questions']):
    # Entries may be real lists or their string representation. Going through
    # str() and safe_literal_eval treats both the same way and avoids
    # extending by the individual characters of an unparsed string.
    row_questions = safe_literal_eval(str(raw_entry))
    for question in row_questions:
        all_questions.append(question)
        all_questions_index.append(row_pos)


# Summary statistics: anything beyond the distinct set is an exact duplicate.
total_questions = len(all_questions)
unique_questions = len(set(all_questions))
duplicate_count = total_questions - unique_questions

print(f"Total questions: {total_questions}")
print(f"Unique questions: {unique_questions}")
print(f"Duplicate questions: {duplicate_count}") # "Hardcore" duplicates 


  Invali...
  Invali...
Total questions: 3601
Unique questions: 3573
Duplicate questions: 28


In [32]:
# One row per question, tagged with the positional index of its source document.
df_q = pd.DataFrame({'question': all_questions, 'doc_index': all_questions_index})

# keep=False flags every occurrence of a repeated question, not only the later ones.
duplicate_questions_df = df_q.loc[df_q['question'].duplicated(keep=False)].copy()

# doc_index came from enumerate(), so it is a position: look the content up with iloc.
duplicate_questions_df['content'] = duplicate_questions_df['doc_index'].map(
    lambda pos: documents['content'].iloc[pos]
)

# Optional: put identical questions next to each other for easier review.
duplicate_questions_df = duplicate_questions_df.sort_values(
    by=['question', 'doc_index'], ignore_index=True
)

duplicate_questions_df

Unnamed: 0,question,doc_index,content
0,,206,"Right, that is quite a bit better.Can you give..."
1,,835,"CPU power gangs up on one.Â Instead, all netw..."
2,Did Satoshi Nakamoto discuss peer-to-peer netw...,115,"davidonpda, were you also running laszlo's bui..."
3,Did Satoshi Nakamoto discuss peer-to-peer netw...,797,Code:diff -u old\main.cpp new\main.cpp--- old\...
4,What did Satoshi Nakamoto say about micropayme...,545,". We have to trust them with\nour privacy, tr..."
5,What did Satoshi Nakamoto say about micropayme...,727,Forgot to add the good part about micropayment...
6,What did Satoshi Nakamoto say about peer-to-pe...,17,Could be. They're talking about the old Chaumi...
7,What did Satoshi Nakamoto say about peer-to-pe...,156,The design outlines a lightweight client that ...
8,What did Satoshi Nakamoto say about peer-to-pe...,212,Gavin's changes look good.Â I think everythin...
9,What did Satoshi Nakamoto say about peer-to-pe...,439,Bitcoin Design\nWe have proposed a system for ...


## Using Embeddings and Cosine Similarity

In [None]:
from dotenv import load_dotenv
load_dotenv()  # read .env into the process environment before the imports below

# from langfuse.openai import AsyncOpenAI
# The class name is case-sensitive: it is `AsyncOpenAI`, not `AsyncOpenAi`.
from openai import AsyncOpenAI
from langfuse import observe

from diskcache_decorator import cached

In [2]:
# Embedding model name used by the embedding requests in this notebook.
EMBED_MODEL = "mxbai-embed-large:latest"

# Async OpenAI-style client pointed at a local Ollama endpoint.
client = AsyncOpenAI(
    api_key="ollama",  # Dummy key, required but not used by Ollama
    base_url="http://localhost:11434/v1",  # Ollama's default local endpoint
)

In [None]:
# test call

# note that the main difference is we provide "input" instead of "message"
embedding = await client.embeddings.create(
    input="Hello, world!",
    model=EMBED_MODEL,
)

embedding.data[0].embedding[0]  # first numer to not spam screen

0.017286297

In [14]:
@cached()
@observe(as_type="generation")
async def make_cached_traced_embedding(input):
    """Return the embedding vector for ``input``.

    Sends ``input`` to ``client.embeddings.create`` with ``EMBED_MODEL`` and
    returns the ``embedding`` attribute of the first item in the response's
    ``data``. The coroutine is wrapped by ``observe(as_type="generation")``
    and then by ``cached()``.
    """
    response = await client.embeddings.create(model=EMBED_MODEL, input=input)
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Call the cached/traced helper once and display only the first component.
embedding = await make_cached_traced_embedding("blubb")
embedding[0]

[0.010233218,
 -0.038600657,
 0.009757531,
 0.027120082,
 -0.009036253,
 -0.02270763,
 0.015169542,
 0.02873053,
 0.019938942,
 0.004861243,
 0.015461836,
 -0.012948609,
 -0.012042771,
 -0.008697443,
 0.0012249082,
 0.018399006,
 -0.038697235,
 -0.03323143,
 -0.036738977,
 -0.0095467735,
 0.022397714,
 -0.0023021,
 -0.083187714,
 0.0406556,
 -0.0011044745,
 0.033370264,
 -9.205897e-05,
 0.009889028,
 0.063418634,
 0.06738471,
 0.014880561,
 -0.024583112,
 0.024596564,
 -0.06547182,
 -0.021018432,
 -0.0060950806,
 0.012801508,
 -0.019808754,
 -0.012101464,
 -0.024820216,
 0.015533624,
 -0.01566893,
 0.043049376,
 -0.039670505,
 -0.02906719,
 -0.018088581,
 -0.037176438,
 -0.030798793,
 0.034067564,
 -0.060578868,
 -0.045236975,
 0.02752229,
 0.015319432,
 -0.0049959575,
 -0.008969736,
 -0.0075763008,
 -0.011640021,
 0.002754471,
 0.008852839,
 0.03442849,
 0.05464797,
 -0.0033712513,
 0.036154762,
 -0.032057438,
 0.0028879775,
 0.015582766,
 -0.011349066,
 0.025634324,
 0.012916364,
 0.