In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

model_name = "bert-base-uncased"  # TODO: Try different models
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad(): # we are not training the model, so we don't need gradients
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
    return embedding




In [None]:
def calculate_cosine_similarity(title1, title2):
    embedding1 = get_embedding(title1)
    embedding2 = get_embedding(title2)
    embedding1 = embedding1.reshape(1, -1)
    embedding2 = embedding2.reshape(1, -1)

    similarity = cosine_similarity(embedding1, embedding2)[0][0]
    return similarity

# Example usage
title1 = "Understanding Machine Learning"
title2 = "Introduction to Deep Learning"
similarity = calculate_cosine_similarity(title1, title2)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.7934294939041138


In [3]:
article1= "14th_century"
article2= "African_slave_trade"
similarity = calculate_cosine_similarity(article1, article2)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.6303218603134155


In [7]:
article1= "14th_century"
article2= "15th_century"
similarity = calculate_cosine_similarity(article1, article2)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.9655939936637878


In [8]:
article1_2= "14th_century"
article2_2= "Ottoman_Empire"
similarity = calculate_cosine_similarity(article1_2, article2_2)
print(f"Cosine Similarity: {similarity}")

Cosine Similarity: 0.7063847780227661


In [None]:
%pip install sentence_transformers

In [10]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')

def calculate_sbert_similarity(title1, title2):
    # Get embeddings
    embedding1 = model.encode(title1, convert_to_tensor=True)
    embedding2 = model.encode(title2, convert_to_tensor=True)
    # Calculate cosine similarity using SBERT's util function
    similarity = util.pytorch_cos_sim(embedding1, embedding2).item()
    return similarity



Collecting sentence_transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
   ---------------------------------------- 0.0/255.8 kB ? eta -:--:--
   - -------------------------------------- 10.2/255.8 kB ? eta -:--:--
   ---- ---------------------------------- 30.7/255.8 kB 330.3 kB/s eta 0:00:01
   --------- ----------------------------- 61.4/255.8 kB 409.6 kB/s eta 0:00:01
   ------------------ ------------------- 122.9/255.8 kB 654.9 kB/s eta 0:00:01
   ------------------------------ ------- 204.8/255.8 kB 888.4 kB/s eta 0:00:01
   --------------------------------- ---- 225.3/255.8 kB 860.2 kB/s eta 0:00:01
   -------------------------------------- 255.8/255.8 kB 826.7 kB/s eta 0:00:00
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-3.2.1
Note: you may need to restart the kernel to use updated packages.


In [11]:
# Example usage
title1_1 = "Understanding Machine Learning"
title2_1 = "Introduction to Deep Learning"
similarity = calculate_sbert_similarity(title1, title2)
print(f"Cosine Similarity with SBERT: {similarity}")

Cosine Similarity with SBERT: 0.5268170833587646


In [12]:
article1_1= "14th_century"
article2_1= "15th_century"
similarity = calculate_sbert_similarity(article1_1, article2_1)
print(f"Cosine Similarity with SBERT: {similarity}")

Cosine Similarity with SBERT: 0.9229943156242371


In [13]:
article1_3= "14th_century"
article2_3= "African_slave_trade"
similarity = calculate_sbert_similarity(article1_3, article2_3)
print(f"Cosine Similarity with SBERT: {similarity}")

Cosine Similarity with SBERT: 0.30906590819358826


In [1]:
import os
import sys

%pip install mistralai


from mistralai import Mistral

Note: you may need to restart the kernel to use updated packages.


In [7]:
api_key = "H1pJaiozF8BK9hkWc14qeSWCJll31t7U"
model = "mistral-large-latest"

client = Mistral(api_key=api_key)

stream_response = client.chat.stream(
    model = model,
    messages = [
        {
            "role": "user",
            "content": "What is the best French cheese?",
        },
    ]
)

for chunk in stream_response:
    print(chunk.data.choices[0].delta.content)


Det
erm
ining
 the
 "
best
"
 French
 cheese
 can
 be
 subject
ive
,
 as
 it
 largely
 depends
 on
 personal
 taste
.
 France
 is
 renown
ed
 for
 its
 wide
 variety
 of
 che
es
es
,
 with
 estimates
 suggesting
 there
 are
 over
 
1
,
0
0
0
 different
 types
.
 Here
 are
 a
 few
 highly
 regarded
 French
 che
es
es
 across
 various
 categories
:




1
.
 **
Soft
 Che
es
es
:**


  
 -
 **
B
rie
 de
 Me
aux
**:
 K
nown
 for
 its
 cream
y
 texture
 and
 rich
,
 earth
y
 flavor
.


  
 -
 **
Cam
ember
t
 de
 Norm
and
ie
**:
 A
 soft
,
 cream
y
 cheese
 with
 a
 strong
,
 distinctive
 taste
.




2
.
 **
S
emi
-
Soft
 Che
es
es
:**


  
 -
 **
M
orb
ier
**:
 Rec
ogn
izable
 by
 its
 layer
 of
 ash
 in
 the
 middle
,
 it
 has
 a
 nut
ty
 and
 fru
ity
 flavor
.


  
 -
 **
Reb
lo
ch
on
**:
 A
 sav
ory
 cheese
 from
 the
 Al
ps
 with
 a
 nut
ty
 taste
 and
 a
 soft
,
 washed
 r
ind
.




3
.
 **
Hard
 Che
es
es
:**


  
 -
 **
Com
té
**:
 A
 firm
,
 nut
ty
 cheese
 made
 from
 un
p
aste
ur


In [2]:
import pandas as pd
import time
import os



api_key = "H1pJaiozF8BK9hkWc14qeSWCJll31t7U"
model = "mistral-large-latest"

client = Mistral(api_key=api_key)

DATA_FOLDER = 'data/wikispeedia_paths-and-graph/'

# Read the data files
links = pd.read_csv(DATA_FOLDER + 'links.tsv', sep='\t', skiprows=11, names=['linkSource', 'linkTarget'])
paths_finished = pd.read_csv(DATA_FOLDER + 'paths_finished.tsv', sep='\t', skiprows=15, names=['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating'])

# Prepare the links dictionary for fast lookup
links_dict = links.groupby('linkSource')['linkTarget'].apply(list).to_dict()

# Initialize output data structures
llm_choices = []
llm_paths = []

run_id = 2  # Unique identifier for the run

print("Starting navigation...")

# Iterate over just one path in paths_finished.tsv for testing
for index, row in paths_finished.head(1).iterrows():
    path = row['path'].split(';')
    start_article = path[0]
    end_article = path[-1]

    current_article = start_article
    steps = 0
    path_taken = [current_article]

    print(f"Path: {start_article} -> {end_article}")

    while current_article != end_article:
        # Retrieve the links of the current article
        linked_articles = links_dict.get(current_article, [])

        if not linked_articles:
            print(f"No outgoing links from {current_article}.")
            break

        print(f"Step {steps}: {current_article}")
        print(f"Available links: {', '.join(linked_articles)}")

        # Prepare the prompt for the LLM
        prompt = f"You are navigating Wikipedia from '{start_article}' to '{end_article}'.\n" \
                 f"Currently at '{current_article}'.\n" \
                 f"Available links: {', '.join(linked_articles)}.\n" \
                 f"Which article would you like to visit next? Respond only with the article name."

        # Call the OpenAI API
        chat_completion = client.chat.complete(
            model=model,
            messages=[
                {"role": "user", "content": prompt}
            ],
            max_tokens=10,
            temperature=0,
            n=1
        )

        # Extract the LLM's choice
        choice = chat_completion.choices[0].message.content

        # Validate the choice
        if choice not in linked_articles:
            print(f"Invalid choice '{choice}' made by the LLM. Selecting the first link as fallback.")
            choice = linked_articles[0]

        # Record the choice
        llm_choices.append({
            'run_id': run_id,
            'article': current_article,
            'links': linked_articles,
            'link_chosen': choice
        })

        print(f"Selected link: {choice}")

        # Update the path and step
        current_article = choice
        path_taken.append(current_article)
        steps += 1

        # To comply with OpenAI rate limits
        time.sleep(10)

    # Record the path taken
    llm_paths.append({
        'run_id': run_id,
        'steps': steps,
        'path': ';'.join(path_taken)
    })

# Convert the results to DataFrames
llm_choices_df = pd.DataFrame(llm_choices)
llm_paths_df = pd.DataFrame(llm_paths)

# Save the results to TSV files
llm_choices_df.to_csv('tests/llm_choices.tsv', sep='\t', index=False)
llm_paths_df.to_csv('tests/llm_paths.tsv', sep='\t', index=False)

print("Navigation completed.")

Starting navigation...
Path: 14th_century -> African_slave_trade
Step 0: 14th_century
Available links: 13th_century, 15th_century, Abacus, Aztec, Black_Death, Buddha, China, Christianity, Dante_Alighieri, Dark_Ages, Edward_III_of_England, England, English_peasants%27_revolt_of_1381, Europe, France, Hundred_Years%27_War, Ibn_Battuta, India, Islam, Italy, Lithuania, Ming_Dynasty, Niger, Ottoman_Empire, Poland, Pope, Renaissance, Singapore, Time, Timur, Washington%2C_D.C.
Selected link: Ottoman_Empire
Step 1: Ottoman_Empire
Available links: 15th_century, 16th_century, 17th_century, 20th_century, Abbasid, Albania, Algeria, Arabic_language, Armenia, Asia, Atlantic_Ocean, Austria, Azerbaijan, Baghdad, Baroque, Beirut, Black_Sea, Bulgaria, Byzantine_Empire, Calligraphy, Caspian_Sea, Christopher_Columbus, Constitutional_monarchy, Crimean_War, Currency, Cyprus, Egypt, England, Europe, Folk_music, France, Greece, Greek_War_of_Independence, Guild, Habsburg_Spain, Holy_Roman_Empire, Indian_Ocean, 