# Semantle Notebook

In [62]:
import os
import requests
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from gensim.models import KeyedVectors
from dotenv import load_dotenv
from openai import OpenAI

## Get Past Semantles

In [None]:
past_semantles = []
nearby_words = []
num_games = 0
for i in range(1171):
  url = f"https://server.semantle.com/semantle/game-and-nearest/{i}/en"
  response = requests.get(url)
  try:
    data = json.loads(response.text)
  except json.JSONDecodeError:
    print(f"Error decoding JSON for game ID {i}: {response.text}")
    continue
  num_games += 1
  past_semantles.append(data['word'])
  for near in data['nearby']:
    nearby_words.append(near)

semantle_df = pd.DataFrame({
  'game_id': range(1, num_games+1),
  'answer': past_semantles,
  'nearby_1': [nearby_words[i] for i in range(0, len(nearby_words), 10)],
  'nearby_2': [nearby_words[i] for i in range(1, len(nearby_words), 10)],
  'nearby_3': [nearby_words[i] for i in range(2, len(nearby_words), 10)],
  'nearby_4': [nearby_words[i] for i in range(3, len(nearby_words), 10)],
  'nearby_5': [nearby_words[i] for i in range(4, len(nearby_words), 10)],
  'nearby_6': [nearby_words[i] for i in range(5, len(nearby_words), 10)],
  'nearby_7': [nearby_words[i] for i in range(6, len(nearby_words), 10)],
  'nearby_8': [nearby_words[i] for i in range(7, len(nearby_words), 10)],
  'nearby_9': [nearby_words[i] for i in range(8, len(nearby_words), 10)],
  'nearby_10': [nearby_words[i] for i in range(9, len(nearby_words), 10)]
})
semantle_df.to_csv('semantle.csv', index=False)
display(semantle_df)

In [None]:
model_google = api.load('word2vec-google-news-300') #w2v used for sim score in semantle
model_google.similarity('happiness', 'oceanography')

0.060215086

In [17]:
row = semantle_df.iloc[0]
for i in range(1, 10):
  print(f"Game {row['game_id']}: {row['answer']}")
  print(f"Nearby word {i}: {row[f'nearby_{i}']}")
  print(f"Similarity: {model_google.similarity(row['answer'], row[f'nearby_{i}'])}")

Game 1: forever
Nearby word 1: forevermore
Similarity: 0.6128401160240173
Game 1: forever
Nearby word 2: eternally
Similarity: 0.5703619122505188
Game 1: forever
Nearby word 3: irrevocably
Similarity: 0.5341606140136719
Game 1: forever
Nearby word 4: eternity
Similarity: 0.5133062601089478
Game 1: forever
Nearby word 5: indelibly
Similarity: 0.5070809125900269
Game 1: forever
Nearby word 6: permanently
Similarity: 0.49701961874961853
Game 1: forever
Nearby word 7: everlasting
Similarity: 0.4943772554397583
Game 1: forever
Nearby word 8: eons
Similarity: 0.4937649071216583
Game 1: forever
Nearby word 9: irreversibly
Similarity: 0.4742991328239441


## ChatGPT Plays Semantle

In [17]:
semantle_df = pd.read_csv('semantle.csv', index_col=0)
display(semantle_df.head())

Unnamed: 0_level_0,answer,nearby_1,nearby_2,nearby_3,nearby_4,nearby_5,nearby_6,nearby_7,nearby_8,nearby_9,nearby_10
game_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,forever,forevermore,eternally,irrevocably,eternity,indelibly,permanently,everlasting,eons,irreversibly,forgotten
2,executive,director,chairman,president,exec,chairperson,directors,vp,manager,honcho,chairwoman
3,elevator,elevators,escalator,stairwell,staircase,stairway,escalators,stairs,stairwells,dumbwaiter,stairways
4,patience,perseverance,fortitude,persevere,humility,prudence,diligence,tenacity,persistence,impatience,courage
5,overnight,morning,midday,afternoon,midmorning,midafternoon,day,night,hours,evening,daybreak


In [5]:
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")

In [12]:
model_google.most_similar('bread', topn=256)

[('butter', 0.641726016998291),
 ('rye_sourdough', 0.6290417313575745),
 ('breads', 0.6243128180503845),
 ('loaf', 0.6184971928596497),
 ('flour', 0.615212619304657),
 ('baladi_bread', 0.6061378717422485),
 ('loaves', 0.6045446991920471),
 ('raisin_bread', 0.5843342542648315),
 ('stale_bread', 0.5802395343780518),
 ('wheaten_flour', 0.5785929560661316),
 ('french_bread', 0.5768002271652222),
 ('pita_bread', 0.5748327374458313),
 ('pasta', 0.574478268623352),
 ('marble_rye', 0.5722653865814209),
 ('mandazi', 0.5714545249938965),
 ('wholewheat', 0.5712084770202637),
 ('Bread', 0.5707091689109802),
 ('baguettes', 0.5698294639587402),
 ('Challah_bread', 0.5683927536010742),
 ('Grupo_Bimbo_SAB', 0.5638784766197205),
 ('cinnamon_raisin_bread', 0.5588816404342651),
 ('baguette', 0.5556364059448242),
 ('NASS_surveyed', 0.5551199316978455),
 ('zucchini_bread', 0.5546611547470093),
 ('Rahmatullah_Mehsud', 0.5541609525680542),
 ('toasted_bagels', 0.5523495078086853),
 ('flatbread', 0.552343845367

In [52]:
client = OpenAI(api_key=api_key)
description = "You are an assistant configured to solve a game of Semantle."
init_prompt = """
You are an assistant configured to solve a game of Semantle.
Your objective is to guess the secret word based on semantic similarity in as few guesses as possible.
You will provide a list of 10 words to guess, and will continue until you find the secret word.

You may assume the following rules:
1. Semantle will inform you how semantically similar your guess is to the secret word.
2. Unlike other word games, this game is not about spelling; it's about meaning.
3. Word similarity is calculated using word2vec.
4. Once you get within one thousand words of the secret word, we will tell you in the proximity column.
5. You will be provided a list of words you previouslly guessed and their similarity scores. Do not repeat guesses.
6. You have unlimited guesses, but the goal is to find the secret word in as few guesses as possible so be adventurous with your guesses.

Please output your response in a JSON format with the following structure:
{
  "guess": "guess1_here, guess2_here, ..., guess10_here"
}

Good luck!
"""

last_guess = None
last_sim = None
last_rank = None
last_guess_prompt = f"""\n
Last guess: {last_guess}
Last similarity: {last_sim}
Last rank: {last_rank}
"""

previous_guesses = []
previous_guesses_sim = []
previous_guesses_percentile = []
previous_guesses_str = "No previous guesses yet."
previous_guesses_prompt = f"""\n
Previous guesses (word, similarity, rank if applicable):
{previous_guesses_str}
"""

In [44]:
def is_ranked(guess, answer, topn=1000):
  """
  Check if a guess is within the top N most similar words to the answer.

  Args:
    guess (str): Guess
    answer (str): Secret word/answer
    topn (int): Number of top similar words to consider
  
  Returns:
    bool: True if guess is within top N similar words, False otherwise
  """
  try:
    similar_words = model_google.most_similar(positive=[answer], topn=topn)
    i = 0
    for word, _ in similar_words:
      i += 1
      if word == guess:
        print(i)
        return True
  except KeyError: # If the word is not in the model's vocabulary
    return False
  return False

In [53]:
answer = semantle_df.iloc[0]['answer']
last_guess = None
last_sim = None
last_rank = None
last_guess_prompt = f"""\n
Last guess: {last_guess}
Last similarity: {last_sim}
Last rank: {last_rank}
"""

previous_guesses = []
previous_guesses_sim = []
previous_guesses_percentile = []
previous_guesses_str = "No previous guesses yet."
previous_guesses_prompt = f"""\n
Previous guesses (word, similarity, rank if applicable):
{previous_guesses_str}
"""
for a in range(10):
  prompt = init_prompt
  # if last_guess is not None:
  #   prompt += last_guess_prompt
  prompt += previous_guesses_prompt
  print(prompt)
  response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
      {"role": "system", "content": description},
      {"role": "user", "content": prompt}
    ],
    max_completion_tokens=500,
    response_format={ "type": "json_object" }
  )
  response = response.choices[0].message.content
  response = json.loads(response)
  guess_list = response['guess'].split(', ')
  for guess in guess_list:
    last_guess = guess
    print(f"Assistant's guess: {last_guess}")
    
    if last_guess == answer:
      print("Congratulations! The assistant has guessed the secret word!")
      break
    last_sim = model_google.similarity(last_guess, answer)
    last_rank = is_ranked(last_guess, answer)

    # add to previous guesses, sort
    previous_guesses.append(last_guess)
    previous_guesses_sim.append(last_sim)
    previous_guesses_percentile.append(next((i + 1 for i, (word, _) in enumerate(model_google.most_similar(positive=[answer], topn=1000)) if word == last_guess), -1))

    # sort lists by similarity score
    sorted_indices = np.argsort(previous_guesses_sim)[::-1]
    previous_guesses = [previous_guesses[i] for i in sorted_indices]
    previous_guesses_sim = [previous_guesses_sim[i] for i in sorted_indices]
    previous_guesses_percentile = [previous_guesses_percentile[i] for i in sorted_indices]

    # update previous guesses string
    previous_guesses_str = "\n".join([f"{word} (sim: {sim:.4f}, rank: {rank})" for word, sim, rank in zip(previous_guesses, previous_guesses_sim, previous_guesses_percentile)])
    previous_guesses_prompt = f"""\n
Previous guesses (word, similarity, rank if applicable):
{previous_guesses_str}
"""


You are an assistant configured to solve a game of Semantle.
Your objective is to guess the secret word based on semantic similarity in as few guesses as possible.
You will provide a list of 10 words to guess, and will continue until you find the secret word.

You may assume the following rules:
1. Semantle will inform you how semantically similar your guess is to the secret word.
2. Unlike other word games, this game is not about spelling; it's about meaning.
3. Word similarity is calculated using word2vec.
4. Once you get within one thousand words of the secret word, we will tell you in the proximity column.
5. You will be provided a list of words you previouslly guessed and their similarity scores. Do not repeat guesses.
6. You have unlimited guesses, but the goal is to find the secret word in as few guesses as possible so be adventurous with your guesses.

Please output your response in a JSON format with the following structure:
{
  "guess": "guess1_here, guess2_here, ..., guess1

KeyError: "Key 'cherishment' not present"

In [43]:
last_rank

True

In [60]:
import re

def clean_chunks(chunks, min_words=3):
    cleaned = []
    for chunk in chunks:
        # Remove leading/trailing whitespace
        stripped = chunk.strip()
        
        # Skip if empty after stripping
        if not stripped:
            continue
        
        # Skip if original starts with whitespace or control character
        if re.match(r'^\s', chunk):
            continue
        
        # Skip if too few words
        if len(stripped.split()) < min_words:
            continue
        
        cleaned.append(stripped)
    return cleaned

In [61]:
# get text from https://www.npr.org/2025/04/25/nx-s1-5377395/hegseth-defense-department
import requests
from bs4 import BeautifulSoup
response = requests.get("https://www.npr.org/2025/04/23/nx-s1-5339833/young-democrats-trump-primary-challenges-midterm-elections")
soup = BeautifulSoup(response.content, 'html.parser')
text = soup.get_text()
# get all paragraphs <p>
paragraphs = soup.find_all('p')
cleaned = clean_chunks([para.get_text() for para in paragraphs], min_words=10)
for i, chunk in enumerate(cleaned):
  print(chunk)

Elijah Manley was still a teenager when his frustration with President Trump pushed him to get involved in politics. Today, he's finally old enough to run for Congress. Upset with how his own Democratic Party is responding to Trump, he's decided to do just that.
Manley is a 26-year-old substitute history teacher based in Fort Lauderdale, Florida. He's never held elected office but has spent years organizing with local and national progressive groups.
He says he's grown tired of watching Democrats, in his view, fail to be aggressive enough in their response to Trump. So in February, Manley launched a primary challenge against incumbent Democrat Sheila Cherfilus-McCormick for Florida's 20th District.
"We're stuck in this era right now," he said. "What's going to fix that is thinking bigger and being bolder and we're not seeing that in the Democratic Party."
With his leap into the race, Manley joins a growing group of at least five Gen Z and millennials under 40 who have recently launched

In [46]:
model_google.most_similar(positive=[answer], topn=623)

[('forevermore', 0.6128400564193726),
 ('eternally', 0.5703619122505188),
 ('Mikio_Tadano_bristles', 0.5592326521873474),
 ('Ramallah_Jenin_Nablus', 0.557643473148346),
 ('Jadis_icy_spell', 0.5450044870376587),
 ('irrevocably', 0.5341606140136719),
 ('Eldor_Schuman', 0.5220792293548584),
 ('Menemon', 0.521329402923584),
 ('nearby_Lottsburg', 0.5166890621185303),
 ('eternity', 0.513306200504303),
 ('awesome_Torretta', 0.5093652009963989),
 ('indelibly', 0.5070808529853821),
 ('Vive_le_Tour', 0.5066319704055786),
 ('Agha_Lalai_member', 0.49967125058174133),
 ('permanently', 0.4970196485519409),
 ('everlasting', 0.4943772554397583),
 ('eons', 0.4937649369239807),
 ('Bassem_Naim_Hamas', 0.487536758184433),
 ('Rini_Paiva_spokeswoman', 0.4802810549736023),
 ('irreversibly', 0.4742991626262665),
 ('lasts_forever', 0.4676705300807953),
 ('ignorant_bliss', 0.4652673900127411),
 ('doubtless_precede', 0.4642271399497986),
 ('etched_indelibly', 0.46410104632377625),
 ('forgotten', 0.45924186706542

In [None]:


correct_word = semantle_df.iloc[0]['answer']
print(f"Correct word: {correct_word}")
print(f"Similarity to correct word: {model_google.similarity("evermore", correct_word)}")
print(f"Is ranked in top 1000: {is_ranked("forevermore", correct_word)}")

Correct word: forever
Similarity to correct word: 0.31351256370544434
Is ranked in top 1000: True


In [25]:
model_google.most_similar('forever', topn=1000)

[('forevermore', 0.6128400564193726),
 ('eternally', 0.5703619122505188),
 ('Mikio_Tadano_bristles', 0.5592326521873474),
 ('Ramallah_Jenin_Nablus', 0.557643473148346),
 ('Jadis_icy_spell', 0.5450044870376587),
 ('irrevocably', 0.5341606140136719),
 ('Eldor_Schuman', 0.5220792293548584),
 ('Menemon', 0.521329402923584),
 ('nearby_Lottsburg', 0.5166890621185303),
 ('eternity', 0.513306200504303),
 ('awesome_Torretta', 0.5093652009963989),
 ('indelibly', 0.5070808529853821),
 ('Vive_le_Tour', 0.5066319704055786),
 ('Agha_Lalai_member', 0.49967125058174133),
 ('permanently', 0.4970196485519409),
 ('everlasting', 0.4943772554397583),
 ('eons', 0.4937649369239807),
 ('Bassem_Naim_Hamas', 0.487536758184433),
 ('Rini_Paiva_spokeswoman', 0.4802810549736023),
 ('irreversibly', 0.4742991626262665),
 ('lasts_forever', 0.4676705300807953),
 ('ignorant_bliss', 0.4652673900127411),
 ('doubtless_precede', 0.4642271399497986),
 ('etched_indelibly', 0.46410104632377625),
 ('forgotten', 0.45924186706542