# NYTConnections Scrapper

**Author:** Eric Nunes

**Originally Created:** 11 October 2024

This notebook aims to scrape the information from the NYT Connections games and post regular updates to the corresponding dataset. You can find the Kaggle dataset [here](https://www.kaggle.com/datasets/eric27n/the-new-york-times-connections/data).

In [1]:
import numpy as np
import pandas as pd
import requests
import json
from datetime import datetime, timedelta

In [2]:
def reformat_date(date_str):
    try:
        original_date = datetime.strptime(date_str, "%m/%d/%Y")
        new_date_str = original_date.strftime("%Y-%m-%d")
        return new_date_str
    except ValueError:
        return date_str

In [None]:
today_date = datetime.now().date()
yesterday_date = today_date - timedelta(1)
tomorrow_date = today_date + timedelta(1)
datetimes = [yesterday_date, today_date, tomorrow_date] # adjust datetimes as needed

start_date = "2024-11-01"
end_date = "2024-11-15"
date_range = pd.date_range(start=start_date, end=end_date, freq="D").tolist()
datetimes = [date.date() for date in date_range]
print(datetimes)

df = pd.read_csv("hf://datasets/eric27n/NYT-Connections/Connections_Data.csv")

game_ids = []
dates = []
words = []
group_names = []
word_levels = []
rows = []
columns = []

[datetime.date(2024, 11, 1), datetime.date(2024, 11, 2), datetime.date(2024, 11, 3), datetime.date(2024, 11, 4), datetime.date(2024, 11, 5), datetime.date(2024, 11, 6), datetime.date(2024, 11, 7), datetime.date(2024, 11, 8), datetime.date(2024, 11, 9), datetime.date(2024, 11, 10), datetime.date(2024, 11, 11), datetime.date(2024, 11, 12), datetime.date(2024, 11, 13), datetime.date(2024, 11, 14), datetime.date(2024, 11, 15)]


In [21]:
# Editor's Note 2024-10-26:
#     A Row where the word was "NA" was improperly read as NaN (not a number/null).
#     This row is to rectify this problem.
display(df[df['Word'].isnull()])
df.loc[978, 'Word'] = "NA"
display(df[df['Word'].isnull()])
display(df[df['Game ID'] == 62])
print(len(df))

Unnamed: 0,Game ID,Puzzle Date,Word,Group Name,Group Level,Starting Row,Starting Column
978,62,2023-08-12,,PERIODIC TABLE SYMBOLS,3,1,3


Unnamed: 0,Game ID,Puzzle Date,Word,Group Name,Group Level,Starting Row,Starting Column


Unnamed: 0,Game ID,Puzzle Date,Word,Group Name,Group Level,Starting Row,Starting Column
976,62,2023-08-12,MA,STATE ABBREVIATIONS,0,1,1
977,62,2023-08-12,LA,MUSICAL NOTES,1,1,2
978,62,2023-08-12,,PERIODIC TABLE SYMBOLS,3,1,3
979,62,2023-08-12,NU,GREEK LETTERS,2,1,4
980,62,2023-08-12,PI,GREEK LETTERS,2,2,1
981,62,2023-08-12,NI,PERIODIC TABLE SYMBOLS,3,2,2
982,62,2023-08-12,TI,MUSICAL NOTES,1,2,3
983,62,2023-08-12,HE,PERIODIC TABLE SYMBOLS,3,2,4
984,62,2023-08-12,FA,MUSICAL NOTES,1,3,1
985,62,2023-08-12,XI,GREEK LETTERS,2,3,2


8304


In [22]:
df['Puzzle Date'] = df['Puzzle Date'].apply(reformat_date)
display(df)

Unnamed: 0,Game ID,Puzzle Date,Word,Group Name,Group Level,Starting Row,Starting Column
0,1,2023-06-12,SNOW,WET WEATHER,0,1,1
1,1,2023-06-12,LEVEL,PALINDROMES,3,1,2
2,1,2023-06-12,SHIFT,KEYBOARD KEYS,2,1,3
3,1,2023-06-12,KAYAK,PALINDROMES,3,1,4
4,1,2023-06-12,HEAT,NBA TEAMS,1,2,1
...,...,...,...,...,...,...,...
8299,547,2024-11-11,PAYDAY,CHOCOLATE BARS,0,3,4
8300,547,2024-11-11,CORE,GIST,1,4,1
8301,547,2024-11-11,DOVE,CHOCOLATE BARS,0,4,2
8302,547,2024-11-11,LADY,___ BIRD,3,4,3


In [23]:
# Print entries for a given date
date = "2024-10-31" # change date as you please
filtered_df = df[df['Puzzle Date'] == date]
result = [row for row in filtered_df['Word'].values]
print(",".join(result))
display(filtered_df)

BAT,EGG,LOVE,WITCH,PUMPKIN,SPIDER,BLINK,JOB,DIABLO,FLUTTER,TREASURE,VIPER,SCAVENGER,MUSTANG,WINK,DARLING


Unnamed: 0,Game ID,Puzzle Date,Word,Group Name,Group Level,Starting Row,Starting Column
8112,523,2024-10-31,BAT,THINGS YOU CAN DO WITH YOUR EYELIDS,1,1,1
8113,523,2024-10-31,EGG,___ HUNT,3,1,2
8114,523,2024-10-31,LOVE,TERMS OF ENDEARMENT,0,1,3
8115,523,2024-10-31,WITCH,___ HUNT,3,1,4
8116,523,2024-10-31,PUMPKIN,TERMS OF ENDEARMENT,0,2,1
8117,523,2024-10-31,SPIDER,SPORTS CARS,2,2,2
8118,523,2024-10-31,BLINK,THINGS YOU CAN DO WITH YOUR EYELIDS,1,2,3
8119,523,2024-10-31,JOB,___ HUNT,3,2,4
8120,523,2024-10-31,DIABLO,SPORTS CARS,2,3,1
8121,523,2024-10-31,FLUTTER,THINGS YOU CAN DO WITH YOUR EYELIDS,1,3,2


In [24]:
# Run data extraction for today and tomorrow dates.
for date in datetimes:
    url = f"https://www.nytimes.com/svc/connections/v1/{date}.json"
    response = requests.get(url)
    results = json.loads(response.text)
    
    game_ids.extend([results['id']] * 16)
    dates.extend([date.strftime("%Y-%m-%d")] * 16)
    
    starting_board = results['startingGroups']
    for x in range(1, 5):
        for y in range(1, 5):
            word = starting_board[x-1][y-1]
            words.append(word)
            rows.append(x)
            columns.append(y)
            for key in results['groups'].keys():
                if word in results['groups'][key]['members']:
                    group_names.append(key)
                    word_levels.append(results['groups'][key]['level'])
    
    today_df = pd.DataFrame({
        'Game ID': game_ids,
        'Puzzle Date': dates,
        'Word': words,
        'Group Name': group_names,
        'Group Level': word_levels,
        'Starting Row': rows,
        'Starting Column': columns
    })
    
    df = pd.concat([df, today_df], ignore_index=True)
display(df.tail(20))

Unnamed: 0,Game ID,Puzzle Date,Word,Group Name,Group Level,Starting Row,Starting Column
10204,545,2024-11-14,EMPTY,DEPLETE,0,4,1
10205,545,2024-11-14,GROOVE,PLAY MUSIC WITH PASSION,1,4,2
10206,545,2024-11-14,TIP,WORDS ON A RESTAURANT RECEIPT,2,4,3
10207,545,2024-11-14,EXHAUST,DEPLETE,0,4,4
10208,561,2024-11-15,PIPE,INSTRUMENTS YOU BLOW INTO,2,1,1
10209,561,2024-11-15,PHRASE,ELEMENTS OF WRITING,1,1,2
10210,561,2024-11-15,JUG,INSTRUMENTS YOU BLOW INTO,2,1,3
10211,561,2024-11-15,PASSAGE,___ OF TIME,3,1,4
10212,561,2024-11-15,LEAK,FISSURE,0,2,1
10213,561,2024-11-15,LETTER,ELEMENTS OF WRITING,1,2,2


In [27]:
import pandas as pd

# Count occurrences of each "Puzzle Date"
puzzle_date_counts = df['Puzzle Date'].value_counts()

# Filter dates with less than or more than 16 occurrences
filtered_dates = puzzle_date_counts[(puzzle_date_counts < 16) | (puzzle_date_counts > 16)]

# Print the filtered dates
print(filtered_dates)
display(df[df['Puzzle Date'] == "2024-11-11"])

Puzzle Date
2024-11-11    20
Name: count, dtype: int64


Unnamed: 0,Game ID,Puzzle Date,Word,Group Name,Group Level,Starting Row,Starting Column
8288,547,2024-11-11,CRUNCH,CHOCOLATE BARS,0,1,1
9344,547,2024-11-11,CRUNCH,CANDY BARS,0,1,1
8289,547,2024-11-11,SUBSTANCE,GIST,1,1,2
8290,547,2024-11-11,DEMI,KINDS OF BRAS,2,1,3
8291,547,2024-11-11,SUE,___ BIRD,3,1,4
8292,547,2024-11-11,BIG,___ BIRD,3,2,1
8293,547,2024-11-11,THRUST,GIST,1,2,2
8294,547,2024-11-11,SPORTS,KINDS OF BRAS,2,2,3
8295,547,2024-11-11,PUSH-UP,KINDS OF BRAS,2,2,4
8296,547,2024-11-11,MARS,CHOCOLATE BARS,0,3,1


In [28]:
# Additional verification: remove any possible duplicates, sort dataset.
print(len(df))
df = df.drop_duplicates()
df = df.sort_values(by=['Puzzle Date', 'Starting Row', 'Starting Column'])
remove_df = (df['Puzzle Date'] == "2024-11-11") & (df['Group Name'] == "CHOCOLATE BARS")
df = df[~remove_df]
print(len(df))

8372
8368


In [29]:
# Save, update Kaggle dataset
df.to_csv('Connections_Data.csv', index=False)