# The 2022 Wordle Golf Summary

In [2]:
import operator
import datetime
import pandas as pd
from bokeh import plotting
from bokeh import models
from bokeh import transform
from dateutil import parser
from IPython.display import Markdown as md

# Configure Bokeh so that later plotting.show(...) calls render inline in the notebook.
plotting.output_notebook()

In [3]:
# Golf-style points for each Wordle result, keyed by the result as a string.
# Lower totals are better: the weekly winner is the player with the smallest sum.
# 'X' is presumably an unsolved puzzle (it carries the largest penalty).
wordle_to_golf_score = {
    '1': -10,
    '2': -3,
    '3': -1,
    '4': 0,
    '5': 1,
    '6': 2,
    'X': 4,
}

In [4]:
# Fixed hex colour per player so every chart draws the same person the same way.
PlayerToColor = {
    'madre': '#f8961e',
    'doug': '#277da1',
    'daniel': '#f9c74f',
    'david': '#43aa8b',
    'jamie': '#8e7dbe',
    'lisa': '#f94144',
}


def color_list_from_player_list(players):
    """Return the chart colour of each player, in the same order as ``players``.

    Raises:
        KeyError: if a name has no entry in ``PlayerToColor``.
    """
    colors = []
    for player in players:
        colors.append(PlayerToColor[player])
    return colors

In [5]:
# First, let's load the wordle scores
scores_df = pd.read_csv("wordle_scores")

# And clean them up...
# Textra wrote out my scores as "me", let's get them back to "david"
scores_df['name'] = scores_df['name'].replace(['me'], 'david')

# Add a week column to the dataframe: puzzles are bucketed into 7-day blocks
# counted from the first puzzle of the season.
starting_puzzle_number = 302
days_since_start = scores_df['puzzle_number'] - starting_puzzle_number
scores_df['week'] = days_since_start // 7

# Puzzle numbers advance one per day, so the calendar date is the start date
# plus the offset from the first puzzle (computed for the whole column at once).
start_date = datetime.datetime(2022, 4, 17)
scores_df['date'] = start_date + pd.to_timedelta(days_since_start, unit='D')

# Keep only puzzles dated before 2023-01-01 (i.e. drop anything from 2023 on).
# .copy() gives us an independent frame, so adding columns below does not
# trigger pandas' SettingWithCopyWarning.
scores_df = scores_df[scores_df['date'] < datetime.datetime(2023, 1, 1)].copy()

# Map the wordle score to the corresponding golf score
scores_df['golf_score'] = scores_df['score'].map(wordle_to_golf_score)

In [6]:
# Build the data source into the right format: one row per player with the
# number of score rows recorded for them.
# rename_axis + reset_index(name=...) produces the columns 'name' and
# 'days_played' no matter how the installed pandas labels the value_counts index.
days_played_df = (
    scores_df['name']
    .value_counts()
    .rename_axis('name')
    .reset_index(name='days_played')
)
source = models.ColumnDataSource(days_played_df)

# Plain Python list of the categorical x-axis factors, in bar order.
player_names = days_played_df['name'].tolist()

# Create the figure
p = plotting.figure(title="How many days have we played?",
                    x_range=player_names,
                    x_axis_label="",
                    y_axis_label="Number of days played",
                    toolbar_location=None, tools="")
# Populate it with the data and labels
p.vbar(source=source, x='name', top='days_played', width=0.9,
       fill_color=transform.factor_cmap('name',
                                        palette=color_list_from_player_list(player_names),
                                        factors=player_names),
       line_color=None)
labels = models.LabelSet(x='name', y='days_played', text='days_played',
                         level='glyph', source=source, y_offset=10, text_align='center')
p.add_layout(labels)

# Minor formatting.
p.y_range.start = 0
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

plotting.show(p)

In [7]:
# Share of each Wordle result per player: the number of rows for a
# (name, score) pair divided by that player's total number of rows
# (both counts only include rows with a non-null golf_score).
score_prob_df = scores_df.groupby(["name", "score"]).golf_score.count() / scores_df.groupby("name").golf_score.count()
score_prob_df = score_prob_df.reset_index()
score_prob_df = score_prob_df.rename(columns={"golf_score": "probability"})

scores = ["1", "2", "3", "4", "5", "6", "X"]
names = score_prob_df.name.unique()

# Index the probabilities by (score, name) once, instead of re-filtering the
# whole frame (twice) for every pair.
probability_by_category = dict(
    zip(zip(score_prob_df['score'], score_prob_df['name']), score_prob_df['probability'])
)

# Nested categorical axis: outer level is the Wordle result, inner level the player.
# A player who never got a given result has no row, so they get probability 0.
x_categories = [(score, name) for score in scores for name in names]
probabilities = [probability_by_category.get(category, 0) for category in x_categories]

source = models.ColumnDataSource(data=dict(x=x_categories, prob=probabilities))

p = plotting.figure(x_range=models.FactorRange(*x_categories),
                    height=350, title="How does everyone score?", y_axis_label="Probability",
                    toolbar_location=None, tools="")

palette = color_list_from_player_list(names)
# start=1, end=2 makes the colour mapper look only at the second element of
# each (score, name) factor, i.e. the player name.
p.vbar(x="x", top="prob", width=0.9, source=source,
       fill_color=transform.factor_cmap('x', palette=palette, factors=list(names), start=1, end=2),
       line_color=None)
p.xaxis.major_label_orientation = "vertical"

# Minor formatting.
p.y_range.start = 0

plotting.show(p)

In [10]:
# Running total of each player's golf score over time, one line per player.
p = plotting.figure(
    title="What is everyone's summed score?",
    x_axis_label='Date',
    y_axis_label='Cumulative Score',
    width=800,
    x_axis_type='datetime',
)

# Sort by date first so the cumulative sum inside each player's group runs
# in chronological order.
scores_by_date_df = scores_df.sort_values("date", ascending=True)
for name, player_score_df in scores_by_date_df.groupby("name"):
    running_total = player_score_df.golf_score.cumsum()
    p.line(
        player_score_df.date,
        running_total,
        legend_label=name,
        line_width=2,
        color=PlayerToColor[name],
    )

p.xaxis[0].ticker.desired_num_ticks = 10
p.legend.location = "bottom_left"
plotting.show(p)

In [11]:
def calculate_win_count(scores_df, max_week):
    """Count how many weeks each player won.

    Only rows with ``week < max_week`` are considered. A week's winners are the
    players with the lowest summed ``golf_score`` in that week; ties count as a
    win for every tied player.

    Args:
        scores_df: DataFrame with ``name``, ``week`` and ``golf_score`` columns.
        max_week: Exclusive upper bound on the week index to include.

    Returns:
        dict mapping every player name present in the filtered rows to their
        number of weekly wins (0 for players who never won).
    """
    scores_df = scores_df[scores_df['week'] < max_week]

    # Start every player at zero so players without a win still show up.
    win_count = {name: 0 for name in scores_df['name'].unique()}

    # Determine the winners for each week. We're okay with friendly ties.
    for _, week_scores_df in scores_df.groupby("week"):
        # Sum only golf_score: summing the whole frame would also try to add
        # up the other columns (dates, raw score strings), which fails for
        # datetime columns on current pandas.
        weekly_totals = week_scores_df.groupby("name")['golf_score'].sum()
        winning_players = weekly_totals[weekly_totals == weekly_totals.min()].index.tolist()

        for name in winning_players:
            win_count[name] += 1

    return win_count


# Build the data source into the right format: players ordered from most to
# fewest weekly wins.
# NOTE(review): only weeks with index < 36 are counted here, whereas the loss
# chart is computed over every week - confirm the cut-off is intentional.
win_count = calculate_win_count(scores_df, 36)
ranked_wins = sorted(win_count.items(), key=operator.itemgetter(1), reverse=True)
sorted_win_count = dict(ranked_wins)
data = {
    'name': [player for player, _ in ranked_wins],
    'wins': [wins for _, wins in ranked_wins],
}
source = models.ColumnDataSource(data=data)


# Create the figure
p = plotting.figure(
    title="Who has won the most weeks?",
    x_range=source.data['name'],
    x_axis_label="",
    y_axis_label="Number of weeks won",
    toolbar_location=None,
    tools="",
)

# Populate it with the data and labels
bar_fill = transform.factor_cmap(
    'name',
    palette=color_list_from_player_list(source.data['name']),
    factors=source.data['name'],
)
p.vbar(source=source, x='name', top='wins', width=0.9, fill_color=bar_fill, line_color=None)
labels = models.LabelSet(
    x='name', y='wins', text='wins',
    level='glyph', source=source, y_offset=10, text_align='center',
)
p.add_layout(labels)

# Minor formatting.
p.y_range.start = 0
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

plotting.show(p)

In [12]:
def calculate_loss_count(scores_df, max_week=None):
    """Count how many weeks each player finished last.

    A week's losers are the players with the highest summed ``golf_score`` in
    that week; ties count as a loss for every tied player.

    Args:
        scores_df: DataFrame with ``name``, ``week`` and ``golf_score`` columns.
        max_week: Optional exclusive upper bound on the week index to include.
            ``None`` (the default) counts every week in ``scores_df``.

    Returns:
        dict mapping every player name present in the considered rows to their
        number of weekly losses (0 for players who never finished last).
    """
    if max_week is not None:
        scores_df = scores_df[scores_df['week'] < max_week]

    # Start every player at zero so players without a loss still show up.
    loss_count = {name: 0 for name in scores_df['name'].unique()}

    # Determine the losers for each week. We're okay with friendly ties.
    for _, week_scores_df in scores_df.groupby("week"):
        # Sum only golf_score: summing the whole frame would also try to add
        # up the other columns (dates, raw score strings), which fails for
        # datetime columns on current pandas.
        weekly_totals = week_scores_df.groupby("name")['golf_score'].sum()
        losing_players = weekly_totals[weekly_totals == weekly_totals.max()].index.tolist()

        for name in losing_players:
            loss_count[name] += 1

    return loss_count


# Build the data source into the right format: players ordered from most to
# fewest weeks spent in last place.
loss_count = calculate_loss_count(scores_df)
ranked_losses = sorted(loss_count.items(), key=operator.itemgetter(1), reverse=True)
sorted_loss_count = dict(ranked_losses)
data = {
    'name': [player for player, _ in ranked_losses],
    'losses': [losses for _, losses in ranked_losses],
}
source = models.ColumnDataSource(data=data)


# Create the figure
p = plotting.figure(
    title="Who has lost the most weeks?",
    x_range=source.data['name'],
    x_axis_label="",
    y_axis_label="Number of weeks in dead last",
    toolbar_location=None,
    tools="",
)

# Populate it with the data and labels
bar_fill = transform.factor_cmap(
    'name',
    palette=color_list_from_player_list(source.data['name']),
    factors=source.data['name'],
)
p.vbar(source=source, x='name', top='losses', width=0.9, fill_color=bar_fill, line_color=None)
labels = models.LabelSet(
    x='name', y='losses', text='losses',
    level='glyph', source=source, y_offset=10, text_align='center',
)
p.add_layout(labels)

# Minor formatting.
p.y_range.start = 0
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

plotting.show(p)

In [13]:
# Total golf score for every (week, player) pair.
week_scores_df = scores_df.groupby(["week", "name"]).golf_score.sum()

# Markdown table header; one row is appended per ranked entry below.
score_string = "| | Name | Score | Week # | \n | :--- | :--- | :--- | :--- \n"

# The five lowest (best) weekly totals, ranked from 1.
best_weeks = week_scores_df.sort_values(ascending=True).head(5)
for rank, ((week, name), score) in enumerate(best_weeks.items(), start=1):
    score_string += f"| {rank} | {name.capitalize()} | {score} | {week} \n"

md(f"""
### Top Weekly Scores
{score_string}
""")


### Top Weekly Scores
| | Name | Score | Week # | 
 | :--- | :--- | :--- | :--- 
| 1 | Jamie | -14 | 13 
| 2 | Doug | -11 | 34 
| 3 | Doug | -8 | 29 
| 4 | Lisa | -8 | 5 
| 5 | Jamie | -8 | 4 



In [14]:
# Total golf score for every (week, player) pair.
week_scores_df = scores_df.groupby(["week", "name"]).golf_score.sum()

# Markdown table header; one row is appended per ranked entry below.
score_string = "| | Name | Score | Week # | \n | :--- | :--- | :--- | :--- \n"

# The five highest (worst) weekly totals, ranked from 1.
worst_weeks = week_scores_df.sort_values(ascending=False).head(5)
for rank, ((week, name), score) in enumerate(worst_weeks.items(), start=1):
    # The '+' format spec writes an explicit sign, so a negative total would
    # render as "-3" rather than the malformed "+-3" a literal "+" prefix gives.
    score_string += f"| {rank} | {name.capitalize()} | {score:+} | {week} \n"

md(f"""
### Bottom Weekly Scores
{score_string}
""")


### Bottom Weekly Scores
| | Name | Score | Week # | 
 | :--- | :--- | :--- | :--- 
| 1 | Madre | +6 | 15 
| 2 | Doug | +6 | 11 
| 3 | Lisa | +6 | 2 
| 4 | Jamie | +6 | 2 
| 5 | Doug | +5 | 25 



### What words were the easiest?

In [15]:
# Archive of the daily answers; later cells read its `Date` and `Word` columns.
# (The per-date score averages are computed in the next cell, where they are
# merged with this archive, so they are not pre-computed here.)
word_archive_df = pd.read_csv("data/wordle_word_archive.csv")

In [16]:
# Parse the archive's Date values so they can be matched against the score dates.
word_archive_df['datetime'] = word_archive_df['Date'].map(parser.parse)

# Mean golf score across all players for each puzzle date.
daily_word_averages = scores_df.groupby("date")['golf_score'].mean()

# Attach each date's average to its archive row (default merge, matching the
# parsed archive date against the averages' date index).
daily_word_averages_df = pd.merge(
    word_archive_df,
    daily_word_averages.to_frame(),
    left_on='datetime',
    right_index=True,
)

In [17]:
def match_scores_per_day(daily_words_df, scores_df):
    """Build a per-day table of the word, its average score and each player's score.

    Args:
        daily_words_df: DataFrame with ``datetime``, ``Word`` and ``golf_score``
            columns, one row per day to report.
        scores_df: DataFrame with ``date``, ``name`` and ``golf_score`` columns.

    Returns:
        DataFrame indexed by ``date`` with columns ``word``, ``average_score``
        and one column per player who has a score on any of the days. A player
        without a score on a given day gets NaN there. For empty input the
        result is an empty frame with just ``word`` and ``average_score``.
    """
    daily_scores = []
    for _, daily_word in daily_words_df.iterrows():
        scores_that_day = scores_df[scores_df['date'] == daily_word['datetime']]
        daily_scores.append({
            'date': daily_word['datetime'],
            'word': daily_word['Word'],
            'average_score': daily_word['golf_score'],
            # One entry per player: {name: golf_score}.
            **scores_that_day.set_index('name')['golf_score'].to_dict(),
        })

    if not daily_scores:
        # With no rows there is no 'date' column to index on, so build the
        # empty result with its fixed columns explicitly.
        return pd.DataFrame(columns=['date', 'word', 'average_score']).set_index('date')

    return pd.DataFrame(daily_scores).set_index('date')

# The five days with the lowest average golf score, i.e. the easiest words.
sorted_top_scores_df = (
    daily_word_averages_df
    .sort_values('golf_score', ascending=True)
    .head(5)
)
match_scores_per_day(sorted_top_scores_df, scores_df)

Unnamed: 0_level_0,word,average_score,lisa,david,madre,doug,daniel,jamie
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-04-22,PLANT,-2.0,-3,-3,-1,-1,-3.0,-1
2022-12-16,PROBE,-2.0,0,0,-3,-10,0.0,1
2022-07-19,ANGRY,-1.833333,0,0,0,-1,0.0,-10
2022-12-21,LUNAR,-1.4,-1,-1,-1,-3,,-1
2022-11-26,CLEAN,-1.333333,-3,-3,0,0,-1.0,-1


### What words were the hardest?

In [18]:
# The five days with the highest average golf score, i.e. the hardest words.
sorted_top_scores_df = (
    daily_word_averages_df
    .sort_values('golf_score', ascending=False)
    .head(5)
)
match_scores_per_day(sorted_top_scores_df, scores_df)

Unnamed: 0_level_0,word,average_score,jamie,david,doug,lisa,daniel,madre
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-09-16,PARER,3.166667,4,4,4,4.0,4,-1
2022-12-26,JUDGE,2.333333,4,2,4,2.0,1,1
2022-09-06,TAUNT,1.5,2,4,1,1.0,2,-1
2022-07-06,FLUFF,1.333333,1,1,1,1.0,2,2
2022-04-19,FOYER,1.2,1,4,1,,0,0
