# Pull Wordle Tweets

This notebook pulls a sample of tweets about the current daily [Wordle](https://www.powerlanguage.co.uk/wordle/) and adds them to the dataset.

## Package installs and imports

In [None]:
pip install tweepy numpy pandas python-dotenv

In [None]:
from collections import Counter
from datetime import datetime
import numpy as np
import os
import pandas as pd
import pytz
import re
import time
import tweepy
from dotenv import load_dotenv

## Configuration secrets

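You will need your own Twitter API credentials to run the Twitter portion of this code. Set `twitter_api_key`, `twitter_api_secret`, `twitter_access_token`, and `twitter_access_secret` as environment variables or in a local `.env` file; they are loaded by the cell below.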

In [None]:
# Load variables from a local .env file (if present) into the process
# environment so the credentials never have to be written in the notebook.
load_dotenv()

api_key = os.getenv('twitter_api_key')
api_secret = os.getenv('twitter_api_secret')
access_token = os.getenv('twitter_access_token')
access_secret = os.getenv('twitter_access_secret')

# OAuth 1.0a user-context authentication.
auth = tweepy.OAuthHandler(api_key, api_secret)
auth.set_access_token(access_token, access_secret)

# wait_on_rate_limit=True makes tweepy sleep until the rate-limit window
# resets instead of raising part-way through a long paginated search.
api = tweepy.API(auth, wait_on_rate_limit=True)

## Determining our daily Wordle Id

In [None]:
# Day 0 of the id sequence is midnight 2021-06-19, US/Mountain time.
# NOTE(review): the start is anchored in US/Mountain while "now" is expressed
# in US/Pacific. Both are timezone-aware, so the subtraction is well-defined,
# but confirm that the resulting rollover hour is the intended one.
wordle_start = pytz.timezone("US/Mountain").localize(datetime(2021, 6, 19))

# Ask for the current instant in UTC directly. A naive datetime.now() is in
# the machine's *local* time, so labelling it as UTC is only correct on hosts
# whose clock is set to UTC.
now = datetime.now(pytz.utc).astimezone(pytz.timezone("US/Pacific"))

# Whole days elapsed since day 0 gives today's puzzle number.
wordle_id = (now-wordle_start).days
print("Today's wordle_id is: %d (%s)" % (wordle_id, now.strftime("%Y-%m-%d %H:%M PT")))

## Pulling the tweets

In [None]:
# Search for tweets that mention today's puzzle number.
# count=100 asks for the largest page the search endpoint allows; without it
# the smaller default page size multiplies the number of requests needed to
# collect 5000 tweets and makes hitting the rate limit far more likely.
cursor = tweepy.Cursor(api.search_tweets, q="wordle %d" % wordle_id, count=100)
tweets = list(cursor.items(5000))

# Keep only the fields that end up in the dataset, one tuple per tweet:
# (wordle_id, tweet id, creation time, author screen name, tweet text).
wordle_tweets = [
    (wordle_id, tweet.id, tweet.created_at, tweet.author.screen_name, tweet.text)
    for tweet in tweets
]

print("Pulled %d tweets for wordle %d" % (len(wordle_tweets), wordle_id))

## Determine valid Wordle tweets 

In [None]:
def is_valid_wordle_tweet(tweet, wordle_id):
    """Return True when `tweet` is a clean share of puzzle `wordle_id`.

    The tile emoji are first rewritten to the letters Y (green), M (yellow)
    and N (black or white); any pre-existing capital Y/M/N is lower-cased
    beforehand so it cannot be mistaken for a tile.

    A tweet is accepted only if all of the following hold:
      * it contains none of the other numbers in
        [wordle_id - 20, wordle_id + 20),
      * "Wordle <id>" occurs exactly once,
      * it starts with "Wordle <id> k/6" (k from 2 to 6), a blank line, and a
        row of five tiles followed by a newline.

    Args:
        tweet: the tweet text.
        wordle_id: the integer puzzle number the tweet should refer to.

    Returns:
        bool: True if the tweet passes every check, False otherwise.
    """
    # Order matters: each letter is lower-cased before its emoji is mapped
    # onto the upper-case form.
    substitutions = (
        ("Y", "y"), ("🟩", "Y"),
        ("M", "m"), ("🟨", "M"),
        ("N", "n"), ("⬛", "N"), ("⬜", "N"),
    )
    normalized = tweet
    for old, new in substitutions:
        normalized = normalized.replace(old, new)

    # Reject tweets that mention a neighbouring puzzle number.
    neighbours = (i for i in range(wordle_id-20, wordle_id+20) if i != wordle_id)
    if any(str(i) in normalized for i in neighbours):
        return False

    # The header must appear exactly once.
    if len(re.findall("Wordle %d" % wordle_id, normalized)) != 1:
        return False

    # re.match anchors at the start of the text, so the header must lead.
    header_and_first_row = re.match("Wordle %d [2-6]/6\n\n[YMN]{5}\n" % wordle_id, normalized)
    return header_and_first_row is not None

## Enter valid Tweets into new_tweets_df

In [None]:
# Each entry of wordle_tweets is a tuple whose element 0 is the wordle id and
# element 4 is the tweet text; keep only the rows whose text passes validation.
valid_rows = [row for row in wordle_tweets if is_valid_wordle_tweet(row[4], row[0])]

tweet_columns = ["wordle_id", "tweet_id", "tweet_date", "tweet_username", "tweet_text"]
new_tweets_df = pd.DataFrame(valid_rows, columns=tweet_columns)
new_tweets_df

## Break tweets into individual guesses and count of green tiles


In [None]:
# Split every tweet on blank lines into separate columns. For a validated
# tweet the first piece is the "Wordle <id> k/6" header and the second piece
# is the emoji grid, so the second column is renamed to 'guess'.
tweet_parts = new_tweets_df['tweet_text'].str.split('\n\n', expand=True)
df_2 = tweet_parts.rename(columns={tweet_parts.columns[1]: 'guess'})

df_2.head(50)

In [None]:
# Split the emoji grid into one column per guess row.
# str.split(expand=True) only creates as many columns as the longest grid in
# this batch, so a batch with no 6-row game would otherwise have fewer than
# six columns and renaming by position would raise IndexError. Reindexing to
# exactly six positional columns pads the missing ones with NaN (and drops
# any stray extra lines after the sixth row).
# astype(object) keeps the padded columns usable with the .str accessor.
guesses_df = (
    df_2['guess']
    .str.split('\n', expand=True)
    .reindex(columns=range(6))
    .astype(object)
    .set_axis(['g1', 'g2', 'g3', 'g4', 'g5', 'g6'], axis=1)
)
guesses_df.head(50)

In [None]:
# Count the green tiles in each guess row (NaN where a game has no such row).
guess_columns = ['g1', 'g2', 'g3', 'g4', 'g5', 'g6']
green_counts = {col: guesses_df[col].str.count("🟩") for col in guess_columns}

# Build tweets_df from the validated tweets plus the six count columns.
# Previously tweets_df was assigned into without ever being created, which
# raises NameError on a fresh kernel. assign() returns a new frame, so
# new_tweets_df is left untouched and this cell can be re-run safely.
# The counts align on the index shared with new_tweets_df.
tweets_df = new_tweets_df.assign(**green_counts)
tweets_df.head(50)

## Load the previously pulled tweets

In [None]:
# Read the dataset accumulated by earlier runs, using pandas' Python parsing
# engine, then display it.
previous_df = pd.read_csv(filepath_or_buffer="input/tweets.csv", engine='python')
previous_df

## Append the new ones

In [None]:
# Stack the historical rows on top of the newly pulled ones.
combined_df = pd.concat([previous_df, tweets_df])

# A tweet seen in an earlier run keeps its first (historical) copy.
deduplicated_df = combined_df.drop_duplicates(subset="tweet_id")

# Order by puzzle, then by tweet, and renumber the rows from zero.
df = deduplicated_df.sort_values(["wordle_id", "tweet_id"]).reset_index(drop=True)
df

## Save the new version of the dataset

In [None]:
# Create the output directory if needed. exist_ok=True makes this a single
# call that succeeds when the directory is already there, instead of a
# separate check followed by mkdir.
os.makedirs("updated", exist_ok=True)

# Write the merged dataset without the positional index column.
df.to_csv("updated/tweets.csv", index=False)