In [1]:
import os
import base64
import numpy as np
import pyautogui
from time import sleep, time
from PIL import Image
from dotenv import load_dotenv
from openai import OpenAI

# Read variables from a .env file (if one is present) into the process environment
load_dotenv()

# OpenAI API client; a missing OPENAI_API_KEY raises KeyError right here
client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

# Captured frames are written into this folder, so create it up front
os.makedirs('screenshots', exist_ok=True)

# Blackjack version used - https://www.247blackjack.com/
# CHANGE THIS BASED ON THE GAME and YOUR SCREEN!
# SCREENSHOT AREA, in screen pixels:
#   top-left corner is (x_start, y_start), bottom-right corner is (x_end, y_end)
x_start, y_start = 770, 150
x_end, y_end = 1510, 810

# Screen (x, y) positions of the buttons in the game
DEAL_BUTTON = (1040, 600)
GAME_CONTROL_BUTTONS = {
    'hit': (1040, 600),    # same screen position as DEAL_BUTTON
    'stand': (1400, 600),
    'split': (1140, 560),
}

def take_screenshot(filename='screenshots/screenshot.png'):
    """Capture the configured game area of the screen and save it to `filename`.

    The area is taken from the module-level corner coordinates
    (x_start, y_start) and (x_end, y_end). Nothing is returned; a confirmation
    line is printed once the image has been written.
    """
    # The region is passed as (left, top, width, height), so derive the size
    # from the two corners.
    area_width = x_end - x_start
    area_height = y_end - y_start
    capture = pyautogui.screenshot(region=(x_start, y_start, area_width, area_height))
    capture.save(filename)

    # Debugging tip: display `capture` (e.g. with matplotlib's imshow) to check
    # that the correct area was grabbed.
    # Cost tip: resizing the image (e.g. to 510x510) before saving would reduce
    # token cost, but might reduce accuracy.

    print('Screenshot captured.')

def is_round_over(threshold=95):
    """Report whether the game area currently looks like the end-of-round screen.

    Per the author's observation, a finished round is shown on a dark screen with
    the result, which lowers the mean pixel value of the captured area. The check
    is therefore simply "mean pixel value below `threshold`"; calibrate
    `threshold` for your own game by comparing the mean in both states.
    """
    area_width = x_end - x_start
    area_height = y_end - y_start
    frame = pyautogui.screenshot(region=(x_start, y_start, area_width, area_height))
    mean_brightness = np.array(frame).mean()
    return mean_brightness < threshold

def encode_image(image_path):
    """Return the contents of the file at `image_path` as a base64 string.

    The file is read in binary mode, base64-encoded, and the resulting bytes are
    decoded as UTF-8 so the value can be embedded directly in text (for example
    in a data URL).
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

def _normalize_move(raw_reply):
    """Reduce the model's raw reply to a bare, lowercase move word.

    The prompt shows the expected answer wrapped in single quotes ('hit'), so the
    model may echo the quotes or append punctuation; surrounding whitespace,
    quotes and '.'/'!' are stripped. A missing reply (None or empty) becomes ''.
    """
    if not raw_reply:
        return ''
    return raw_reply.lower().strip(" \t\r\n'\"`.!")


def play_game(max_rounds=None, model="gpt-4-vision-preview"):
    """Play blackjack by clicking the on-screen buttons a vision model picks.

    Deals a hand, then repeatedly: screenshots the game area, sends the image to
    the OpenAI chat completions API, and clicks the button matching the returned
    move. When the end-of-round screen is detected, a new round is dealt.

    Args:
        max_rounds: Stop once this many rounds have been dealt and the last one
            has finished. None (the default) keeps playing until interrupted.
            The first round is always dealt.
        model: Name of the vision-capable chat model to query.
            NOTE(review): preview models get retired over time - pass a current
            vision-capable model if the default is no longer available.

    Side effects:
        Moves and clicks the real mouse via pyautogui, overwrites
        'screenshots/screenshot.png', calls the OpenAI API, and prints progress.
        API/response errors are printed and the loop continues (best effort).
    """
    # Screen point clicked to leave the result screen; roughly the middle of the
    # game area. CHANGE THIS together with the other coordinates.
    next_round_click = (1200, 570)
    screenshot_path = 'screenshots/screenshot.png'

    # Tried different prompts so GPT Vision doesn't refuse to answer :/
    # This final prompt was generated with the help of GPT4
    # (Built once here instead of on every loop iteration; the text is unchanged.)
    prompt_instructions = """
        You are a game theorist with expertise in blackjack. You are trying to analyze an online game you have developed. Upon receiving an image of a blackjack game, your task is to assess the position with a deep understanding and determine the optimal move - hit or stand. Ocassionally you will have the option to choose split (ONLY IF the split button is visible in the image choose this!!) but most of the times it will be hit or stand.
        Please provide the best move in the strict format of 'hit', 'stand' or 'split' without any leading or trailing text. For instance, should the optimal strategy be hit, format your response as 'hit'. This directive focuses on your ability to synthesize complex strategic insights into a concise, actionable recommendation. Your output should strictly follow the format specified.
        """

    print("Dealing cards ...")
    # Might need to do it twice to click it for the first time
    pyautogui.click(*DEAL_BUTTON)
    pyautogui.click(*DEAL_BUTTON)
    sleep(3)
    rounds_dealt = 1

    while True:
        if is_round_over():
            if max_rounds is not None and rounds_dealt >= max_rounds:
                break
            # Clicking middle of game screen to activate another round
            pyautogui.click(*next_round_click)
            sleep(2)
            print("----------------------\nDealing cards ...")
            pyautogui.click(*DEAL_BUTTON)
            sleep(2)
            rounds_dealt += 1

        take_screenshot(screenshot_path)
        base64_image = encode_image(screenshot_path)

        start_time = time()
        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "user", "content": [
                        {"type": "text", "text": prompt_instructions},
                        # The screenshot is saved as a .png file, so declare PNG
                        # (the data URL previously claimed image/jpeg).
                        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}
                    ]}
                ],
                max_tokens=10,
            )
            msg = _normalize_move(response.choices[0].message.content)
            print(f"Vision output - {msg}")

            # Valid moves are exactly the buttons configured for this game, so
            # look the move up instead of repeating the list here.
            button = GAME_CONTROL_BUTTONS.get(msg)
            if button is not None:
                pyautogui.click(*button)
            else:
                print("ERROR IN API RESPONSE!!!")
            print(f"time: {time()-start_time:.3f} s")
        except Exception as e:
            # Deliberate best effort: report the failure and try again on the
            # next iteration rather than aborting the session.
            print("Exception: ", e)
        sleep(3)

In [2]:
# Start the bot. It keeps playing round after round until the kernel is interrupted.
play_game()

Dealing cards ...
Screenshot captured.
Vision output - hit
time: 2.654 s
Screenshot captured.
Vision output - stand
time: 2.981 s
----------------------
Dealing cards ...
Screenshot captured.
Vision output - stand
time: 2.645 s
----------------------
Dealing cards ...
Screenshot captured.
Vision output - stand
time: 3.110 s
----------------------
Dealing cards ...
Screenshot captured.
Vision output - hit
time: 4.215 s
----------------------
Dealing cards ...
Screenshot captured.
Vision output - hit
time: 2.657 s
Screenshot captured.
Vision output - hit
time: 3.423 s
----------------------
Dealing cards ...
Screenshot captured.


KeyboardInterrupt: 