In [1]:
import os

import cv2
import pytesseract
import openai

from openai_utils.src.openai_utils import (
    set_openai_api_key,
    get_text_from_openai_chat_completion,
    get_openai_chat_completion,
)

In [2]:
# Set the OpenAI API key through the project helper.
# NOTE(review): where the helper reads the key from is not visible here — see openai_utils.
set_openai_api_key(openai)
# Path to the Tesseract executable. Export TESSERACT_CMD to point at your own
# install; otherwise the Homebrew location this notebook was written against is used.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    "TESSERACT_CMD", "/opt/homebrew/bin/tesseract"
)

In [3]:
# Few-shot prompt template for the chat model: it states the target table
# columns, shows one worked example (raw OCR text and the expected table row),
# and ends with an {input} slot for the OCR text to be categorised.
# The template is filled with str.format, so any literal curly braces added to
# it later must be doubled ({{ and }}).
PROMPT = '''
I have taken a screenshot of a description of a song that I'm listening to. I already used OCR to parse the text from the image. Now I would like you to categorize that text into a particular format.

I am trying to capture the following information:
| Mental state | Activity | Genre | Track title | Neural effect level | Musical complexity | Mood | Instrumentation |

Here is a sample input from a previous screenshot:
=====BEGIN SAMPLE=====
7 = full capacity

a. ELECTRONIC - HIGH NEURAL EFFECT

Sw ODA

track Information similar tracks

mental state activity
focus deep work

musical complexity neural effect level

= medium = high

mood
chill - upbeat

instrumentation
electronic percussion + arp synth - arp synth bass
=====END SAMPLE=====

Here is the correct output for the sample input:
=====BEGIN OUTPUT=====
| Focus | Deep work | Electronic | Full capacity | High | Medium | Chill - upbeat | Electronic percussion - Arp synth - Arp synth bass |
=====END OUTPUT=====

Please categorize the following text into the correct format. You can use the sample input as a guide. If you have any questions, please ask. Thank you!

=====BEGIN INPUT=====
{input}
=====END INPUT=====
'''

In [4]:
def get_text_from_screenshot(filename: str) -> str:
    """Run OCR on a screenshot and return the recognised text.

    The image is converted to grayscale, binarised with Otsu's threshold,
    cleaned with a 3x3 morphological close, and then handed to Tesseract.

    Args:
        filename: Path to the image file; a leading ``~`` is expanded.

    Returns:
        The text Tesseract extracted from the pre-processed image.

    Raises:
        FileNotFoundError: If ``filename`` does not point to an existing file.
        ValueError: If OpenCV cannot decode the file as an image.
    """
    # Use the argument, not a notebook-level global, so the function works on
    # a fresh kernel and for any path the caller passes.
    path = os.path.expanduser(filename)
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Screenshot not found: {path}")

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(path)
    if image is None:
        raise ValueError(f"Could not decode image: {path}")

    # Convert image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Binarise; with THRESH_OTSU the threshold value 0 is ignored and the
    # level is chosen automatically. [1] selects the image from (retval, image).
    threshold = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # Morphological close (dilation followed by erosion) to remove small noise
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    clean_image = cv2.morphologyEx(threshold, cv2.MORPH_CLOSE, kernel)

    # Use Tesseract to extract text from the cleaned image
    text: str = pytesseract.image_to_string(clean_image)

    return text

In [5]:
# Screenshot to run OCR on. This is a relative path, so it presumably resolves
# against the kernel's working directory — confirm when running from elsewhere.
image_path = "data/computer_text.jpg"

In [6]:
# OCR the screenshot, then drop the extracted text into the {input} slot of
# the prompt template.
input_text = get_text_from_screenshot(image_path)
prompt_with_input = PROMPT.format(input=input_text)

In [7]:
# Send the filled-in prompt through the project's chat-completion helper.
# The bare name on the last line makes the notebook display the raw response.
completion = get_openai_chat_completion(prompt_with_input)
completion

<OpenAIObject chat.completion id=chatcmpl-78EtIVJEWvVQBO6nA4hrqEFmm2cXe at 0x160f53f10> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "| Focus | Deep work | Lo-fi | Topaz | High | High | Driving + Chill - Hopeful - Downtempo | Electronic percussion - Synth bass - Chimes/Bells - Organic percussion - Electric keys - Processed vocals |",
        "role": "assistant"
      }
    }
  ],
  "created": 1682198220,
  "id": "chatcmpl-78EtIVJEWvVQBO6nA4hrqEFmm2cXe",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 49,
    "prompt_tokens": 384,
    "total_tokens": 433
  }
}

In [8]:
# Pull just the assistant's message text out of the response object; the
# result is shown as this cell's output.
get_text_from_openai_chat_completion(completion)

'| Focus | Deep work | Lo-fi | Topaz | High | High | Driving + Chill - Hopeful - Downtempo | Electronic percussion - Synth bass - Chimes/Bells - Organic percussion - Electric keys - Processed vocals |'