In [None]:
#.conda (Python 3.9.6)
%pip install notebook ipykernel

%pip install portaudio
%pip install pyaudio
%pip install openai pyaudio numpy
%pip install python-dotenv
#%pip install opencv
#%pip install "urllib3<2"

%pip install opencv-python

%pip install keyboard
%pip install pynput

%pip install Pillow

In [None]:
# Install missing dependencies for local AI models
%pip install transformers
%pip install torch torchvision

In [None]:
# This script is used to display the promo images at the bottom left and right of the screen.
import cv2
import numpy as np

# Paths and captions
# The image paths are relative, so they are resolved against the kernel's
# current working directory when cv2.imread() is called below.
PROMO1_PATH = 'promo1.png'
PROMO2_PATH = 'promo2.png'
# Text passed to add_caption_below() for the corresponding image.
PROMO1_CAPTION = "AI & Storytelling"
PROMO2_CAPTION = "Code"

def add_caption_below(image, caption, font=cv2.FONT_HERSHEY_SIMPLEX, font_scale=0.7, thickness=2, pad=10):
    """Return `image` with a white strip underneath containing `caption`.

    Args:
        image: 3-channel uint8 array (rows, cols, 3), e.g. from
            cv2.imread(..., cv2.IMREAD_COLOR).
        caption: text to draw, horizontally centred, in black.
        font, font_scale, thickness: forwarded to cv2.getTextSize / cv2.putText.
        pad: vertical padding in pixels above and below the text; also used as
            the side padding when the image has to be widened.

    Returns:
        A new array: the image stacked on top of the caption strip. When the
        caption fits inside the image width the output width equals the image
        width. When the caption is wider, the image is centred on a white
        canvas of width ``text width + 2 * pad`` so the text is not clipped.
    """
    (text_w, text_h), _ = cv2.getTextSize(caption, font, font_scale, thickness)
    img_h, img_w = image.shape[:2]

    # A caption wider than the image used to get a negative x origin and was
    # cut off on both sides; widen the output instead.
    out_w = img_w if text_w <= img_w else text_w + 2 * pad
    if out_w > img_w:
        left = (out_w - img_w) // 2
        widened = np.full((img_h, out_w, 3), 255, dtype=np.uint8)
        widened[:, left:left + img_w] = image
        image = widened

    caption_img = np.full((text_h + 2 * pad, out_w, 3), 255, dtype=np.uint8)
    x = (out_w - text_w) // 2
    y = pad + text_h  # putText takes the bottom-left corner of the text
    cv2.putText(caption_img, caption, (x, y), font, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
    return np.vstack([image, caption_img])

# Load promo images.
# cv2.imread() returns None (it does not raise) when a file is missing or
# unreadable, so fail here with the offending path instead of later with an
# opaque "'NoneType' object has no attribute 'shape'".
promo1 = cv2.imread(PROMO1_PATH, cv2.IMREAD_COLOR)
promo2 = cv2.imread(PROMO2_PATH, cv2.IMREAD_COLOR)
for _promo_path, _promo_img in ((PROMO1_PATH, promo1), (PROMO2_PATH, promo2)):
    if _promo_img is None:
        raise FileNotFoundError(f"Could not read promo image: {_promo_path!r}")

# Resize QR codes to same height (e.g., 180px)
qr_height = 180
def resize_keep_aspect(img, height):
    """Scale `img` to `height` pixels tall while keeping its aspect ratio.

    Args:
        img: array whose first two shape entries are (rows, cols).
        height: target height in pixels.

    Returns:
        The result of cv2.resize(img, (new_width, height)).

    The computed width is clamped to at least 1 px: for a very tall, narrow
    image int(w * scale) truncates to 0, which cv2.resize rejects.
    """
    h, w = img.shape[:2]
    scale = height / h
    new_width = max(1, int(w * scale))
    return cv2.resize(img, (new_width, height))
# Bring both promo images to the shared QR height first ...
promo1, promo2 = (resize_keep_aspect(raw, qr_height) for raw in (promo1, promo2))

# ... then append each image's caption strip underneath it.
promo1, promo2 = (
    add_caption_below(sized, text)
    for sized, text in ((promo1, PROMO1_CAPTION), (promo2, PROMO2_CAPTION))
)

# Get screen size (using tkinter, works on Mac/Win/Linux).
# The Tk root is only needed for the two queries, so it is destroyed afterwards;
# otherwise a hidden Tk window stays alive for the rest of the kernel session.
# Any failure (tkinter missing, no display, ...) falls back to 1920x1080.
try:
    import tkinter as tk
    root = tk.Tk()
    try:
        root.withdraw()
        screen_width = root.winfo_screenwidth()
        screen_height = root.winfo_screenheight()
    finally:
        root.destroy()
except Exception:
    screen_width, screen_height = 1920, 1080

# Margin from the bottom and sides
BOTTOM_MARGIN = 50  # pixels
SIDE_MARGIN = 20    # pixels

# Measure the captioned images as (rows, cols)
(h1, w1), (h2, w2) = promo1.shape[:2], promo2.shape[:2]

# Top-left corner of each window: promo1 sits at the left edge, promo2 at the right
x1, y1 = SIDE_MARGIN, screen_height - h1 - BOTTOM_MARGIN
x2, y2 = screen_width - w2 - SIDE_MARGIN, screen_height - h2 - BOTTOM_MARGIN

# Show each promo in its own window, then move that window into place
for window_title, window_img, left, top in (
    ("Promo1", promo1, x1, y1),
    ("Promo2", promo2, x2, y2),
):
    cv2.imshow(window_title, window_img)
    cv2.moveWindow(window_title, left, top)

cv2.waitKey(1)

In [None]:
# Load and display the starter image
import sys
import os

# Import the function from utils.py
from utils import load_starter_image

# Call the helper from utils.py and report whether it succeeded
starter_messages = {
    True: "Starter image loaded successfully!",
    False: "Failed to load starter image. Make sure 'starter.png' exists in the root folder.",
}
success = load_starter_image()
print(starter_messages[bool(success)])

# Keep the window open
import cv2
cv2.waitKey(1)

In [None]:
# Force override cache location BEFORE importing aist
import os
import sys

# Point every cache-related environment variable at one directory.
# The same tuple drives both the assignment and the verification printout, so
# the two can no longer drift apart (HF_DATASETS_CACHE used to be set but was
# missing from the verification list).
new_cache_dir = os.path.expanduser("~/Documents/huggingface_cache")
CACHE_ENV_VARS = (
    'HF_HOME',
    'TRANSFORMERS_CACHE',
    'HF_DATASETS_CACHE',
    'HF_HUB_CACHE',
    'DIFFUSERS_CACHE',  # cache directory for diffusers specifically
)
for var in CACHE_ENV_VARS:
    os.environ[var] = new_cache_dir

# Create the directory
os.makedirs(new_cache_dir, exist_ok=True)
print(f"✓ Cache location set to: {new_cache_dir}")

# Verify the environment variables are set
print("Environment variables:")
for var in CACHE_ENV_VARS:
    print(f"  {var}: {os.environ.get(var, 'NOT SET')}")

In [None]:
#Installing local image generation models from the AI & storytelling course (which wrap Huggingface models).
#This is to provide a free alternative to OpenAI calls (generally faster if you have a GPU, reduced image 
# quality, but arguably more interesting).

#@title Install required packages
#@markdown Run this first so that we can configure the notebook to have our code available.
%pip install https://github.com/pkage/ai-storytelling-backstage/archive/main.zip#subdirectory=code/

from aist.common import is_gpu_available


# Report what aist's own GPU check says about this machine
gpu_negation = "" if is_gpu_available() else "NOT "
print(f'GPU is {gpu_negation}available on this instance.')

import os
from PIL import Image
from aist import image
#from google.colab import files
from aist.common import render_output_text
from IPython.display import display


def get_concat_h_blank(im1, im2, color=(0, 0, 0)):
    """Place `im1` and `im2` side by side on a new RGB canvas.

    The canvas is as wide as both images together and as tall as the taller
    one. Both images are pasted at the top, and `color` fills whatever area
    they leave uncovered.
    """
    canvas_size = (im1.width + im2.width, max(im1.height, im2.height))
    canvas = Image.new('RGB', canvas_size, color)
    for tile, x_offset in ((im1, 0), (im2, im1.width)):
        canvas.paste(tile, (x_offset, 0))
    return canvas

#import locale
#locale.getpreferredencoding = lambda: "UTF-8"
#locale.getpreferredencoding = lambda do_setlocale=False: "UTF-8"

In [None]:
# Configure PyTorch for Apple Silicon GPU
import torch

# Pick the compute device: Apple Silicon GPU (MPS) first, then CUDA, then CPU.
# Previously CUDA availability was printed but never used, so a CUDA machine
# silently ran on the CPU. The MPS backend is looked up defensively because
# older torch builds do not have `torch.backends.mps`.
_mps_backend = getattr(torch.backends, "mps", None)
_mps_available = _mps_backend is not None and _mps_backend.is_available()
_cuda_available = torch.cuda.is_available()

if _mps_available:
    device = torch.device("mps")
    print("✓ Apple Silicon GPU (MPS) is available!")
    print(f"Using device: {device}")
elif _cuda_available:
    device = torch.device("cuda")
    print("✓ MPS not available, using CUDA GPU")
else:
    device = torch.device("cpu")
    print("⚠ MPS not available, using CPU")

# Test GPU availability
print(f"CUDA available: {_cuda_available}")
print(f"MPS available: {_mps_available}")
print(f"Current device: {device}")

# Set the device globally for the session (torch.set_default_device needs torch >= 2.0)
if hasattr(torch, "set_default_device"):
    torch.set_default_device(device)
else:
    print("⚠ torch.set_default_device is unavailable in this torch version; pass `device` explicitly")

In [None]:
#Test local image generation

PROMPT = 'a dank cellar with multiple stalactites running away from gas materials' #@param {type: 'string'}

SEED     = 42   #@param {type: 'integer'}
ROUNDS   = 3   #@param {type: 'integer'}
HEIGHT   = 512  #@param {type: 'integer'}
WIDTH    = 512  #@param {type: 'integer'}

# Arguments shared by the accelerated attempt and the fallback call.
generation_options = dict(rounds=ROUNDS, dims=(WIDTH, HEIGHT), seed=SEED)

# First attempt asks for acceleration; if that raises anything, report it and
# repeat the identical request with accelerate=False.
try:
    result = image.stable_diffusion(PROMPT, accelerate=True, **generation_options)
    print("✓ Image generation completed with GPU acceleration!")
except Exception as gpu_error:
    print(f"⚠ GPU acceleration failed: {gpu_error}")
    print("Falling back to CPU...")
    result = image.stable_diffusion(PROMPT, accelerate=False, **generation_options)

In [None]:
# Render the generated image inline in the notebook
print("Displaying the generated image...")
display(result)

# Write a copy to disk (relative path, so it lands in the working directory)
result.save("test_generated_image.png")
print("Image saved as 'test_generated_image.png' in current directory")

# Report the image's size and mode
for label, value in (("size", result.size), ("mode", result.mode)):
    print(f"Image {label}: {value}")

In [None]:
# Test Display the local generated image in the same window as other generated images
import cv2
import numpy as np

# PIL image -> numpy array -> BGR channel order, then scale to 1536x1536
rgb_pixels = np.array(result)
opencv_image = cv2.resize(cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR), (1536, 1536))

# Show it in the "Generated Image" window and move that window to (50, 20)
window_name = "Generated Image"
cv2.imshow(window_name, opencv_image)
cv2.moveWindow(window_name, 50, 20)
cv2.waitKey(1)

print("Image displayed in 'Generated Image' window")

In [None]:
#Identify the microphone. Use the appropriate index for the microphone,
# by setting it as the value for the variable "DEVICE_INDEX" in the main 
# loop (last cell in notebook, or in the .py version).
import pyaudio

# Print the info dict of every audio device PyAudio can see. Each dict carries
# the device's 'index', which is the value to use for DEVICE_INDEX.
p = pyaudio.PyAudio()
try:
    for i in range(p.get_device_count()):
        print(p.get_device_info_by_index(i))
finally:
    # Release PortAudio; this cell only needed it for the listing.
    p.terminate()

In [None]:
# You will need to add your OpenAI API key (as MY_API_KEY) to the .env file, located in your AICAntSing folder.
import os

from dotenv import load_dotenv
load_dotenv()

def _describe_secret(secret):
    """Return a status string for `secret` that never contains the secret itself."""
    if not secret:
        return "NOT SET"
    return f"loaded ({len(secret)} characters)"

API_KEY = os.getenv("MY_API_KEY")
# Never print the key itself: notebook outputs are saved with the file and are
# easily shared or committed by accident.
print(f"MY_API_KEY: {_describe_secret(API_KEY)}")

In [None]:
#Test the API key.
import openai
# One-off chat completion to confirm that the key loaded above is accepted
client = openai.OpenAI(api_key=API_KEY)

test_messages = [
    {"role": "user", "content": "this is a test request, write a short poem"},
]
response = client.chat.completions.create(
    model="gpt-3.5-turbo",  # model to use from Models Tab
    messages=test_messages,
)

print(response)

In [None]:
import sys
# Print the path of the Python interpreter this kernel is running on
# (presumably to check that it is the environment the packages were installed into).
print(sys.executable)

In [None]:
# [Deprecated] This code is used to display the promo images on the screen. They show up near the top left together.
import cv2
import numpy as np

# Paths and captions
# The image paths are relative, so they are resolved against the kernel's
# current working directory when cv2.imread() is called below.
PROMO1_PATH = 'promo1.png'
PROMO2_PATH = 'promo2.png'
# Text passed to add_caption_below() for the corresponding image.
PROMO1_CAPTION = "AI & Storytelling"
PROMO2_CAPTION = "Code"

def add_caption_below(image, caption, font=cv2.FONT_HERSHEY_SIMPLEX, font_scale=0.7, thickness=2, pad=10):
    """Return `image` with a white strip underneath containing `caption`.

    Args:
        image: 3-channel uint8 array (rows, cols, 3), e.g. from
            cv2.imread(..., cv2.IMREAD_COLOR).
        caption: text to draw, horizontally centred, in black.
        font, font_scale, thickness: forwarded to cv2.getTextSize / cv2.putText.
        pad: vertical padding in pixels above and below the text; also used as
            the side padding when the image has to be widened.

    Returns:
        A new array: the image stacked on top of the caption strip. When the
        caption fits inside the image width the output width equals the image
        width. When the caption is wider, the image is centred on a white
        canvas of width ``text width + 2 * pad`` so the text is not clipped.
    """
    (text_w, text_h), _ = cv2.getTextSize(caption, font, font_scale, thickness)
    img_h, img_w = image.shape[:2]

    # A caption wider than the image used to get a negative x origin and was
    # cut off on both sides; widen the output instead.
    out_w = img_w if text_w <= img_w else text_w + 2 * pad
    if out_w > img_w:
        left = (out_w - img_w) // 2
        widened = np.full((img_h, out_w, 3), 255, dtype=np.uint8)
        widened[:, left:left + img_w] = image
        image = widened

    caption_img = np.full((text_h + 2 * pad, out_w, 3), 255, dtype=np.uint8)
    x = (out_w - text_w) // 2
    y = pad + text_h  # putText takes the bottom-left corner of the text
    cv2.putText(caption_img, caption, (x, y), font, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
    return np.vstack([image, caption_img])

# Load promo images.
# cv2.imread() returns None (it does not raise) when a file is missing or
# unreadable, so fail here with the offending path instead of later with an
# opaque "'NoneType' object has no attribute 'shape'".
promo1 = cv2.imread(PROMO1_PATH, cv2.IMREAD_COLOR)
promo2 = cv2.imread(PROMO2_PATH, cv2.IMREAD_COLOR)
for _promo_path, _promo_img in ((PROMO1_PATH, promo1), (PROMO2_PATH, promo2)):
    if _promo_img is None:
        raise FileNotFoundError(f"Could not read promo image: {_promo_path!r}")

# Resize QR codes to same height (e.g., 180px)
qr_height = 180
def resize_keep_aspect(img, height):
    """Scale `img` to `height` pixels tall while keeping its aspect ratio.

    Args:
        img: array whose first two shape entries are (rows, cols).
        height: target height in pixels.

    Returns:
        The result of cv2.resize(img, (new_width, height)).

    The computed width is clamped to at least 1 px: for a very tall, narrow
    image int(w * scale) truncates to 0, which cv2.resize rejects.
    """
    h, w = img.shape[:2]
    scale = height / h
    new_width = max(1, int(w * scale))
    return cv2.resize(img, (new_width, height))
# Bring both promo images to the shared QR height first ...
promo1, promo2 = (resize_keep_aspect(raw, qr_height) for raw in (promo1, promo2))

# ... then append each image's caption strip underneath it.
promo1, promo2 = (
    add_caption_below(sized, text)
    for sized, text in ((promo1, PROMO1_CAPTION), (promo2, PROMO2_CAPTION))
)

# np.hstack needs equal heights: when they differ, extend the bottom of each
# image with white rows up to the taller height (a zero-row border for the taller one).
(h1, w1), (h2, w2) = promo1.shape[:2], promo2.shape[:2]
if h1 != h2:
    maxh = max(h1, h2)
    promo1, promo2 = (
        cv2.copyMakeBorder(img, 0, maxh - img_height, 0, 0, cv2.BORDER_CONSTANT, value=[255, 255, 255])
        for img, img_height in ((promo1, h1), (promo2, h2))
    )

# Join the two captioned images left-to-right into a single image
promo_img = np.hstack([promo1, promo2])

# Get screen size (using tkinter, works on Mac/Win/Linux).
# The Tk root is only needed for the two queries, so it is destroyed afterwards;
# otherwise a hidden Tk window stays alive for the rest of the kernel session.
# Any failure (tkinter missing, no display, ...) falls back to 1920x1080.
try:
    import tkinter as tk
    root = tk.Tk()
    try:
        root.withdraw()
        screen_width = root.winfo_screenwidth()
        screen_height = root.winfo_screenheight()
    finally:
        root.destroy()
except Exception:
    screen_width, screen_height = 1920, 1080

# Gap to leave between the window and the bottom of the screen (adjust as needed)
BOTTOM_MARGIN = 30  # pixels

# Window position: flush with the left edge, BOTTOM_MARGIN above the bottom,
# but never above the top of the screen
img_h, img_w = promo_img.shape[:2]
x = 0  # left edge
y = max(0, screen_height - img_h - BOTTOM_MARGIN)

# Show the combined promo image and move its window into place
promo_window = "Promo"
cv2.imshow(promo_window, promo_img)
cv2.moveWindow(promo_window, x, y)
cv2.waitKey(1)

In [None]:
import openai
import pyaudio
import wave
import numpy as np
import time
import keyboard
import threading
from pynput import keyboard as pynput_keyboard
#import multiprocessing

import sys
import os

from dotenv import load_dotenv
load_dotenv()

# Forwarded to generate_image() as `use_local_model`.
USE_LOCAL_MODEL = False

# Optional context that is prepended to every transcript before image generation.
artist_song_info = input('Enter the artist and song or relevant details (leave blank to skip). then press Enter: ').strip()

# Make utils.py importable.
# The previous code called os.path.abspath("ABS_PATH") -- the string literal,
# not the variable -- so MY_ABS_PATH was ignored and only the current working
# directory was ever appended. Both locations are added now, so existing setups
# that relied on the working directory keep working.
# NOTE(review): SOURCE does not show whether MY_ABS_PATH names a file or a
# directory; a directory is used as-is, anything else contributes its parent.
ABS_PATH = os.getenv("MY_ABS_PATH")
_utils_dirs = []
if ABS_PATH:
    _abs_path = os.path.abspath(ABS_PATH)
    _utils_dirs.append(_abs_path if os.path.isdir(_abs_path) else os.path.dirname(_abs_path))
_utils_dirs.append(os.getcwd())
for _utils_dir in _utils_dirs:
    if _utils_dir not in sys.path:
        sys.path.append(_utils_dir)
from utils import generate_image  # Importing the function from a separate file

API_KEY = os.getenv("MY_API_KEY")

# Constants
# Recording parameters, passed to PyAudio.open() in record_audio() and written
# into the WAV header there.
FORMAT = pyaudio.paInt16  # sample format handed to PyAudio
CHANNELS = 1              # number of audio channels
RATE = 16000              # passed as `rate` and used as the WAV frame rate
CHUNK = 1024              # frames per buffer and frames per stream.read() call
DEVICE_INDEX = 2  # Change this to the desired input device index

# Each record_audio() call overwrites this file; transcribe_audio() reads it back.
AUDIO_FILE = os.path.expanduser("~/Desktop/output.wav")  # Temporary file for transcription. Save to Desktop
#AUDIO_FILE = "temp_audio.wav"  # Temporary file for transcription

# Global pause flag: set while paused. A freshly created Event starts out
# cleared, i.e. "not paused".
pause_flag = threading.Event()

def on_press(key):
    """Key-press callback: F8 sets the pause flag, F9 clears it if it is set.

    All other keys are ignored. Any exception raised while handling a key is
    swallowed, so the function never propagates an error to its caller.
    """
    try:
        hotkeys = pynput_keyboard.Key
        if key == hotkeys.f8:
            pause_flag.set()
            print("\n--- PAUSED --- (Press F9 to resume)")
            return
        if key == hotkeys.f9 and pause_flag.is_set():
            pause_flag.clear()
            print("Resumed.")
    except Exception:
        pass

# Start listener in background.
# Re-running this cell used to stack a new listener thread on top of the one
# from the previous run, so each F8/F9 press was handled (and printed) once per
# run. Stop any listener left over in the kernel namespace first.
_previous_listener = globals().get("listener")
if _previous_listener is not None:
    _previous_listener.stop()

listener = pynput_keyboard.Listener(on_press=on_press)
listener.daemon = True
listener.start()

# Function to capture audio from the microphone
def record_audio(duration=3):
    """Record from the configured input device and save the audio to AUDIO_FILE.

    Args:
        duration: recording length in seconds. Audio is read in CHUNK-frame
            blocks, so int(RATE / CHUNK * duration) blocks are captured.

    Side effects:
        Overwrites AUDIO_FILE with a WAV file (CHANNELS, RATE, sample width of
        FORMAT) and prints progress messages.

    The stream and the PyAudio instance are released even if opening or
    reading fails. Input overflows are tolerated (exception_on_overflow=False)
    so a late read drops samples instead of aborting the caller's loop.
    """
    audio = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = audio.get_sample_size(FORMAT)
        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True,
                            frames_per_buffer=CHUNK,
                            input_device_index=DEVICE_INDEX)
        try:
            print("Recording...")
            chunk_count = int(RATE / CHUNK * duration)
            frames = [
                stream.read(CHUNK, exception_on_overflow=False)
                for _ in range(chunk_count)
            ]
            print("Recording stopped.")
        finally:
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()

    with wave.open(AUDIO_FILE, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))

# Function to transcribe recorded audio
def transcribe_audio():
    """Send AUDIO_FILE to the "whisper-1" transcription endpoint and return the text.

    A new OpenAI client is built from API_KEY on every call; the file is opened
    in binary mode only for the duration of the request.
    """
    whisper_client = openai.OpenAI(api_key=API_KEY)  # Add your API key here

    with open(AUDIO_FILE, "rb") as recording:
        response = whisper_client.audio.transcriptions.create(
            model="whisper-1",
            file=recording,
        )

    # The transcription lives in the response's `text` attribute
    return response.text

# Function to check for meaningful content and run the image generator
def check_and_spawn_process(transcript, function_to_run):
    """Call `function_to_run` when `transcript` contains enough real text.

    A transcript counts as meaningful when it has at least 4 alphanumeric
    characters. Counting alphanumerics (instead of stripping only ASCII
    punctuation and the space character) means newlines, tabs, non-ASCII
    punctuation and symbols such as "♪" no longer count as content.

    Args:
        transcript: transcribed text; None or "" is treated as no content.
        function_to_run: callable invoked as
            function_to_run(prompt, API_KEY, use_local_model=USE_LOCAL_MODEL).
            Despite this function's name the call is made synchronously in the
            current thread; no process is spawned here.

    Returns:
        True if `function_to_run` was called, otherwise False.
    """
    text = transcript or ""
    meaningful_chars = sum(1 for ch in text if ch.isalnum())
    if meaningful_chars < 4:
        return False

    print("Meaningful content detected! Generating image...")
    # Use the full transcript as the sentence
    sentence = text.strip()
    # Prepend artist/song info if provided
    prompt = f"{artist_song_info}: {sentence}" if artist_song_info else sentence
    function_to_run(prompt, API_KEY, use_local_model=USE_LOCAL_MODEL)
    return True


# Main function to handle live transcription
# Module-level flag: set by main() after a pause ends, so that the artist/song
# prompt is shown once at the end of that loop iteration.
needs_prompt = False

def main():
    """Run the record -> transcribe -> generate loop until interrupted.

    Each iteration:
      1. If pause_flag is set, poll every 0.1 s until it is cleared, then mark
         that the artist/song prompt should be shown.
      2. Record 3 seconds of audio, transcribe it and print the transcript.
      3. Pass the transcript to check_and_spawn_process() with generate_image.
      4. If a pause just ended, ask for new artist/song info; a blank answer
         keeps the current value.

    The pause flag is only checked at the top of an iteration, so a pause
    requested mid-iteration takes effect after that iteration finishes.
    The loop has no exit condition and no error handling: an exception from
    any step propagates out of main().
    """
    global artist_song_info, needs_prompt
    while True:
        if pause_flag.is_set():
            # Wait until unpaused
            while pause_flag.is_set():
                time.sleep(0.1)
            # Set flag to prompt after current operation
            needs_prompt = True

        record_audio(duration=3)
        transcript = transcribe_audio()
        print(f"Transcription: {transcript}")
        check_and_spawn_process(transcript, generate_image)
        # NOTE(review): presumably lets OpenCV process window events so the
        # image windows refresh -- confirm against utils.generate_image.
        import cv2
        cv2.waitKey(1)
        time.sleep(0.3)

        # Prompt for new info if needed
        if needs_prompt:
            new_info = input("Enter new artist and song info (leave blank to keep current): ").strip()
            if new_info:
                artist_song_info = new_info
            needs_prompt = False

# Start the loop only when this module is the entry point.
if __name__ == "__main__":
    main()
