## YOLO

### Load Pretrained YOLO Model
* Model varies from `YOLO11n`, `YOLO11s`, `YOLO11m`, `YOLO11l`, `YOLO11x`. (nano, small, medium, large, xlarge.)
    - These are pretrained on COCO dataset, only for detecting 80 pre-trained classes
    - There are also models for segmentation, and pose detection.
* In this code notebook, we will going to also try the ,,Track'' mode, which is available for all detect, segment, and pose models.

In [None]:
from ultralytics import YOLO

detect_model = YOLO("models/yolo11n.pt")
pose_model = YOLO("models/yolo11n-pose.pt")

### Read your camera stream, track the objects as well as human pose

In [None]:
import numpy as np
import cv2 as cv
import copy

cap = cv.VideoCapture(0)
if not cap.isOpened():
    print("Cannot open camera")
    exit()

while True:
    # Capture frame-by-frame
    ret, frame = cap.read()

    # if frame is read correctly ret is True
    if not ret:
        print("Can't receive frame (stream end?). Exiting ...")
        break
    
    # Use below code if your face looks blue.
    # rgb = cv.cvtColor(frame, cv.COLOR_BGR2RGB)
    
    # Display the resulting frame
    det_res = detect_model.track(source=frame, show=False)[0]
    pos_res = pose_model.track(source=frame, show=False)[0]

    all_res = copy.deepcopy(det_res)
    all_res.keypoints = pos_res.keypoints

    res_img = all_res.plot()

    cv.imshow('result', res_img)
    k = cv.waitKey(1)

    if k == ord('q'):
        break
    
# When everything done, release the capture
cap.release()
cv.destroyAllWindows()


### Run below cell if the opencv widget forced to be closed

In [6]:
cap.release()

## OpenAI Whisper

In [None]:
# !pip install -U openai-whisper
# !pip install numpy==2.0
# !pip3 install pvrecorder

In [21]:
import whisper
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

model = whisper.load_model("models/tiny.pt", device=device)
model = model.to(torch.float32)

Using cpu device


In [22]:
import struct
import wave
from pvrecorder import PvRecorder
import time
import numpy as np

def listen(output_path='tmp.wav'):
    device_index = -1

    recorder = PvRecorder(frame_length=1024, device_index=device_index)
    recorder.start()

    wavfile = None

    if output_path is not None:
        wavfile = wave.open(output_path, "w")
        # noinspection PyTypeChecker
        wavfile.setparams((1, 2, recorder.sample_rate, recorder.frame_length, "NONE", "NONE"))

    st = time.time()
    print("=======Start Listening")
            
    while True:
        frame = recorder.read()
        if wavfile is not None:
            wavfile.writeframes(struct.pack("h" * len(frame), *frame))
        if time.time()-st > 10:
            print("=======Stopping Listening")
            break

    recorder.delete()
    if wavfile is not None:
        wavfile.close()

In [32]:
def understand(filename='tmp.wav'):
    audio = whisper.load_audio(filename)
    audio = whisper.pad_or_trim(audio).astype(np.float32)
    
    print(audio.dtype)

    mel = whisper.log_mel_spectrogram(audio).to(torch.float32)

    print(mel.dtype)
    print(next(model.parameters()).dtype)
    result = whisper.decode(model, mel, whisper.DecodingOptions(fp16=False))

    return result.text

In [33]:
#!pip3 install ollama

In [34]:
import ollama

ollama.pull('llama3.2:1b')

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [35]:
from utils.text_to_audio import log_say, listen

text = 'start'
while 'bye' not in text.lower():
    #listen(device_index=0)
    text = understand()
    print('=======Input:', text)
    response = ollama.chat(model='llama3.2:1b', messages=[
      {
        'role': 'user',
        'content': 'Please always answer to me in 50 words. INPUT: [' + text + ']',
      },
    ])
    print('=======Output:', response['message']['content'],)
    log_say(response['message']['content'], True)
    time.sleep(7)


float32
torch.float32
torch.float32
float32
torch.float32
torch.float32
float32
torch.float32
torch.float32


KeyboardInterrupt: 