Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.
Recording...


In [1]:
import cv2
import tensorflow as tf
import numpy as np
import librosa
import sounddevice as sd
from pydub import AudioSegment

class ObjectDetectionModule:
    """Wrapper around an OpenCV DNN detection model and its class-name list.

    The network is read with ``cv2.dnn.readNet`` and wrapped in a
    ``cv2.dnn_DetectionModel`` configured for 320x320 inputs scaled by 1/255.
    """

    # Colour tuple passed to both the label and the rectangle drawing calls.
    _DRAW_COLOR = (200, 0, 50)
    # Lowest y coordinate used for a label, so that the text of a box touching
    # the top edge of the frame is not drawn at a negative (invisible) position.
    _MIN_LABEL_Y = 20

    def __init__(self, weights_path, cfg_path, classes_path):
        """Load the network and the class names.

        Args:
            weights_path: Path to the network weights file.
            cfg_path: Path to the network configuration file.
            classes_path: Text file with one class name per line.
        """
        self.net = cv2.dnn.readNet(weights_path, cfg_path)
        self.model = cv2.dnn_DetectionModel(self.net)
        self.model.setInputParams(size=(320, 320), scale=1/255)

        with open(classes_path) as file_object:
            self.classes = [class_name.strip() for class_name in file_object]

    def detect_objects(self, frame):
        """Run the detection model on ``frame`` and return its raw result.

        Returns whatever ``self.model.detect(frame)`` returns; callers in this
        file unpack it as ``(class_ids, scores, bboxes)``.
        """
        return self.model.detect(frame)

    def draw_objects(self, frame, class_ids, scores, bboxes):
        """Draw a labelled rectangle on ``frame`` for every detection.

        Args:
            frame: Image that is drawn on in place.
            class_ids: Per-detection class ids; plain ints or one-element
                arrays are both accepted.
            scores: Per-detection scores (only used to keep the three
                sequences aligned; the value is not drawn).
            bboxes: Per-detection ``(x, y, w, h)`` boxes.
        """
        for class_id, _score, bbox in zip(class_ids, scores, bboxes):
            # Plain ints keep the drawing calls independent of the array
            # dtype the detector happened to return.
            x, y, w, h = (int(value) for value in bbox)
            class_name = self._class_name(class_id)
            label_y = max(y - 10, self._MIN_LABEL_Y)
            cv2.putText(frame, class_name, (x, label_y), cv2.FONT_HERSHEY_PLAIN, 2, self._DRAW_COLOR, 2)
            cv2.rectangle(frame, (x, y), (x + w, y + h), self._DRAW_COLOR, 3)

    def _class_name(self, class_id):
        """Map a class id (int or one-element array) to its display name."""
        # NOTE(review): some OpenCV versions appear to return class ids as an
        # (N, 1) array, which makes each id a one-element array that cannot be
        # used as a list index directly -- flatten before indexing.
        index = int(np.asarray(class_id).reshape(-1)[0])
        if 0 <= index < len(self.classes):
            return self.classes[index]
        # The classes file does not cover this id; keep drawing instead of
        # aborting the whole frame with an IndexError.
        return f"class {index}"

class AudioRecognitionModule:
    """Audio classifier backed by a TensorFlow Lite interpreter.

    The first input tensor of the model receives the waveform and the first
    output tensor is read back as the class scores.
    """

    def __init__(self, model_path, labels_path):
        """Create the interpreter and load the label list.

        Args:
            model_path: Path to the ``.tflite`` model file.
            labels_path: Text file with one label per line, in the same order
                as the model's score columns.
        """
        self.interpreter = tf.lite.Interpreter(model_path)
        self.input_details = self.interpreter.get_input_details()
        self.waveform_input_index = self.input_details[0]['index']
        self.output_details = self.interpreter.get_output_details()
        self.scores_output_index = self.output_details[0]['index']

        # Context manager so the labels file handle is closed deterministically.
        with open(labels_path) as labels_file:
            self.labels = [label.strip() for label in labels_file]

    def classify_audio(self, audio, orig_sr=44100, target_sr=16000):
        """Return the label with the highest score for a mono waveform.

        Args:
            audio: 1-D array of samples. Integer samples are scaled by the
                maximum of their dtype; floating-point samples are used as is.
            orig_sr: Sample rate of ``audio`` in Hz.
            target_sr: Sample rate the waveform is resampled to before it is
                fed to the interpreter.

        Returns:
            The entry of ``self.labels`` whose score, averaged over all score
            rows returned by the interpreter, is highest.

        Raises:
            ValueError: If ``audio`` contains no samples.
        """
        waveform = self._to_float_waveform(audio)
        if waveform.size == 0:
            raise ValueError("audio is empty; nothing to classify")

        y_resampled = librosa.resample(waveform, orig_sr=orig_sr, target_sr=target_sr)
        # The interpreter input is fed as float32 regardless of what the
        # resampling/normalisation steps returned.
        y_norm = np.asarray(librosa.util.normalize(y_resampled), dtype=np.float32)

        # The input length varies per recording, so the tensor is resized and
        # re-allocated on every call.
        self.interpreter.resize_tensor_input(self.waveform_input_index, [y_norm.size], strict=False)
        self.interpreter.allocate_tensors()
        self.interpreter.set_tensor(self.waveform_input_index, y_norm)
        self.interpreter.invoke()

        scores = np.asarray(self.interpreter.get_tensor(self.scores_output_index))
        # The scores may hold one row per analysis frame. A bare argmax() over
        # such a 2-D array yields an index into the flattened array, which is
        # not a class index; average over frames first, then pick the class.
        class_scores = scores.reshape(-1, scores.shape[-1]).mean(axis=0)
        top_class_index = int(class_scores.argmax())

        return self.labels[top_class_index]

    @staticmethod
    def _to_float_waveform(audio):
        """Convert integer PCM or floating-point samples to a float32 array."""
        samples = np.asarray(audio)
        if np.issubdtype(samples.dtype, np.integer):
            return samples.astype(np.float32) / np.iinfo(samples.dtype).max
        # np.iinfo() rejects floating dtypes, so float input is only cast.
        return samples.astype(np.float32)

def main(trigger_class="desired_class", duration=15, camera_index=0):
    """Record audio once, classify it, then run live object detection.

    Args:
        trigger_class: Audio label that enables the green "tracked target"
            box. The default is the placeholder the original code compared
            against; pass a real label (e.g. one found in the labels file)
            to enable tracking.
        duration: Length of the audio recording in seconds.
        camera_index: Index passed to ``cv2.VideoCapture``.

    The loop ends when a frame cannot be read or when ``q`` is pressed in the
    preview window. The camera and the window are released on every exit
    path, including an interrupted kernel.
    """
    object_detection_module = ObjectDetectionModule(
        r"C:\Users\USER\Downloads\PCL\Object Detection\dnn_model\yolov4-tiny.weights",
        r"C:\Users\USER\Downloads\PCL\Object Detection\dnn_model\yolov4-tiny.cfg",
        r"C:\Users\USER\Downloads\PCL\Object Detection\dnn_model\classes.txt"
    )
    audio_recognition_module = AudioRecognitionModule(
        r"C:\Users\USER\Downloads\PCL\Speech Rec\lite-model_yamnet_classification_tflite_1.tflite",
        r'C:\Users\USER\Downloads/pcl\Speech Rec\labels.txt'
    )

    print("Listening...")
    # Kept at 44100 Hz because classify_audio() is called without an explicit
    # source sample rate below.
    sr = 44100
    audio = sd.rec(int(sr * duration), samplerate=sr, channels=1, dtype=np.int16)
    sd.wait()  # block until the recording is complete
    print("Processing audio...")

    # sd.rec was asked for one channel; flatten to a 1-D waveform.
    audio_class = audio_recognition_module.classify_audio(audio.flatten())
    print(f"Classified audio as: {audio_class}")

    track_target = audio_class == trigger_class
    bbox = None  # last known box of the tracked target, if any

    cap = cv2.VideoCapture(camera_index)
    try:
        while True:
            # Get frame from the video stream
            ret, frame = cap.read()

            # Check if the frame is valid
            if not ret or frame is None:
                print("Error capturing frame")
                break

            # Detect once per frame, before anything is drawn on it, and reuse
            # the result for both tracking and drawing.
            class_ids, scores, bboxes = object_detection_module.detect_objects(frame)

            # When the audio matched, follow the first detection; the last
            # known box is kept on frames without detections.
            if track_target and len(bboxes) > 0:
                bbox = tuple(map(int, bboxes[0]))

            object_detection_module.draw_objects(frame, class_ids, scores, bboxes)

            # Draw the tracked box last so the per-detection rectangles do
            # not paint over it.
            if bbox is not None:
                x, y, w, h = bbox
                cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

            cv2.imshow("Frame", frame)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()

# Entry point: run the demo only when this code is executed as the main module.
if __name__ == "__main__":
    main()




Listening...
Processing audio...
Classified audio as: Speech


KeyboardInterrupt: 

In [3]:
# import cv2
# import tensorflow as tf
# import numpy as np
# import librosa
# import sounddevice as sd
# from pydub import AudioSegment
# from vimeo import VimeoClient
# from vimeo_downloader import Vimeo
# import urllib.parse

# class ObjectDetectionModule:
#     def __init__(self, weights_path, cfg_path, classes_path):
#         self.net = cv2.dnn.readNet(weights_path, cfg_path)
#         self.model = cv2.dnn_DetectionModel(self.net)
#         self.model.setInputParams(size=(320, 320), scale=1/255)

#         with open(classes_path) as file_object:
#             self.classes = [class_name.strip() for class_name in file_object.readlines()]

#     def detect_objects(self, frame):
#         return self.model.detect(frame)

#     def draw_objects(self, frame, class_ids, scores, bboxes):
#         for class_id, score, bbox in zip(class_ids, scores, bboxes):
#             x, y, w, h = bbox
#             class_name = self.classes[class_id]
#             cv2.putText(frame, class_name, (x, y - 10), cv2.FONT_HERSHEY_PLAIN, 2, (200, 0, 50), 2)
#             cv2.rectangle(frame, (x, y), (x + w, y + h), (200, 0, 50), 3)


# class AudioRecognitionModule:
#     def __init__(self, model_path, labels_path):
#         self.interpreter = tf.lite.Interpreter(model_path)
#         self.input_details = self.interpreter.get_input_details()
#         self.waveform_input_index = self.input_details[0]['index']
#         self.output_details = self.interpreter.get_output_details()
#         self.scores_output_index = self.output_details[0]['index']

#         self.labels = [label.strip() for label in open(labels_path).readlines()]

#     def classify_audio(self, audio):
#         audio_float = audio.astype(np.float32) / np.iinfo(audio.dtype).max
#         y_resampled = librosa.resample(audio_float, orig_sr=44100, target_sr=16000)
#         y_norm = librosa.util.normalize(y_resampled)

#         self.interpreter.resize_tensor_input(self.waveform_input_index, [y_norm.size], strict=False)
#         self.interpreter.allocate_tensors()
#         self.interpreter.set_tensor(self.waveform_input_index, y_norm)
#         self.interpreter.invoke()

#         scores = self.interpreter.get_tensor(self.scores_output_index)
#         top_class_index = scores.argmax()

#         return self.labels[top_class_index]

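# def authenticate_vimeo():
#     # SECURITY: credentials must never be written into the notebook. Read them
#     # from the environment (requires `import os`). The values that used to be
#     # here have been exposed and should be revoked and regenerated.
#     # NOTE: `vimeo_downloader.Vimeo` does not accept these keyword arguments
#     # (this cell failed with "TypeError: __init__() got an unexpected keyword
#     # argument 'client_id'"). The PyVimeo client is constructed as
#     # `VimeoClient(token=..., key=..., secret=...)` instead.
#     vimeo = Vimeo(
#         client_id=os.environ['VIMEO_CLIENT_ID'],
#         client_secret=os.environ['VIMEO_CLIENT_SECRET'],
#         access_token=os.environ['VIMEO_ACCESS_TOKEN']
#     )
#     return vimeo

# # def authenticate_vimeo():
# #     vimeo = Vimeo()
# #     vimeo.login(os.environ['VIMEO_EMAIL'], os.environ['VIMEO_PASSWORD'])
# #     return vimeo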

# def upload_to_vimeo(client, video_path, title, description):
#     with open(video_path, 'rb') as file:
#         video_uri = client.upload_video(file, name=title, description=description)
#     print(f"Video uploaded! Video URI: {video_uri}")


# def main():
#     object_detection_module = ObjectDetectionModule(
#         r"C:\Users\USER\Downloads\PCL\Object Detection\dnn_model\yolov4-tiny.weights",
#         r"C:\Users\USER\Downloads\PCL\Object Detection\dnn_model\yolov4-tiny.cfg",
#         r"C:\Users\USER\Downloads\PCL\Object Detection\dnn_model\classes.txt"
#     )
#     audio_recognition_module = AudioRecognitionModule(
#         r"C:\Users\USER\Downloads\PCL\Speech Rec\lite-model_yamnet_classification_tflite_1.tflite",
#         r'C:\Users\USER\Downloads/pcl\Speech Rec\labels.txt'
#     )
#     vimeo = authenticate_vimeo()

#     # Record audio only once
#     print("Listening...")
#     duration = 15
#     sr = 44100
#     audio = sd.rec(int(sr * duration), samplerate=sr, channels=1, dtype=np.int16)
#     sd.wait()
#     print("Processing audio...")

#     # Classify the recorded audio
#     audio_class = audio_recognition_module.classify_audio(audio.flatten())
#     print(f"Classified audio as: {audio_class}")

#     # Initialize video capture
#     cap = cv2.VideoCapture(0)

#     tracking = False
#     bbox = None

#     while True:
#         # Get frame from the video stream
#         ret, frame = cap.read()

#         # Check if the frame is valid
#         if not ret or frame is None:
#             print("Error capturing frame")
#             break

#         # If the audio is classified as something specific, trigger object detection
#         if audio_class == "desired_class" and not tracking:
#             class_ids, scores, bboxes = object_detection_module.detect_objects(frame)
#             if len(bboxes) > 0:
#                 bbox = tuple(map(int, bboxes[0]))  # Assume the first detection as the target
#                 tracking = True

#         # Update the bounding box if tracking
#         if tracking:
#             class_ids, scores, bboxes = object_detection_module.detect_objects(frame)
#             if len(bboxes) > 0:
#                 bbox = tuple(map(int, bboxes[0]))

#         # Draw the bounding box
#         if bbox is not None:
#             x, y, w, h = bbox
#             cv2.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 2)

#         # Object detection and tracking
#         (class_ids, scores, bboxes) = object_detection_module.detect_objects(frame)
#         object_detection_module.draw_objects(frame, class_ids, scores, bboxes)

#         cv2.imshow("Frame", frame)
#         if cv2.waitKey(1) & 0xFF == ord('q'):
#             break

#     # Save the recorded video locally
#     local_video_path = 'output_video.avi'
#     out = cv2.VideoWriter(local_video_path, cv2.VideoWriter_fourcc(*'XVID'), 20.0, (640, 480))
#     out.write(frame)
#     out.release()

#     # Upload the recorded video to Vimeo
#     vimeo = authenticate_vimeo()
#     upload_to_vimeo(vimeo, local_video_path, "My Recorded Video", "Video description goes here")

#     cap.release()
#     cv2.destroyAllWindows()

# if __name__ == "__main__":
#     main()


TypeError: __init__() got an unexpected keyword argument 'client_id'

In [1]:
import cv2
import numpy as np
import tensorflow as tf

# Load the YOLOv4 model
# NOTE(review): load_model() expects a Keras model at this path; confirm that
# the directory really holds one, not only the Darknet .weights/.cfg files
# the first cell loads with cv2.dnn.
model = tf.keras.models.load_model(r'C:\Users\raide\Downloads\PCL\PCL\Object Detection\dnn_model')

# Load class names from a file
# Raw string: in a normal literal "\U" starts a unicode escape and a trailing
# "\'" escapes the closing quote, which made this cell a SyntaxError.
# NOTE(review): the file name is assumed to be classes.txt, as in the first
# cell of this notebook -- confirm.
with open(r'C:\Users\raide\Downloads\PCL\PCL\Object Detection\dnn_model\classes.txt', 'r') as file:
    class_names = [line.strip() for line in file]

# Load an image for object detection
image_path = r'C:\Users\raide\Downloads\PCL\PCL\Object Detection\image1.jpg'
image_bgr = cv2.imread(image_path)
# cv2.imread signals failure by returning None instead of raising.
if image_bgr is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")
image_bgr = cv2.resize(image_bgr, (416, 416))  # YOLOv4 input size
# RGB copy for the model; the BGR copy is kept for display with cv2.imshow.
image = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)

# Preprocess the image: scale to [0, 1] as float32 and add a batch axis
input_image = image.astype(np.float32) / 255.0
input_image = np.expand_dims(input_image, axis=0)

# Perform object detection
predictions = model.predict(input_image)

# Interpret the predictions and draw bounding boxes
# (Implementation depends on the output format of your model)

# Display the result (cv2.imshow interprets the channels as BGR)
cv2.imshow('Object Detection', image_bgr)
cv2.waitKey(0)
cv2.destroyAllWindows()
