In [49]:
# Import necessary libraries and modules
import os  # Operating system-related functionalities
import re  # Regular expression operations
import Jetson.GPIO as GPIO  # GPIO control library for Jetson devices
import time  # Time-related operations
import cv2  # OpenCV library for computer vision tasks
from PIL import Image  # Image processing library
import pytesseract  # Python wrapper for Google's Tesseract-OCR engine
import requests  # HTTP library for making web requests
import json  # JSON encoding and decoding
from google.cloud import texttospeech  # Google Cloud Text-to-Speech API for converting text to speech
import numpy as np  # Numerical computing library
from IPython.display import Audio, display  # Displaying audio in Jupyter notebooks
import pyttsx3  # Text-to-speech library for offline use
import base64  # Base64 encoding and decoding


In [None]:
# Import necessary libraries and modules
import torch  # PyTorch deep learning framework
import torchvision.transforms as T  # Transformation functions for image preprocessing
from torchvision.models.detection import ssdlite320_mobilenet_v3_large  # Pre-trained SSD model for object detection
from PIL import Image  # Image processing library

# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print('device', device)

# Build the pre-trained SSDlite detector, move it onto the chosen device,
# and switch it to evaluation (inference) mode
model = ssdlite320_mobilenet_v3_large(pretrained=True)
model = model.to(device)
model = model.eval()


In [1]:
# File locations shared by the pipeline steps below
image_path = 'page.jpg'
corrected_image_path = 'corrected_page.jpg'
summary_file_path = "summary.mp3"

# Credentials are taken from the environment when present so that real secrets
# never need to be saved inside the notebook; the literals are only fallbacks.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "YOUR GEMINI API KEY")
GOOLE_API_KEY = GOOGLE_API_KEY  # original (misspelled) name, kept so existing references keep working
service_account_file = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    'serious-mariner-418204-001f2046ddc3.json',
)

In [15]:
# Step 1: Motion Sensor Detection and Greeting
def motion_sensor_detection():
    """Block until the motion sensor fires, play the greeting, and return True.

    The sensor input is polled every 2 seconds. While it reads low,
    'No motion detected' is printed. As soon as it reads high, the greeting
    audio is started and the function waits 11 seconds before returning.
    GPIO state is cleaned up on every exit path.

    Returns:
        True once motion has been detected (the function never returns False).
    """
    pir_pin = 23  # Adjust pin according to your setup

    # Physical (board) pin numbering, sensor pin configured as an input
    GPIO.setmode(GPIO.BOARD)
    GPIO.setup(pir_pin, GPIO.IN)

    try:
        # Keep polling for as long as the sensor input stays low
        while not GPIO.input(pir_pin):
            print('No motion detected')
            time.sleep(2)  # Adjust the sleep time as needed

        # Motion detected: greet the user and pause before continuing
        play_audio('greetings.mp3')
        time.sleep(11)
        return True
    finally:
        # Always release the GPIO pins, even if polling was interrupted
        GPIO.cleanup()


In [16]:
# Step 2: Prompt for Book Detection
def prompt_for_book():
    """Loop until a book is seen by the camera, then capture the page image.

    Each iteration captures a probe frame to 'detect_object.jpg' and passes it
    to check_for_book_object(). When a book is reported, 'book.mp3' is played,
    the function waits 5 seconds, and a fresh capture is taken with
    capture_image()'s default path. Otherwise the user is prompted on stdout
    and the loop retries after 5 seconds.

    Returns:
        Whatever capture_image() returns for the final capture.
    """
    while True:
        # Probe frame used only for object detection
        probe_path = capture_image('detect_object.jpg')

        if check_for_book_object(probe_path):
            play_audio('book.mp3')
            time.sleep(5)
            return capture_image()

        # Nothing recognised yet: ask for a book and give the user time to place it
        print("Please place a book or paper in front of the camera.")
        time.sleep(5)  # Adjust the sleep time as necessary


In [3]:
# Step 3: Check for Book Object
def check_for_book_object(image_path, score_threshold=0.5, book_class_id=84):
    """Return True when the detector finds a confident 'book' in the image.

    Args:
        image_path: Path of the image file to inspect.
        score_threshold: Minimum detection score a 'book' box must reach to
            count. Without a threshold, any of the detector's low-confidence
            boxes labelled as a book would trigger a positive result.
            Tune this for your camera and lighting.
        book_class_id: Label id treated as 'book' (84 in the COCO category
            list used by torchvision's pre-trained detection models).

    Returns:
        bool: True if at least one detection has label ``book_class_id`` and
        a score of at least ``score_threshold``.
    """
    # Close the file handle promptly; convert() returns an independent image.
    with Image.open(image_path) as source:
        rgb_image = source.convert("RGB")

    # torchvision detection models resize and normalise their inputs
    # internally and expect tensors in the 0-1 range, so only ToTensor is
    # applied here (a manual Normalize would normalise the image twice).
    image_tensor = T.ToTensor()(rgb_image).to(device)

    # Detect objects in the image (the model takes a list of CHW tensors)
    with torch.no_grad():
        detections = model([image_tensor])[0]

    labels = detections['labels'].tolist()
    scores = detections['scores'].tolist()
    return any(
        label == book_class_id and score >= score_threshold
        for label, score in zip(labels, scores)
    )

In [4]:
# Step 4: Capture Image
def capture_image(image_path='page.jpg'):
    """Grab a frame from the first camera and save it to ``image_path``.

    Args:
        image_path: Destination file for the captured frame.

    Returns:
        The ``image_path`` that was written.

    Raises:
        RuntimeError: If the camera cannot be opened, no frame can be read,
            or the frame cannot be written. Previously these failures were
            silent and the path of a missing or stale file was returned.
    """
    # Initialize the camera (assumes the first camera is the one to use)
    cap = cv2.VideoCapture(0)
    try:
        if not cap.isOpened():
            raise RuntimeError("Could not open camera 0")

        # Set the resolution (adjust to a resolution your camera supports)
        cap.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)
        cap.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)

        # Warm-up time for the camera
        time.sleep(2)

        # Read several frames so the camera can auto-adjust; keep the last one
        ret, frame = False, None
        for _ in range(5):
            ret, frame = cap.read()

        if not ret or frame is None:
            raise RuntimeError("Camera did not return a frame")

        if not cv2.imwrite(image_path, frame):
            raise RuntimeError(f"Could not write image to {image_path}")
    finally:
        # Release the camera on every exit path
        cap.release()

    return image_path


In [5]:
# Step 5: Enhance Image for OCR
def enhance_for_ocr(image_path, brightness_value=20, output_path=None):
    """Rotate, grayscale and brighten an image, save it, and return the array.

    Args:
        image_path: Path of the image to prepare for OCR.
        brightness_value: Amount added to every grayscale pixel (the add
            saturates at the uint8 maximum).
        output_path: Where to save the processed image. Defaults to the
            notebook-level ``corrected_image_path`` so the file written here
            is the same file later steps read. The previous hard-coded
            'corrected_image.jpg' did not match that configured path.

    Returns:
        The processed single-channel image as a numpy array.

    Raises:
        FileNotFoundError: If ``image_path`` cannot be read as an image.
    """
    if output_path is None:
        output_path = corrected_image_path

    # cv2.imread signals failure by returning None rather than raising
    image = cv2.imread(image_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Rotate the image
    rotated = cv2.rotate(image, cv2.ROTATE_90_CLOCKWISE)

    # Convert to grayscale
    gray_image = cv2.cvtColor(rotated, cv2.COLOR_BGR2GRAY)

    # Increase brightness with a saturating add of a constant-valued array
    brightness_array = np.full(gray_image.shape, brightness_value, dtype=np.uint8)
    brightened_image = cv2.add(gray_image, brightness_array)

    # Save the image ready for OCR (the write is synchronous, no delay needed)
    cv2.imwrite(output_path, brightened_image)

    return brightened_image



In [6]:
# Step 6: Convert Image to Text using OCR
def image_to_text(image_array):
    """Run Tesseract OCR on an image array and return ASCII-only text.

    Args:
        image_array: Image data accepted by ``PIL.Image.fromarray``.

    Returns:
        The recognised text with every non-ASCII character removed.
    """
    # Wrap the array in a PIL image, which is what the OCR call is given
    pil_page = Image.fromarray(image_array)

    # Perform OCR on the image
    recognized = pytesseract.image_to_string(pil_page)

    # Round-trip through UTF-8 bytes and decode as ASCII while ignoring
    # undecodable bytes, which strips all non-ASCII characters
    utf8_bytes = recognized.encode('utf-8')
    return utf8_bytes.decode('ascii', 'ignore')


In [7]:
# Step 7: Convert Image to Base64 Encoding
def image_to_base64(image_path):
    """Return the contents of a file as a Base64-encoded string.

    Args:
        image_path: Path of the file to encode.

    Returns:
        str: Base64 text representing the file's raw bytes.
    """
    # Read the whole file as raw bytes
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    # Base64-encode the bytes and turn the result into text
    return str(base64.b64encode(raw_bytes), "utf-8")


In [8]:
# Step 8: Summarize Text in Image
def summarize_text_image(base64_string, api_key=None, prompt="what is in this image?", timeout=30):
    """Ask the Gemini vision endpoint to describe a Base64-encoded JPEG.

    Args:
        base64_string: Base64 text of the JPEG image (see image_to_base64).
        api_key: API key to use. When omitted, the ``GOOGLE_API_KEY``
            environment variable is tried first, then the notebook-level
            ``GOOLE_API_KEY`` setting, so the key is configured in one place
            instead of being hard-coded inside this function.
        prompt: Text instruction sent alongside the image.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        str: The text of the first candidate on success. On any failure
        (network error, non-200 status, or a response without the expected
        fields) a string starting with "Error:" is returned, matching the
        original error-reporting style.
    """
    key = api_key or os.environ.get("GOOGLE_API_KEY") or GOOLE_API_KEY
    url = 'https://generativelanguage.googleapis.com/v1beta/models/gemini-pro-vision:generateContent'

    # Prepare the JSON payload
    payload = {
      "contents":[
        {
          "parts":[
            {"text": prompt},
            {
              "inline_data": {
                "mime_type":"image/jpeg",
                "data": base64_string  # Use the Base64 string obtained from Step 7
              }
            }
          ]
        }
      ]
    }

    # Send the key in a header rather than the query string so it does not
    # end up in URLs that get logged or echoed in error messages.
    headers = {'Content-Type': 'application/json', 'x-goog-api-key': key}

    # Make the POST request; a timeout prevents the pipeline hanging forever
    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        return f"Error: request failed, {exc}"

    if response.status_code != 200:
        return f"Error: {response.status_code}, {response.text}"

    # A 200 response can still lack candidates/content/parts, so guard the lookup
    try:
        return response.json()['candidates'][0]['content']['parts'][0]['text']
    except (KeyError, IndexError, TypeError, ValueError):
        return f"Error: unexpected response, {response.text}"


In [10]:
# Step 9: Convert Text to Speech
def text_to_speech(text, summary):
    """Synthesize the page text plus its summary to an MP3 file.

    The spoken script is the page text followed by the summary. The audio is
    written to the notebook-level ``summary_file_path`` using a client built
    from the notebook-level ``service_account_file``.

    Args:
        text: Page text to read aloud.
        summary: Summary appended after the page text.
    """
    # Script to be spoken: the page text followed by its summary
    spoken_script = f"{text} Summary for the page is {summary}"

    # Client authenticated with the service-account JSON file
    tts_client = texttospeech.TextToSpeechClient.from_service_account_json(service_account_file)

    # MP3 output, US-English voice with neutral gender
    mp3_config = texttospeech.AudioConfig(audio_encoding=texttospeech.AudioEncoding.MP3)
    voice_params = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
    )
    request_input = texttospeech.SynthesisInput(text=spoken_script)

    # Synthesize speech
    synthesis = tts_client.synthesize_speech(
        input=request_input,
        voice=voice_params,
        audio_config=mp3_config,
    )

    # Persist the returned audio bytes
    with open(summary_file_path, "wb") as audio_file:
        audio_file.write(synthesis.audio_content)

    # Print confirmation message
    print(f"Audio content written to file {summary_file_path}")


In [9]:
# Step 10: Play Audio
def play_audio(file_path):
    """Show an autoplaying audio widget for ``file_path`` in the notebook.

    Args:
        file_path: Audio file to play.

    Returns:
        The result of ``IPython.display.display`` for the audio widget.
    """
    # Build the player first, then hand it to the notebook for display
    player = Audio(file_path, autoplay=True)
    return display(player)


In [11]:
# Step 11: Offline Text-to-Speech
def offline_text_speak(text, rate=100, voice_index=1):
    """Speak ``text`` aloud with the offline pyttsx3 engine.

    Args:
        text: Text to speak.
        rate: Speech rate passed to the engine's 'rate' property.
        voice_index: Index into the engine's installed voice list. Which
            voice an index maps to depends on the platform's speech driver.
            If the index is out of range, the engine's default voice is kept
            instead of raising IndexError.
    """
    # Initialize text-to-speech engine
    engine = pyttsx3.init()

    # Set speech rate
    engine.setProperty('rate', rate)

    # Select the requested voice only when it actually exists
    voices = engine.getProperty('voices') or []
    if 0 <= voice_index < len(voices):
        engine.setProperty('voice', voices[voice_index].id)

    # Queue the text, then run and wait for speech to finish
    engine.say(text)
    engine.runAndWait()

    # Stop the engine
    engine.stop()


In [12]:
# Step 12: Check Internet Connection
def check_internet_connection(url='http://www.google.com', timeout=1):
    """Report whether an HTTP GET to ``url`` succeeds with status 200.

    Args:
        url: Address to probe.
        timeout: Seconds to wait before treating the probe as failed.

    Returns:
        bool: True if the response status code is 200; False for any other
        status or if the request fails for any reason.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Covers connection errors and timeouts (a read timeout is not a
        # ConnectionError, so catching only that would let it propagate)
        return False

    return response.status_code == 200


In [63]:
# Final Step

if motion_sensor_detection():
    # Capture the page, prepare it for OCR, and extract its text
    image_array = enhance_for_ocr(prompt_for_book())
    play_audio('processing.mp3')
    text = image_to_text(image_array)

    if not check_internet_connection():
        # Offline: read the OCR text aloud with the local speech engine
        print("Device does not have internet connection.")
        offline_text_speak(text)
    else:
        # Online: summarise the corrected image, then speak text plus summary
        print("Device has internet connection.")

        summary = summarize_text_image(image_to_base64(corrected_image_path))
        text_to_speech(text, summary)  # Writes page text + summary as audio
        play_audio(summary_file_path)

No motion detected
Helloo, I'm Sparky. Your AI assistant. I'm here to assist you with your books, papers, research papers, images, graphs, pictures.
Device has internet connection.
{'candidates': [{'content': {'parts': [{'text': ' The image is a black and white photo of Steve Jobs and Bill Gates sitting on a couch and talking.'}], 'role': 'model'}, 'finishReason': 'STOP', 'index': 0, 'safetyRatings': [{'category': 'HARM_CATEGORY_SEXUALLY_EXPLICIT', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_HATE_SPEECH', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_HARASSMENT', 'probability': 'NEGLIGIBLE'}, {'category': 'HARM_CATEGORY_DANGEROUS_CONTENT', 'probability': 'NEGLIGIBLE'}]}]}
Audio content written to file summary.mp3
