In [17]:
from google.cloud import vision
import PyPDF2
from pdf2image import convert_from_path
import io
import os
import re
import json
import logging
import pandas as pd
from pathlib import Path

In [18]:
class AnswerExtractor:
    """Accumulates extracted answers in a nested dict and exports them.

    Layout of ``self.answers``:

    * no option:   ``answers[section][question][sub_question] = answer``
    * with option: ``answers[section][option][question][sub_question] = answer``
    """

    def __init__(self):
        self.answers = {}

    def add_answer(self, section, question, sub_question, answer, option=None):
        """Store one answer at its place in the hierarchy.

        Args:
            section: Section key, e.g. ``'Section_A'``.
            question: Question key, e.g. ``'Q1'``.
            sub_question: Sub-question key, e.g. ``'a'``.
            answer: Answer text; must be a non-blank string.
            option: Optional key inserted between section and question.

        Raises:
            ValueError: If ``answer`` is ``None`` or contains only whitespace.
            Exception: Any other failure is logged and re-raised unchanged.
        """
        try:
            if answer is None or not answer.strip():
                raise ValueError("Empty answer detected")

            # Walk down the hierarchy, creating missing levels on the way.
            bucket = self.answers.setdefault(section, {})
            if option:
                bucket = bucket.setdefault(option, {})
            bucket.setdefault(question, {})[sub_question] = answer

        except Exception as e:
            logging.error(f"Error adding answer: {str(e)}")
            raise

    def export_answers(self, format='json'):
        """Export the collected answers.

        Args:
            format: ``'json'`` for an indented JSON string, ``'csv'`` for a
                flat ``pandas.DataFrame`` with the columns ``Section``,
                ``Option``, ``Question``, ``Sub-Question`` and ``Answer``
                (``Option`` is ``None`` for answers stored without one).

        Raises:
            ValueError: If ``format`` is neither ``'json'`` nor ``'csv'``.
        """
        if format == 'json':
            return json.dumps(self.answers, indent=4)

        if format == 'csv':
            columns = ['Section', 'Option', 'Question', 'Sub-Question', 'Answer']
            rows = []
            for section, section_data in self.answers.items():
                for key, value in section_data.items():
                    if not isinstance(value, dict):
                        continue
                    for inner_key, inner_value in value.items():
                        if isinstance(inner_value, dict):
                            # One level deeper than usual: ``key`` is an option
                            # and ``inner_key`` is the question beneath it.
                            for sub_q, answer in inner_value.items():
                                rows.append({
                                    'Section': section,
                                    'Option': key,
                                    'Question': inner_key,
                                    'Sub-Question': sub_q,
                                    'Answer': answer
                                })
                        else:
                            rows.append({
                                'Section': section,
                                'Option': None,
                                'Question': key,
                                'Sub-Question': inner_key,
                                'Answer': inner_value
                            })
            return pd.DataFrame(rows, columns=columns)

        raise ValueError(f"Unsupported export format: {format!r}")

In [19]:
def extract_text_from_pdf_using_vision(pdf_path):
    """
    OCR every page of a PDF with the Google Cloud Vision API.

    The PDF is converted to page images, each image is sent as PNG bytes to
    ``document_text_detection``, and the per-page texts are concatenated with
    a newline after each page.

    Any exception (including an error reported in a Vision response) is
    logged and re-raised.
    """
    try:
        client = vision.ImageAnnotatorClient()
        pages = convert_from_path(pdf_path)
        page_total = len(pages)

        page_texts = []
        for page_no, page in enumerate(pages, start=1):
            print(f"Processing page {page_no}/{page_total}")

            # Serialise the PIL page image to in-memory PNG bytes.
            png_buffer = io.BytesIO()
            page.save(png_buffer, format='PNG')

            response = client.document_text_detection(
                image=vision.Image(content=png_buffer.getvalue())
            )

            # A populated error message means this page's request failed.
            if response.error.message:
                raise Exception(
                    f'{response.error.message}\nFor more info on error messages, check: '
                    'https://cloud.google.com/apis/design/errors')

            page_texts.append(response.full_text_annotation.text + "\n")

        return "".join(page_texts)

    except Exception as e:
        logging.error(f"Error in Vision API text extraction: {str(e)}")
        raise

In [25]:
def process_text(text, extractor):
    """
    Process extracted text and organize into answer hierarchy.

    The text is scanned line by line. A line matching the section or option
    pattern updates the current section/option and is otherwise skipped. A
    line matching the question pattern starts a new answer; every following
    non-marker line is appended to that answer until the next question line.

    Each answer is filed under the section and option that were active when
    its question line was seen, so a section/option marker appearing between
    two questions does not re-home the earlier answer.

    Args:
        text: Raw text, one logical line per ``\\n``.
        extractor: Object exposing
            ``add_answer(section, question, sub_question, answer, option)``.

    Returns:
        The same ``extractor`` object, after all answers have been added.
    """
    # Modify these patterns to match your actual document format
    question_pattern = r'(?:Question|Q)\.?\s*(\d+)\s*[\.(]([a-z])[)\.]'  # More flexible pattern
    section_pattern = r'(?:Section|SECTION)[_\s-]*([A-Z])'
    option_pattern = r'(?:Option|OPTION)[_\s-]*(\d+)'

    # Print matches found for debugging
    print("Found questions:", re.findall(question_pattern, text))
    print("Found sections:", re.findall(section_pattern, text))
    print("Found options:", re.findall(option_pattern, text))

    current_section = 'Section_A'  # Default section
    current_option = None

    # State of the answer currently being accumulated.
    current_question = None
    current_sub_q = None
    current_answer = ""
    answer_section = current_section  # section active when the question began
    answer_option = current_option    # option active when the question began

    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue

        print(f"Processing line: {line}")  # Debug print

        # Check for section markers
        section_match = re.search(section_pattern, line)
        if section_match:
            current_section = f"Section_{section_match.group(1)}"
            print(f"Found section: {current_section}")  # Debug print
            continue

        # Check for option markers
        option_match = re.search(option_pattern, line)
        if option_match:
            current_option = f"Option_{option_match.group(1)}"
            print(f"Found option: {current_option}")  # Debug print
            continue

        # Check for questions
        question_match = re.search(question_pattern, line)
        if question_match:
            # If we have a previous question and answer, add it under the
            # section/option that applied when that question started.
            if current_question and current_answer:
                print(f"Adding answer: {answer_section} - {current_question}({current_sub_q}): {current_answer}")  # Debug print
                extractor.add_answer(answer_section, current_question,
                                     current_sub_q, current_answer.strip(),
                                     answer_option)

            # Set up new question
            current_question = f'Q{question_match.group(1)}'
            current_sub_q = question_match.group(2)
            current_answer = line[question_match.end():].strip()
            answer_section = current_section
            answer_option = current_option
            print(f"Found question: {current_question}({current_sub_q})")  # Debug print
        elif current_question:
            # If no question match, append to current answer
            current_answer += " " + line

    # Add the last answer if exists
    if current_question and current_answer:
        print(f"Adding final answer: {answer_section} - {current_question}({current_sub_q}): {current_answer}")  # Debug print
        extractor.add_answer(answer_section, current_question,
                             current_sub_q, current_answer.strip(),
                             answer_option)

    return extractor

In [26]:
def process_answer_sheet(pdf_path):
    """
    Run the full pipeline for one answer-sheet PDF.

    OCRs the PDF via ``extract_text_from_pdf_using_vision``, prints the raw
    text for debugging, then parses it into a fresh ``AnswerExtractor`` with
    ``process_text``. Returns that extractor. Any exception is logged and
    re-raised.
    """
    try:
        ocr_text = extract_text_from_pdf_using_vision(pdf_path)

        # Dump the raw OCR result so parsing problems can be diagnosed.
        print("Raw extracted text:")
        print(ocr_text)
        print("-" * 50)

        answer_store = AnswerExtractor()
        process_text(ocr_text, answer_store)
    except Exception as e:
        logging.error(f"Error processing answer sheet: {str(e)}")
        raise
    else:
        return answer_store

In [27]:
# Google Cloud credentials: nothing is configured in this cell. Presumably the
# Vision client picks them up from the environment (e.g. the
# GOOGLE_APPLICATION_CREDENTIALS variable) -- TODO confirm.

# Example usage: point this at the answer-sheet PDF to process.
pdf_path = "ProperTesting/Joel-D041.pdf"

try:
    # OCR + parse the sheet.
    extractor = process_answer_sheet(pdf_path)

    # Show the JSON rendering.
    json_output = extractor.export_answers(format='json')
    print("JSON Output:", json_output, sep="\n")

    # Show the tabular (DataFrame) rendering.
    csv_output = extractor.export_answers(format='csv')
    print("\nCSV Output:", csv_output, sep="\n")

    # Persist both renderings next to the notebook.
    with open('answers.json', 'w') as f:
        json.dump(extractor.answers, f, indent=4)
    csv_output.to_csv('answers.csv', index=False)

except Exception as e:
    # Errors are reported but not re-raised, so the cell itself never fails.
    print(f"Error processing answer sheet: {str(e)}")

Processing page 1/5
Processing page 2/5
Processing page 3/5
Processing page 4/5
Processing page 5/5
Raw extracted text:
Question
Nos.
Marks
Awarded
91. Big O
Big theta
Big 2
Big O gives us the upper limit of
the case the upper limit (worst case)
Big theta gives wis the lower bound
of the case ( Best Case)
Ω
Big - gives us the combination
both ·Bigo and Big 2
of
Big (0) notations helps was to understand
the worst case of a given algorithm
it is generally seepresented as
O(1) - O(logn) - O(n)
→ O(n³)
O(nlogn)
where OC) is beat.
O(nlogn).
Question
Nos.
3
Marks
Awarded
92
1. Insertion at the start
struct node § 1
int vali
Struts Godbe preser
struct node * next;
3
function Insertaṭstart ( strict node ** head, data
{
struct node * newnode;
newnode val = data;
→val
new node → next = head;
while C
struct node temp = head
→
• while 6 temp > next != head).
3.
temp
ད་
temp next.
temp→ next = newnode;
Jusaction Deletion Of Lantnode (struct node ** head, d
Of
{ struct node Premp
struct node.newerad

# Take 2

In [28]:
def extract_text_from_pdf_using_vision(pdf_path):
    """
    Convert PDF to images and extract text using Google Cloud Vision API.

    NOTE: this redefines (and therefore shadows) the function of the same
    name defined earlier in this notebook.

    Args:
        pdf_path: Path of the PDF to OCR.

    Returns:
        The text of all pages, with a newline appended after each page.

    Raises:
        Exception: If a Vision response carries an error message, or if any
            step fails; the error is printed and re-raised.
    """
    try:
        # Initialize Vision client
        client = vision.ImageAnnotatorClient()

        # Convert PDF to images
        images = convert_from_path(pdf_path)

        page_texts = []

        # Process each page
        for i, image in enumerate(images):
            print(f"Processing page {i+1}/{len(images)}")

            # Convert PIL Image to bytes
            img_byte_arr = io.BytesIO()
            image.save(img_byte_arr, format='PNG')
            content = img_byte_arr.getvalue()

            # Create vision image object
            vision_image = vision.Image(content=content)

            # Perform OCR
            response = client.document_text_detection(image=vision_image)

            # A failed request is reported inside the response rather than
            # raised; without this check a failed page would silently
            # contribute empty text.
            if response.error.message:
                raise Exception(
                    f'{response.error.message}\nFor more info on error messages, check: '
                    'https://cloud.google.com/apis/design/errors')

            # Extract text
            page_texts.append(response.full_text_annotation.text + "\n")

        return "".join(page_texts)

    except Exception as e:
        print(f"Error in Vision API text extraction: {str(e)}")
        raise

In [29]:
def process_text(text):
    """
    Process extracted text and return answers dictionary.

    NOTE: this redefines (and therefore shadows) the earlier two-argument
    ``process_text(text, extractor)`` defined in this notebook.

    The text is scanned line by line. ``Section X`` and ``Option N`` marker
    lines update the current section/option (creating their dictionary
    entries) and are otherwise skipped. A line containing ``Q<n>(<letter>)``
    starts a new answer; following non-marker lines are appended to it. Each
    answer is stored under the section and option that were active when its
    question line was seen.

    Args:
        text: Raw text, one logical line per ``\\n``.

    Returns:
        dict shaped ``{section: {question: {sub_q: answer}}}`` or, when an
        option is active, ``{section: {option: {question: {sub_q: answer}}}}``.
    """
    # Initialize answers dictionary
    answers_dict = {}

    # Define patterns - modify these according to your PDF format.
    # The parentheses around the sub-question letter are literal, e.g. "Q1(a)".
    question_pattern = r'Q(\d+)\s*\(([a-z])\)'
    section_pattern = r'Section[_\s]*([A-Z])'
    option_pattern = r'Option[_\s]*(\d+)'

    current_section = 'Section_A'  # Default section
    current_option = None

    # State of the answer currently being accumulated.
    current_question = None
    current_sub_q = None
    current_answer = ""
    answer_section = current_section  # section active when the question began
    answer_option = current_option    # option active when the question began

    for line in text.split('\n'):
        line = line.strip()
        if not line:
            continue

        # Check for section markers
        section_match = re.search(section_pattern, line)
        if section_match:
            current_section = f"Section_{section_match.group(1)}"
            answers_dict.setdefault(current_section, {})
            continue

        # Check for option markers
        option_match = re.search(option_pattern, line)
        if option_match:
            current_option = f"Option_{option_match.group(1)}"
            # The default section may not have been created yet if no
            # section marker has been seen, so create it on demand.
            answers_dict.setdefault(current_section, {}).setdefault(current_option, {})
            continue

        # Check for questions
        question_match = re.search(question_pattern, line)
        if question_match:
            # If we have a previous question and answer, store it
            if current_question and current_answer:
                store_answer(answers_dict, answer_section, current_question,
                             current_sub_q, current_answer.strip(), answer_option)

            # Set up new question
            current_question = f'Q{question_match.group(1)}'
            current_sub_q = question_match.group(2)
            current_answer = line[question_match.end():].strip()
            answer_section = current_section
            answer_option = current_option
        elif current_question:
            # If no question match, append to current answer
            current_answer += " " + line

    # Store the last answer if exists
    if current_question and current_answer:
        store_answer(answers_dict, answer_section, current_question,
                     current_sub_q, current_answer.strip(), answer_option)

    return answers_dict

def store_answer(answers_dict, section, question, sub_question, answer, option=None):
    """
    Helper function to store answer in the dictionary with proper hierarchy.

    Writes ``answer`` to ``answers_dict[section][question][sub_question]``,
    or to ``answers_dict[section][option][question][sub_question]`` when
    ``option`` is truthy. Missing intermediate dictionaries are created.
    ``answers_dict`` is modified in place; nothing is returned.
    """
    bucket = answers_dict.setdefault(section, {})
    if option:
        bucket = bucket.setdefault(option, {})
    bucket.setdefault(question, {})[sub_question] = answer

In [30]:
def process_answer_sheet(pdf_path):
    """
    OCR one answer-sheet PDF and parse it into an answers dictionary.

    Calls ``extract_text_from_pdf_using_vision`` on ``pdf_path``, prints the
    extracted text for debugging, and returns the result of ``process_text``
    on that text. Any exception is printed and re-raised.
    """
    try:
        ocr_text = extract_text_from_pdf_using_vision(pdf_path)

        # Dump the OCR result so parsing problems can be diagnosed.
        print("\nExtracted text:")
        print(ocr_text)
        print("-" * 50)

        return process_text(ocr_text)

    except Exception as e:
        print(f"Error processing answer sheet: {str(e)}")
        raise

In [31]:
# Google Cloud credentials: nothing is configured in this cell. Presumably the
# Vision client picks them up from the environment (e.g. the
# GOOGLE_APPLICATION_CREDENTIALS variable) -- TODO confirm.

# OCR and parse a single answer sheet.
pdf_path = "ProperTesting/Krisha-D053.pdf"
answers = process_answer_sheet(pdf_path)

# Show the whole answers dictionary.
print("\nExtracted Answers:", answers, sep="\n")

# Spot-check one entry: sub-question (a) of Q1 in Section A, if it was found.
if 'Q1' in answers.get('Section_A', {}):
    print("\nAnswer to Q1(a):", answers['Section_A']['Q1'].get('a'))

Processing page 1/7
Processing page 2/7
Processing page 3/7
Processing page 4/7
Processing page 5/7
Processing page 6/7
Processing page 7/7

Extracted text:
TIC
msi
are
ate
Our
ate,
ate:
tes
tion
di
esl
r ar
ans
to
wer
Question
Nos.
Asymptotic notions are used to
check the time st& complexity of
algorithms and to find out which
algorithm is the most time efficient.
when input size is large enough.
Big O → This is used to
2) Big
0
represent the worst case of the time
complexity. This suggests the max time that an algorithm.
can take for
completion
e an
th s
oug
sho
ot b
are f
e in.
nk, v
on w
writi
sho
По
f(n)
In the above graph (g(n) and fen)
we
can say that
are functions and
cgcn) = 0 (f(n))
cg (n) = f(n) and n>no.
if and only if
Question
Nos.
3
the
2) Big. W + This is used to represent
best of time complexity of an
algorithm. This suggests the minimum
time algorithm can can for take for its.
completion.
Marks
Awarded
f(n)
<(9cm)
(gem = cor from the
above graph we can say that f(n) w(