In [2]:
from google.cloud import vision
import PyPDF2
from pdf2image import convert_from_path
import io
import os
import re
import json
import logging
import pandas as pd
from pathlib import Path

In [3]:
class AnswerExtractor:
    def __init__(self):
        self.answers = {}
        
    def add_answer(self, section, question, sub_question, answer, option=None):
        try:
            if not answer.strip():
                raise ValueError("Empty answer detected")
                
            if section not in self.answers:
                self.answers[section] = {}
                
            if option:
                if option not in self.answers[section]:
                    self.answers[section][option] = {}
                if question not in self.answers[section][option]:
                    self.answers[section][option][question] = {}
                self.answers[section][option][question][sub_question] = answer
            else:
                if question not in self.answers[section]:
                    self.answers[section][question] = {}
                self.answers[section][question][sub_question] = answer
                
        except Exception as e:
            logging.error(f"Error adding answer: {str(e)}")
            raise
            
    def export_answers(self, format='json'):
        if format == 'json':
            return json.dumps(self.answers, indent=4)
        elif format == 'csv':
            flat_data = []
            for section, section_data in self.answers.items():
                for question, question_data in section_data.items():
                    if isinstance(question_data, dict):
                        for sub_q, answer in question_data.items():
                            flat_data.append({
                                'Section': section,
                                'Question': question,
                                'Sub-Question': sub_q,
                                'Answer': answer
                            })
            return pd.DataFrame(flat_data)

In [4]:
def extract_text_from_pdf_using_vision(pdf_path):
    """
    Convert PDF to images and extract text using Google Cloud Vision API
    """
    try:
        # Initialize Vision client
        client = vision.ImageAnnotatorClient()
        
        # Convert PDF to images
        images = convert_from_path(pdf_path)
        
        extracted_text = ""
        
        # Process each page
        for i, image in enumerate(images):
            print(f"Processing page {i+1}/{len(images)}")
            
            # Convert PIL Image to bytes
            img_byte_arr = io.BytesIO()
            image.save(img_byte_arr, format='PNG')
            content = img_byte_arr.getvalue()
            
            # Create vision image object
            vision_image = vision.Image(content=content)
            
            # Perform OCR
            response = client.document_text_detection(image=vision_image)
            
            if response.error.message:
                raise Exception(
                    f'{response.error.message}\nFor more info on error messages, check: '
                    'https://cloud.google.com/apis/design/errors')
                
            # Extract text
            text = response.full_text_annotation.text
            extracted_text += text + "\n"
            
        return extracted_text
    
    except Exception as e:
        logging.error(f"Error in Vision API text extraction: {str(e)}")
        raise

In [5]:
def process_text(text, extractor):
    """
    Process extracted text and organize into answer hierarchy
    """
    # Modify these patterns to match your actual document format
    question_pattern = r'(?:Question|Q)\.?\s*(\d+)\s*[\.(]([a-z])[)\.]'  # More flexible pattern
    section_pattern = r'(?:Section|SECTION)[_\s-]*([A-Z])'
    option_pattern = r'(?:Option|OPTION)[_\s-]*(\d+)'
    
    # Print matches found for debugging
    print("Found questions:", re.findall(question_pattern, text))
    print("Found sections:", re.findall(section_pattern, text))
    print("Found options:", re.findall(option_pattern, text))
    
    # Rest of the processing logic...
    lines = text.split('\n')
    
    current_section = 'Section_A'  # Default section
    current_question = None
    current_option = None
    current_answer = ""
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        print(f"Processing line: {line}")  # Debug print
        
        # Check for section markers
        section_match = re.search(section_pattern, line)
        if section_match:
            current_section = f"Section_{section_match.group(1)}"
            print(f"Found section: {current_section}")  # Debug print
            continue
            
        # Check for option markers
        option_match = re.search(option_pattern, line)
        if option_match:
            current_option = f"Option_{option_match.group(1)}"
            print(f"Found option: {current_option}")  # Debug print
            continue
            
        # Check for questions
        question_match = re.search(question_pattern, line)
        if question_match:
            # If we have a previous question and answer, add it
            if current_question and current_answer:
                print(f"Adding answer: {current_section} - {current_question}({current_sub_q}): {current_answer}")  # Debug print
                extractor.add_answer(current_section, current_question, 
                                   current_sub_q, current_answer.strip(), 
                                   current_option)
            
            # Set up new question
            q_num = question_match.group(1)
            current_sub_q = question_match.group(2)
            current_question = f'Q{q_num}'
            current_answer = line[question_match.end():].strip()
            print(f"Found question: {current_question}({current_sub_q})")  # Debug print
        else:
            # If no question match, append to current answer
            if current_question:
                current_answer += " " + line
    
    # Add the last answer if exists
    if current_question and current_answer:
        print(f"Adding final answer: {current_section} - {current_question}({current_sub_q}): {current_answer}")  # Debug print
        extractor.add_answer(current_section, current_question, 
                           current_sub_q, current_answer.strip(), 
                           current_option)
    
    return extractor

In [6]:
def process_answer_sheet(pdf_path):
    """
    Main function to process PDF answer sheet using Vision API
    """
    try:
        # Extract text from PDF using Vision API
        text = extract_text_from_pdf_using_vision(pdf_path)
        
        # Print raw extracted text for debugging
        print("Raw extracted text:")
        print(text)
        print("-" * 50)
        
        # Create answer extractor
        extractor = AnswerExtractor()
        
        # Process the extracted text
        process_text(text, extractor)
        
        return extractor
    
    except Exception as e:
        logging.error(f"Error processing answer sheet: {str(e)}")
        raise

In [7]:
# Set up Google Cloud credentials

# Example usage
# Replace with your PDF path
pdf_path = "ProperTesting/Jeet-D038.pdf"

try:
    # Process the answer sheet
    extractor = process_answer_sheet(pdf_path)
    
    # Export as JSON
    json_output = extractor.export_answers(format='json')
    print("JSON Output:")
    print(json_output)
    
    # Export as CSV
    csv_output = extractor.export_answers(format='csv')
    print("\nCSV Output:")
    print(csv_output)
    
    # Save to files
    with open('answers.json', 'w') as f:
        json.dump(extractor.answers, f, indent=4)
        
    csv_output.to_csv('answers.csv', index=False)
    
except Exception as e:
    print(f"Error processing answer sheet: {str(e)}")

Processing page 1/5
Processing page 2/5
Processing page 3/5
Processing page 4/5
Processing page 5/5
Raw extracted text:
Nos.
M
Awa
Q1
Ans
1) Asymptotic
function
Can
of
Definer
to
that
one
it is a
knding
Say like
Asymptotic Function
2) Asymptotic notation
you
Example
be
2 ૨
e
Structures
time
Algorithm
3) It tells
ove
Complexities
a
axis
limit
could
in
Data
to
Used
Represent
OR
your
about
the
Best
Case
worst,
time
i.e
time
tapen
to
am or
avgerage and
Complexity
execute
Code
4) The
and
a
particular algorithm
denotions
。 (Big verage
52 (Omega)
o ( the tu)
OK
Case
Woxst
Best
Best
Worst
ave
case
Case
Average Case
Question
Nos.
3
с
of
Marks
Awarded
5
Always
the
or
Much
But
In
data
Structure
We
Consider
that
What
is
Worst
Case
ок
An
Code
that
defina
Algorithm
that
how
Computational
power
Computer
Consumes
+2
Execute.
6) Ex :
int
Count =0
for (int i=0
ich
1++) {
Count ++;
In
this
Code
the
100p
has
time
the
Complexity
Count
of
o (n)
and
Variable
has
O(1)
Time Complexity
7)
0(1
ollogn
< 0(12) 50 

In [8]:
import re

# STEP 1: Paste your raw OCR text here
ocr_text = """
Nos.
M
Awa
Q1
Ans
1) Asymptotic
function
Can
of
Definer
to
that
one
it is a
knding
Say like
Asymptotic Function
2) Asymptotic notation
you
Example
be
2 ૨
e
Structures
time
Algorithm
3) It tells
ove
Complexities
a
axis
limit
could
in
Data
to
Used
Represent
OR
your
about
the
Best
Case
worst,
time
i.e
time
tapen
to
am or
avgerage and
Complexity
execute
Code
4) The
and
a
particular algorithm
denotions
。 (Big verage
52 (Omega)
o ( the tu)
OK
Case
Woxst
Best
Best
Worst
ave
case
Case
Average Case
Question
Nos.
3
с
of
Marks
Awarded
5
Always
the
or
Much
But
In
data
Structure
We
Consider
that
What
is
Worst
Case
ок
An
Code
that
defina
Algorithm
that
how
Computational
power
Computer
Consumes
+2
Execute.
6) Ex :
int
Count =0
for (int i=0
ich
1++) {
Count ++;
In
this
Code
the
100p
has
time
the
Complexity
Count
of
o (n)
and
Variable
has
O(1)
Time Complexity
7)
0(1
ollogn
< 0(12) 50 (n³)
=
• updation
o(n.)
<o (5n) << o(n
n
<OC2") <o (n!)
< O(nn)
This is
0
an
increasing
。(n)
order(growth
Func
O(A)
70(55)
>0 (logn)
>0(2)
Question
Nos.
5
2
Current linked
Tist
1-)
a
9/2
head
5)
Two pointer
Duplicatec
32-2-1 → NULL
Approach to
Remove
Remove Duplicate
List Node List Mode (struct node *
а
a) {
*
=
=
Struct Node. +1
Struct Node * +2
while (+11 = NULL
IF (& value of t₁
दु
= = valve of +2) 2
while (+2! = +1){
+42; +22 next; Free(+2);
3+2 = +2;
42 = +2 → next;
+1+=+1 next;
→
+2= +2 - next;
head
;
head 3
next;
48
+21 = NULL
= =
Marks
Awarded
}
logic :
Using
Two
Pointer
+1
and +2
Val
'
and
Checking
Same
and
ヒコ
Using
Free (+2)
if
their
valves
one
If
not
then
E move
+1
+2
to
next
node
If
==
±2. val
then
traverse
+2
While
loop
till
the
etal = +2
Wode
Evory
time
Question
Nos
6
Marks
Awarded
then
liking
the
to
+2
So
Remove
and
next
the
Current
that
the
again moving
statuting again
+2
Duplicats
t2 to
checking.
6
The
linked
list
would
be
(14
+12
→ NULL
two
max
Sorting this
Node
fre
Final list
using
by given
Value
mv
-
them
max value
#+1
13
4
(mv)
42
Checking
Compering
tist
the
Value
and
and
changing
The
NULL
Space
for Marks
Question
No.
Nume:
Jeet
"Don't
19/3
SAP:
60009230007
$ sub!
R011 NO:
DS
D038
Q.3
Ans
F
(A+B+CC-0)^E) /(F
стан
I)
starting
From
Left
Input
ор stack
Postfix
C
C
A
с
EA
+
( +
EA
B
(+
EAB
C
C+*
C + *
EAB
AB
с
C + *
AB C
ABC *+ C
D
2
A
E
7
A
-77)
ABC * + (D
ABC * + CD
+
A B C + + C D
ABC
A
+CDE
ABC
A
+CDE
XC
C + + C
"""

# STEP 2: Preprocessing (optional - smoothen common OCR issues)
# You can add more patterns to fix typical OCR typos here
ocr_text = ocr_text.replace("Ans", "\nAns")
ocr_text = ocr_text.replace("Question Nos", "\nQuestion Nos")

# STEP 3: Use regex to split based on "Q<number>" or "Question Nos. <number>"
# This pattern matches Q1, Q2 or "Question Nos <number>" with flexibility
question_splits = re.split(r'(?:Q(?:uestion)?\s*Nos?\.?\s*|Q)(\d+)', ocr_text)

# STEP 4: Build question-answer dictionary
qa_dict = {}
for i in range(1, len(question_splits), 2):
    q_num = f"Q{question_splits[i]}"
    answer_text = question_splits[i + 1].strip()

    # Optional cleanup: Remove trailing "Marks Awarded" or unrelated noise
    answer_text = re.split(r'Marks\s+Awarded|Space\s+for\s+Marks|Name:', answer_text, flags=re.IGNORECASE)[0].strip()

    # Clean up short lines and join them (optional: customize for your sheet)
    cleaned_answer = " ".join(line.strip() for line in answer_text.splitlines() if len(line.strip()) > 1)
    
    qa_dict[q_num] = cleaned_answer

# STEP 5: Print or use the dictionary
print("Extracted Question-Answer Dictionary:\n")
for q, ans in qa_dict.items():
    print(f"{q}:\n{ans}\n{'-'*60}")


Extracted Question-Answer Dictionary:

Q1:
Ans 1) Asymptotic function Can of Definer to that one it is a knding Say like Asymptotic Function 2) Asymptotic notation you Example be 2 ૨ Structures time Algorithm 3) It tells ove Complexities axis limit could in Data to Used Represent OR your about the Best Case worst, time i.e time tapen to am or avgerage and Complexity execute Code 4) The and particular algorithm denotions 。 (Big verage 52 (Omega) o ( the tu) OK Case Woxst Best Best Worst ave case Case Average Case
------------------------------------------------------------
Q3:
of
------------------------------------------------------------
Q5:
Current linked Tist 1-) 9/2 head 5) Two pointer Duplicatec 32-2-1 → NULL Approach to Remove Remove Duplicate List Node List Mode (struct node * a) { Struct Node. +1 Struct Node * +2 while (+11 = NULL IF (& value of t₁ दु = = valve of +2) 2 while (+2! = +1){ +42; +22 next; Free(+2); 3+2 = +2; 42 = +2 → next; +1+=+1 next; +2= +2 - next; head head 3 

# Take 2

In [28]:
def extract_text_from_pdf_using_vision(pdf_path):
    """
    Convert PDF to images and extract text using Google Cloud Vision API
    """
    try:
        # Initialize Vision client
        client = vision.ImageAnnotatorClient()
        
        # Convert PDF to images
        images = convert_from_path(pdf_path)
        
        extracted_text = ""
        
        # Process each page
        for i, image in enumerate(images):
            print(f"Processing page {i+1}/{len(images)}")
            
            # Convert PIL Image to bytes
            img_byte_arr = io.BytesIO()
            image.save(img_byte_arr, format='PNG')
            content = img_byte_arr.getvalue()
            
            # Create vision image object
            vision_image = vision.Image(content=content)
            
            # Perform OCR
            response = client.document_text_detection(image=vision_image)
            
            # Extract text
            text = response.full_text_annotation.text
            extracted_text += text + "\n"
            
        return extracted_text
    
    except Exception as e:
        print(f"Error in Vision API text extraction: {str(e)}")
        raise

In [32]:
def process_text(text):
    """
    Process extracted text and return answers dictionary
    """
    # Initialize answers dictionary
    answers_dict = {}
    
    # Modified patterns to match your PDF format
    question_pattern = r'Q\.(\d+)\s*\)?\s*(.+?)(?=Q\.|\Z)'  # Matches Q.1), Q.2, etc. and captures everything until next Q. or end
    
    # Find all questions and their content
    questions = re.finditer(question_pattern, text, re.DOTALL)
    
    for question in questions:
        q_num = question.group(1)
        q_content = question.group(2).strip()
        
        # Store in dictionary
        answers_dict[f'Q{q_num}'] = {
            'content': q_content
        }
        
        # Print for debugging
        print(f"\nFound Question {q_num}:")
        print(q_content)
        print("-" * 50)
    
    return answers_dict

def process_answer_sheet(pdf_path):
    """
    Main function to process PDF answer sheet and return answers dictionary
    """
    try:
        # Extract text from PDF using Vision API
        text = extract_text_from_pdf_using_vision(pdf_path)
        
        # Process the extracted text and get answers dictionary
        answers_dict = process_text(text)
        
        return answers_dict
    
    except Exception as e:
        print(f"Error processing answer sheet: {str(e)}")
        raise

In [33]:
def process_answer_sheet(pdf_path):
    """
    Main function to process PDF answer sheet and return answers dictionary
    """
    try:
        # Extract text from PDF using Vision API
        text = extract_text_from_pdf_using_vision(pdf_path)
        
        # Print raw extracted text for debugging
        print("\nExtracted text:")
        print(text)
        print("-" * 50)
        
        # Process the extracted text and get answers dictionary
        answers_dict = process_text(text)
        
        return answers_dict
    
    except Exception as e:
        print(f"Error processing answer sheet: {str(e)}")
        raise

In [42]:
# Set up Google Cloud credentials

# Process single PDF
pdf_path = "ProperTesting/Spam_AnswerKey.pdf"
answers = process_answer_sheet(pdf_path)

# Print the answers dictionary
print("\nExtracted Answers:")
print(answers)

# Access specific answers
# Example: Print answer for Q1(a) in Section A
if 'Section_A' in answers and 'Q1' in answers['Section_A']:
    print("\nAnswer to Q1(a):", answers['Section_A']['Q1'].get('a'))

Processing page 1/7
Processing page 2/7
Processing page 3/7
Processing page 4/7
Processing page 5/7
Processing page 6/7
Processing page 7/7

Extracted text:
Q.1
BIG O NOTATION
The Big O notation, where O stands for 'order of', is concerned with what happens for very large
values of n. For example, if a sorting algorithm performs n2 operations to sort just n elements,
then that algorithm would be described as an o(n2) algorithm.
If f(n) and g(n) are the functions defined on a positive integer number n, then
f(n) = O(g(n))
That is, f of n is Big-0 of g of n if and only if positive constants c and n exist, such that f(n)
≤cg(n). It means that for large amounts of data, f(n) will grow no more than a constant factor
than g(n). We have seen that the Big O notation provides a strict upper bound for f(n). This
means that the function f(n) can do better but not worse than the specified value. Big O notation
is simply written as f(n) E o(g(n)) or as f(n) = O(g(n)).
Here, n is the problem size an

In [43]:
answers.keys()

dict_keys(['Q1', 'Q2', 'Q3'])

In [45]:
answers['Q1']['content']

"BIG O NOTATION\nThe Big O notation, where O stands for 'order of', is concerned with what happens for very large\nvalues of n. For example, if a sorting algorithm performs n2 operations to sort just n elements,\nthen that algorithm would be described as an o(n2) algorithm.\nIf f(n) and g(n) are the functions defined on a positive integer number n, then\nf(n) = O(g(n))\nThat is, f of n is Big-0 of g of n if and only if positive constants c and n exist, such that f(n)\n≤cg(n). It means that for large amounts of data, f(n) will grow no more than a constant factor\nthan g(n). We have seen that the Big O notation provides a strict upper bound for f(n). This\nmeans that the function f(n) can do better but not worse than the specified value. Big O notation\nis simply written as f(n) E o(g(n)) or as f(n) = O(g(n)).\nHere, n is the problem size and o(g(n)) = {h(n):\npositive constants c, n, such that 0 ≤h (n) ≤ cg(n), Vn 2 no). Hence, we can say that o(g(n))\ncomprises a set of all the functio