In [1]:
# !pip install PyPDF2  # run this if PyPDF2 is not installed

In [2]:
import pandas as pd
import PyPDF2
import re

def extract_questions_from_pdf(pdf_path):
    """Collect numbered question strings from the text of a PDF.

    Each page is read with PyPDF2 and split into lines. Blank lines and lines
    recognised as page furniture (anything containing "Exam Date & Time:" or
    "https://  ", and lines starting with a "<n> of <m> <date>, <time> AM/PM"
    stamp) are dropped. A line that starts with digits followed by whitespace
    opens a new entry; every other kept line is appended to the entry that is
    currently being built.

    Entries never continue across a page break: whatever is pending when a
    page ends is emitted before the next page is processed.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list[str]: The stripped entries in reading order. Text that precedes
        the first numbered line of a page comes back as its own entry.
    """
    question_start = re.compile(r'^\d+\s+')
    page_stamp = r"\d+\s+of\s+\d+\s+\d+/\d+/\d+,\s+\d+:\d+\s+[AP]M"

    def is_noise(raw_line):
        # Same checks, in the same order, as the original one-line condition.
        if not raw_line.strip():
            return True
        if "Exam Date & Time:" in raw_line:
            return True
        if "https://  " in raw_line:
            return True
        return re.match(page_stamp, raw_line) is not None

    collected = []
    with open(pdf_path, 'rb') as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)
        for page_index in range(len(pdf.pages)):
            page_text = pdf.pages[page_index].extract_text()
            pending = ""  # entry being assembled for this page
            for raw_line in page_text.split('\n'):
                if is_noise(raw_line):
                    continue
                if question_start.match(raw_line):
                    # A new numbered line closes the entry in progress.
                    if pending:
                        collected.append(pending.strip())
                    pending = raw_line.strip()
                else:
                    pending += " " + raw_line.strip()
            if pending:
                collected.append(pending.strip())

    return collected

# Pull the raw entries out of the exam PDF
questions = extract_questions_from_pdf('/LAB/blooms/final_model/daa_paper.pdf')  # path to your pdf file

# Keep only the entries whose first character is a digit
filtered_questions = [entry for entry in questions if entry[0].isdigit()]

# One-column frame, numbered from 1 instead of 0
df = pd.DataFrame(filtered_questions, columns=['Questions'])
df.index += 1

# Show the frame, then save it with the index written as 'Question_Number'
print(df)
df.to_csv('/LAB/blooms/final_model/extracted_questions.csv', index_label='Question_Number')


                                            Questions
1   1 Given two algorithms, Algorithm X and Algori...
2   2 Arrange the following functions in ascending...
3   3 What is the time complexity of the recursive...
4   4 Consider a minimization problem where findin...
5   5 Which of the following is the correct equati...
6   6 Suppose you have candidate set C with coins ...
7   7 State true or false: Kruskal's algorithm can...
8   8 State True or false: The Bellman-Ford algori...
9   9 Choose the correct option for the following ...
10  10 Assume that the algorithms considered here ...
11  11 Let X be a problem that belongs to the clas...
12  12 State true or false: In huffman coding, The...
13  13 How many spanning trees does the given grap...
14  14 How many comparisons are needed to sort an ...
15  15 Consider the below table for jobs given wit...
16  16 What is the basic principle in Rabin Karp a...
17  17 What is recurrence and time complexity for ...
18  18 State Master Theorem.

In [3]:
import pandas as pd
import re

# Lift the column-width display limit so long question text is shown in full
pd.options.display.max_colwidth = None

# Reload the saved questions; the first CSV column is used as the index
df = pd.read_csv(r"/LAB/blooms/final_model/extracted_questions.csv", index_col=0)


In [4]:
def remove_text_after_number(text):
    """Cut `text` off right after its first parenthesised integer.

    The first occurrence of "(<digits>)" is located and the returned string
    runs from the start of `text` up to and including that closing
    parenthesis. When no such group exists, `text` is returned unchanged.
    """
    first_group = re.search(r'\((\d+)\)', text)
    if first_group is None:
        return text
    # end() is the index just past ')', so the slice keeps the whole group.
    return text[:first_group.end()]

# Drop whatever follows the first "(<digits>)" group in every question
df['Questions'] = df['Questions'].apply(remove_text_after_number)

In [5]:
# Display the 'Questions' column of df
df['Questions']

Question_Number
1      1 Given two algorithms, Algorithm X and Algorithm Y, with timecomplexities O(2^n) and O(n!), respectively, which of the followingstatements is true? 1) Algorithm Y is more efficient than Algorithm X for small input sizes, but Algorithm X becomes more efficient for larger input sizes.2) Algorithm X is more efficient than Algorithm Y for all input sizes.3) Algorithm Y is more efficient than Algorithm X for all input sizes.4) Algorithm X is more efficient than Algorithm Y for small input sizes, but Algorithm Y becomes more efficient for larger input sizes.(1)
2                                                                                                                                                                                                                                                                                                                                                                                        2 Arrange the following functions in 

In [6]:
def merge_rows_until_pattern(df, column):
    """Join consecutive rows until one contains a "(<digits>)" marks marker.

    Rows of `df[column]` are read in order. Rows without a parenthesised
    integer are buffered; the first row that does contain one closes the
    group, and the buffered rows plus that row are joined with single spaces
    into one entry. Rows left in the buffer at the end are joined into a
    final entry.

    The marker accepts any number of digits, matching the patterns used when
    the text is truncated and when the marks are extracted. A single-digit
    marker would treat a row ending in "(10)" as unfinished and glue it onto
    the following question.

    Args:
        df: Frame holding the text rows.
        column: Name of the string column to merge.

    Returns:
        pandas.DataFrame: A new frame with a fresh 0-based index and a single
        column named `column` containing the merged entries.
    """
    marks_marker = re.compile(r'\(\d+\)')
    pending = []
    merged_sentences = []

    for sentence in df[column]:
        pending.append(sentence)
        if marks_marker.search(sentence):
            # This row carries the marks, so the current group is complete.
            merged_sentences.append(' '.join(pending))
            pending = []

    # Rows after the last marker still form one (unterminated) entry.
    if pending:
        merged_sentences.append(' '.join(pending))

    return pd.DataFrame({column: merged_sentences})

# Merge the rows of df['Questions']; the result is a new frame with a single 'Questions' column
merged_df = merge_rows_until_pattern(df, 'Questions')


In [7]:
# Same call as in the previous cell: merged_df is rebuilt from df before the marks are stripped from it below
merged_df = merge_rows_until_pattern(df, 'Questions')
def extract_marks(row):
    """Return the integer inside a trailing "(<digits>)" of row['Questions'].

    Args:
        row: Mapping-like object (for example a DataFrame row) with a
            'Questions' entry holding the question text.

    Returns:
        int | None: The parenthesised number found at the end of the text,
        or None when the text does not end with such a group.
    """
    trailing_group = re.search(r'\((\d+)\)$', row['Questions'])
    return int(trailing_group.group(1)) if trailing_group else None
# Collect the result of extract_marks for every row of merged_df in a 'Marks' column
# (extract_marks returns None for rows whose text does not end in "(<digits>)")
extracted_marks = pd.DataFrame()
extracted_marks['Marks']= merged_df.apply(extract_marks, axis=1)

def remove_last_number(sentence):
    """Return `sentence` with every "(<digits>)" group deleted.

    Despite the name, all occurrences are removed, not only the last one.
    The surrounding text, including any whitespace next to a removed group,
    is left as it is.
    """
    parenthesised_number = re.compile(r'\(\d+\)')
    return parenthesised_number.sub('', sentence)

# Remove the "(<digits>)" groups from the question text (the marks were extracted from merged_df above)
merged_df['Questions'] = merged_df['Questions'].apply(remove_last_number)

# Place the question text and the 'Marks' column side by side
final_df=pd.concat([merged_df, extracted_marks], axis=1)

# Save the questions and marks; the index is not written to the file
final_df.to_csv('/LAB/blooms/final_model/extracted_questions_marks.csv',index=False)  # extracted Question & marks

In [8]:
# Display the questions together with their marks
final_df

Unnamed: 0,Questions,Marks
0,"1 Given two algorithms, Algorithm X and Algorithm Y, with timecomplexities O(2^n) and O(n!), respectively, which of the followingstatements is true? 1) Algorithm Y is more efficient than Algorithm X for small input sizes, but Algorithm X becomes more efficient for larger input sizes.2) Algorithm X is more efficient than Algorithm Y for all input sizes.3) Algorithm Y is more efficient than Algorithm X for all input sizes.4) Algorithm X is more efficient than Algorithm Y for small input sizes, but Algorithm Y becomes more efficient for larger input sizes.",1
1,"2 Arrange the following functions in ascending order of theirgrowth. F1(n)=2n, F2(n)=n3/2,F3(n)=nlog 2n, F4(n)=nlog2n 1) F3, F2, F1, F4 2) F2, F3, F1, F4 3) F2, F3, F4, F1 4) F3, F2, F4, F1",2
2,3 What is the time complexity of the recursive implementation used tofind the nth Fibonacci term? 1) Linear 2) Polynomial 3) Exponential 4) None of the above,1
3,4 Consider a minimization problem where finding the optimal solutionis computationally infeasible. Which of the following statementsabout approximation algorithms is true? 1) Approximation algorithms always guarantee the optimal solution.2) Approximation algorithms provide a solution that is arbitrarily close to the optimal solution.3) Approximation algorithms trade optimality for efficiency by providing a solution that is guaranteed to be within a certain factor of the optimal solution.4) Approximation algorithms are only used for maximization problems.,1
4,"5 Which of the following is the correct equation for the matrix chainmultiplication problem where mat[i-1] * mat[i] gives the dimensionof the ith matrix? 1) dp[i,j] = 1 if i=j dp[i,j] = min{dp[i,k] + dp[k+1,j]}2) dp[i,j] = 0 if i=j dp[i,j] = min{dp[i,k] + dp[k+1,j]} + mat[i- 1]*mat[k]*mat[j].3) dp[i,j] = 0 if i=j dp[i,j] = min{dp[i,k] + dp[k+1,j]}4) dp[i,j] = 1 if i=j dp[i,j] = min{dp[i,k] + dp[k+1,j]} + mat[i- 1]*mat[k]*mat[j]",1
5,"6 Suppose you have candidate set C with coins of differentdenominations and you want to find the change of an amount N. Youhave an infinite supply of each of coins in C. According to greedyalgorithm, which of the following options, with values of C and K,will NOT produce an optimal answer? 1) C=(1,3,4} and N=6 2) C=(1,4,9} and N=10 3) C=(1,3,8} and N=12 4) C=(1,3,4} and N=100",1
6,7 State true or false: Kruskal's algorithm can work for undirectedgraph only for finding MST. 1) true 2) false,1
7,"8 State True or false: The Bellman-Ford algorithm indicates whetherthere is a negative-weight cycle that is reachable from the source.If there is such a cycle, the algorithm indicates that no solutionexists. 1) true 2) false",1
8,9 Choose the correct option for the following table: Algorithm Design approach A. Huffman coding i. Dynamic Programming B.Bellman Ford ii. Backtracking,1
9,"10 Assume that the algorithms considered here sort the input sequencesin ascending order. If the input is already in ascending order,which of the following are TRUE? 1) Quicksort runs in Θ(n^2) time2) Bubble sort runs in Θ(n^2) time3) Mergesort runs in Θ(n) time4) Insertion sort runs in Θ(n) time",1


In [9]:
import tensorflow as tf

In [10]:
!pip install transformers

You should consider upgrading via the '/usr/bin/python -m pip install --upgrade pip' command.[0m


In [11]:
# Load the saved model with Keras.
# NOTE(review): the file has a .pkl extension but is read with tf.keras.models.load_model — confirm the format it was saved in.
blooms_model = tf.keras.models.load_model('/LAB/blooms/blooms_model.pkl')

2024-03-16 09:58:59.969338: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 18813 MB memory:  -> device: 0, name: NVIDIA RTX A5000, pci bus id: 0000:3b:00.0, compute capability: 8.6
2024-03-16 09:58:59.970677: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 22118 MB memory:  -> device: 1, name: NVIDIA RTX A5000, pci bus id: 0000:af:00.0, compute capability: 8.6


In [12]:
import numpy as np
import pandas as pd
from transformers import BertTokenizer
import tensorflow as tf

# Load the 'bert-base-cased' tokenizer that prepare_data uses to encode each question
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    """Encode one text into the input dictionary handed to the model.

    The text is encoded with `tokenizer.encode_plus`, asking for special
    tokens, truncation and padding to a fixed `max_length` of 256, and
    TensorFlow tensors as output. Both resulting tensors are cast to float64.

    Args:
        input_text: The text to encode.
        tokenizer: Object exposing `encode_plus` with the keyword arguments
            used below (a BertTokenizer in this notebook).

    Returns:
        dict: {'input_ids': ..., 'attention_mask': ...}, both cast to
        tf.float64.
    """
    encoding_options = dict(
        max_length=256,
        truncation=True,
        padding='max_length',
        add_special_tokens=True,
        return_tensors='tf',
    )
    encoded = tokenizer.encode_plus(input_text, **encoding_options)

    ids_as_float = tf.cast(encoded.input_ids, tf.float64)
    mask_as_float = tf.cast(encoded.attention_mask, tf.float64)
    return {'input_ids': ids_as_float, 'attention_mask': mask_as_float}

def make_prediction(model, processed_data, classes=['remember', 'understand', 'apply', 'analyze', 'evaluate','create']):
    """Pick the highest-scoring class for one prepared input.

    Args:
        model: Object with a `predict` method; the first element of its
            result is used as the score vector.
        processed_data: Input passed straight to `model.predict`.
        classes: Class labels indexed by score position.

    Returns:
        tuple: (label, level) where `label` is the entry of `classes` at the
        position of the largest score and `level` is that position plus one.
    """
    scores = model.predict(processed_data)[0]
    winner = np.argmax(scores)
    return classes[winner], winner + 1

def read_questions_from_csv(filename):
    """Load a CSV file and return it together with its question texts.

    Args:
        filename: Path of a CSV file that has a 'Questions' column.

    Returns:
        tuple: (frame, questions) where `frame` is the full DataFrame read
        from the file and `questions` is the 'Questions' column as a list.
    """
    frame = pd.read_csv(filename)
    return frame, frame['Questions'].tolist()

def calculate_question_paper_quality(df):
    """Compute the marks-weighted average of the encoded Bloom's levels.

    The frame is modified in place: 'Encoded_Blooms_Level' is converted to
    float, 'Question_Weightage' is added as the float version of 'Marks', and
    'Question_Paper_Quality' is added as level * weightage / total marks for
    each row.

    Args:
        df: Frame with 'Marks' and 'Encoded_Blooms_Level' columns.

    Returns:
        The sum of the per-row 'Question_Paper_Quality' values.
    """
    # Total is taken from the raw 'Marks' column before any conversion.
    total_marks = df['Marks'].sum()

    df['Encoded_Blooms_Level'] = df['Encoded_Blooms_Level'].astype(float)
    df['Question_Weightage'] = df['Marks'].astype(float)

    contributions = df['Encoded_Blooms_Level'].mul(df['Question_Weightage']).div(total_marks)
    df['Question_Paper_Quality'] = contributions
    return contributions.sum()




In [13]:
# !pip install --upgrade jupyter ipywidgets
# !pip install ipywidgets
# !jupyter nbextension enable --py widgetsnbextension  # if needed

In [14]:
def categorize_difficulty(difficulty_score):
    """Map a numeric score to a difficulty label.

    Returns "Easy" for scores up to and including 1.5, "Medium" for scores
    above 1.5 up to and including 3.5, and "High" for everything else.
    """
    # Upper bounds are checked in ascending order; the first one that holds wins.
    for upper_bound, label in ((1.5, "Easy"), (3.5, "Medium")):
        if difficulty_score <= upper_bound:
            return label
    return "High"

# Load the questions and marks saved by the extraction step
csv_filename = '/LAB/blooms/final_model/extracted_questions_marks.csv'
df, questions = read_questions_from_csv(csv_filename)

# Classify every question: keep both the predicted label and its 1-based encoded level
predicted_blooms_levels = []
predicted_blooms_encoded = [] 
for i, question in enumerate(questions):
    processed_data = prepare_data(question, tokenizer)
    result, encoded_value = make_prediction(blooms_model, processed_data=processed_data)
    predicted_blooms_levels.append(result)
    predicted_blooms_encoded.append(encoded_value)

df['Predicted_Blooms_Level'] = predicted_blooms_levels
df['Encoded_Blooms_Level'] = predicted_blooms_encoded  

# Relative path: the file is written to the current working directory.
# NOTE(review): this first write (three columns only) is overwritten by the
# full df.to_csv call further down, which targets the same file.
output_csv_filename = 'output_final.csv'  
df[['Questions', 'Predicted_Blooms_Level', 'Encoded_Blooms_Level']].to_csv(output_csv_filename, index=False)

# Adds 'Question_Weightage' and 'Question_Paper_Quality' columns to df and returns the overall score
question_paper_quality = calculate_question_paper_quality(df)

# Turn the numeric score into an Easy / Medium / High label
difficulty_code = categorize_difficulty(question_paper_quality)

# Final write: every column of df, including the ones added while scoring
df.to_csv(output_csv_filename, index=False)

print(f"Predicted blooms levels along with questions have been stored in {output_csv_filename}")
print(f"Score of the question paper is: {question_paper_quality}")
print(f"The difficulty  of the question paper is: {difficulty_code}")


2024-03-16 09:59:11.164056: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2024-03-16 09:59:13.100614: I tensorflow/stream_executor/cuda/cuda_blas.cc:1760] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Predicted blooms levels along with questions have been stored in output_final.csv
Score of the question paper is: 3.4444444444444438
The difficulty  of the question paper is: Medium
