This code is part of a three-part blog series on using Gemma 2B on Android for a project called SciGemma.

Check out the detailed blog post about this code here: https://medium.com/p/70abdc98abf0

In [None]:
# Imports
import torch
import pandas as pd
from pdfminer.high_level import extract_text
from transformers import pipeline
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from pypdf import PdfReader

In [None]:
def pdf_to_qa_dataset(pdf_path, output_csv_path, chunk_size=500,
                      model_name="bert-large-uncased-whole-word-masking-finetuned-squad"):
  """
  Converts a PDF to a Q&A dataset and saves it as a CSV file.

  The PDF text is extracted page by page, split into fixed-size character
  chunks, and each chunk is fed on its own to an extractive
  question-answering model. The highest-scoring token span of that pass is
  decoded and used as the "question"; the question-answering pipeline is then
  run with that question against the same chunk to obtain the "answer".

  Args:
    pdf_path: Path to the PDF file.
    output_csv_path: Path to save the CSV file.
    chunk_size: Number of characters per text chunk. Must be positive.
    model_name: Name or path of the question-answering checkpoint to load
      for both the tokenizer and the model.

  Returns:
    The `Dataset` built from the collected rows; each row has the keys
    "question", "answer" and "context".

  Raises:
    ValueError: If `chunk_size` is not positive.
  """
  if chunk_size <= 0:
    raise ValueError("chunk_size must be a positive integer")

  # Read the whole document; every page contributes its text plus a newline.
  # `or ""` guards against a page for which no text could be extracted.
  reader = PdfReader(pdf_path)
  text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)

  # Load a pre-trained question answering model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForQuestionAnswering.from_pretrained(model_name)
  qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
  qa_pairs = []

  # Split the text into chunks of `chunk_size` characters
  for offset in range(0, len(text), chunk_size):
    chunk = text[offset:offset + chunk_size]
    if not chunk.strip():
      # Nothing but whitespace: there is no span worth extracting.
      continue

    # Truncate so an unusually token-dense chunk cannot exceed the model's
    # maximum sequence length, and skip gradient tracking for inference.
    inputs = tokenizer(chunk, return_tensors="pt", truncation=True)
    with torch.no_grad():
      outputs = model(**inputs)

    # Most likely span boundaries (batch size is 1, so the flat argmax is
    # the token position).
    start_index = int(torch.argmax(outputs.start_logits))
    end_index = int(torch.argmax(outputs.end_logits))

    # An inverted span (end before start) yields an empty slice and is
    # skipped below. Special tokens such as [CLS]/[SEP] are dropped so they
    # never end up as the question text.
    span_ids = inputs["input_ids"][0][start_index:end_index + 1]
    question = tokenizer.decode(span_ids, skip_special_tokens=True).strip()
    if not question:
      continue

    # Now use the qa_pipeline to get the answer for the generated question
    answer = qa_pipeline(question=question, context=chunk)["answer"]

    qa_pairs.append({"question": question,
                     "answer": answer,
                     "context": chunk})

  # Create a Datasets.Dataset object
  qa_dataset = Dataset.from_list(qa_pairs)
  qa_dataset.to_csv(output_csv_path, index=False)
  return qa_dataset

In [None]:
# Example usage: build the Q&A dataset from one PDF and write it to a CSV file.
pdf_path = "/content/iesc101.pdf"  # input PDF; presumably a Colab path — adjust for your environment
output_csv_path = "qa_dataset.csv"  # Choose your desired filename
# The call returns the generated Dataset; the return value is not stored here.
pdf_to_qa_dataset(pdf_path, output_csv_path)

In [None]:
# Reload the generated CSV to inspect the question/answer/context rows.
# NOTE: the filename is hard-coded and has to match `output_csv_path` used above.
df = pd.read_csv('qa_dataset.csv')
df.head()  # preview the first rows

In [None]:
# Load the CSV file
# NOTE(review): this reads Science_data.csv, not the qa_dataset.csv written
# earlier — presumably a separately prepared file; confirm.
csv_file_path = '/content/Science_data.csv'  # Update this to your CSV file path
df = pd.read_csv(csv_file_path)  # rebinds `df`, replacing the DataFrame loaded earlier

# Convert the DataFrame to JSON Lines and save it
# (orient='records' with lines=True writes one JSON object per row, one per line).
jsonl_file_path = 'science_dataset_class9.jsonl'  # Update this to your desired output file path
df.to_json(jsonl_file_path, orient='records', lines=True)