Dependencies

In [13]:
import torch 
from transformers import (
    BertForQuestionAnswering,
    BertTokenizerFast
)

from scipy.special import softmax
import plotly.express as px
import pandas as pd
import numpy as np 
from PyPDF2 import PdfReader
import os

In [3]:
model_name = "deepset/bert-base-cased-squad2"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def extract_text_from_pdf(url: str) -> str:
    # creating a pdf reader object
    reader = PdfReader(url)
    
    # printing number of pages in pdf file
    text = ''    
    for page_num in range(len(reader.pages)):
        # Extract text from the current page
        page = reader.pages[page_num]
        text += page.extract_text()
    return text


In [4]:
def predict_answer(context, question):
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    start_scores, end_scores = softmax(outputs.start_logits)[0], softmax(outputs.end_logits)[0]
    start_idx = np.argmax(start_scores)
    end_idx = np.argmax(end_scores)
    confidence_score = (start_scores[start_idx] + end_scores[end_idx]) /2
    answer_ids = inputs.input_ids[0][start_idx: end_idx + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(answer_ids)
    answer = tokenizer.convert_tokens_to_string(answer_tokens)
    if answer != tokenizer.cls_token:
        return answer, confidence_score
    return None, confidence_score

In [7]:
context = extract_text_from_pdf('../data/raw/dataset/Xalaxian_Astral_Projection_Techniques.pdf')

In [9]:
predict_answer(context, "How can be achieved the astral projection?")

('by manipulating space - time', 0.677298903465271)

In [12]:
predict_answer(context, "Does the astral projection techniques could be use to explore alternate universes?")

('could be used to explore and interact with alternate universes',
 0.22129474580287933)

In [15]:
def group(sentences, group_size, stride):
    groups = []
    num_sentences = len(sentences)
    for i in range(0, num_sentences, group_size - stride):
        chunk = sentences[i: i + group_size]
        groups.append(chunk)
    return groups

In [14]:
base_path = '../data/raw/dataset/'
content = []
with os.scandir(base_path) as entries:
    for entry in entries:
        if entry.name.endswith(".pdf"):
            text = extract_text_from_pdf(base_path+entry.name)
            content.append(text)
        elif entry.name.endswith(".txt"):
            id = lambda x: x
            file = open(base_path+entry.name,"r+", encoding="utf8")
            lines = file.readlines()
            text = ' '.join(lines)
            content.append(text)
            file.close()
            
dataset = {'document': content}
df = pd.DataFrame(dataset)
df

Unnamed: 0,document
0,File Name: Xalaxian Advanced Materials Science...
1,File Name: Xalaxian Astral Projection Techniqu...
2,File Name: Xalaxian_Aurora-Equivalent_Energy_E...
3,File Name: Xalaxian_Aurora-like_Phenomena.pdf\...
4,File Name: Xalaxian Binary Star Energy Harvest...
...,...
99,File Name: Xalaxian Tranquil Energy Expanse Re...
100,File Name: Xalaxian_Twilight_Adaptation_Mechan...
101,File Name: Xalaxian_Twilight_Sky_Phenomena.pdf...
102,File Name: Xalaxian Universal Ethics Framework...


In [17]:
grouped_sentences = group(content, group_size=3, stride=1)

In [18]:
questions = ["Who are the Xalaxians?", "How can be achieved the astral projection?", "Is it possible to control climate?", "What is Comunal Memory?"]
answers = {}

In [25]:
def get_answers(grouped_sentences, questions):
    answers = {}
    for group in grouped_sentences:
        context = "\n".join(group)
        for question in questions:
            answer, score = predict_answer(context, question)
            if answer:
                print(answer)
                if question not in answers:
                    answers[question] = (answer, score)
                else:
                    if score > answers[question][1]:
                        answers[question] = (answer, score)
    return answers

In [26]:
get_answers(grouped_sentences, questions)

[CLS] How can be achieved the astral projection? [SEP] File Name : Xalaxian _ Aurora - Equivalent _ Energy _ Expressions. pdf Description : This document outlines the Xalaxian equivalent of Aurora energy expressions, as well as their uses in interstellar travel. Data : - The Xalaxian Aurora - equivalent energy expression is a form of hyper - dimensional energy manipulation, allowing for instantaneous access to distant locations in the universe. - The energy is generated from a particular type of crystal, known as a ‘ transdimensic ’ crystal, and is capable of creating a ‘ warp tunnel ’
[CLS] How can be achieved the astral projection? [SEP] File Name : Xalaxian Binary Star Energy Harvesting Description : This document explores the Xalaxians ' energy harvesting techniques, specifically in regards to binary star systems. Data : - The Xalaxians have developed a method of harvesting energy from binary star systems, utilizing the stars ' radiation to fuel their interstellar vessels and power

{'How can be achieved the astral projection?': ('energy manipulation',
  0.5457387566566467),
 'Who are the Xalaxians?': ('a species of advanced extraterrestrials',
  0.9828792810440063),
 'Is it possible to control climate?': ('weather control',
  0.41645583510398865),
 'What is Comunal Memory?': ('Group consciousness', 0.7157465815544128)}