# Imports

In [None]:
!sudo apt-get install poppler-utils
!sudo apt install tesseract-ocr-eng

In [None]:
!pip install pytesseract
!pip install pdf2image
!pip install transformers
!pip install bertviz
!pip install urllib3
!pip install sent2vec

In [None]:
# Restart kernel after running above cells

In [None]:
import numpy as np
import cv2
from PIL import Image as Img
from PIL import ImageDraw as Imgd
from PIL import ImageColor
from pdf2image import convert_from_path, convert_from_bytes
from IPython.display import display, Image
import pytesseract
import os
import shutil

In [None]:
import torchvision.ops.boxes as bops
import torch
import pandas as pd
import time
import math

import nltk
nltk.download('punkt')
nltk.download('all')
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

In [None]:
import torch
from bertviz import model_view

In [None]:
from sent2vec.vectorizer import Vectorizer
vectorizer = Vectorizer()

In [None]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import string

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
import matplotlib
matplotlib.use('Agg')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install pandas python-docx

# Text Extraction from PDFs using Whitespace Algorithm

In [None]:
#setting up tesseract for local systems
# tess_path = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# pytesseract.pytesseract.tesseract_cmd = tess_path
# print(pytesseract.get_languages(config='--tessdata-dir "C:/Program Files/Tesseract-OCR/tessdata"'))

In [None]:
# function to segment rows
def rows(hsv,row,hsv_img,pg_no):
    """Split a grayscale page into horizontal text bands and OCR each band.

    A pixel row counts as "ink" when any of its pixels differs from 255
    (white). Runs of blank rows taller than one pixel are treated as gaps;
    the pixels between two consecutive gaps form a text band that is handed
    to ``cols`` for column detection and OCR.

    Args:
        hsv: 2-D grayscale numpy array of the page (255 = white).
        row: number of pixel rows of ``hsv`` to scan.
        hsv_img: PIL image of the page; not used here, kept for callers.
        pg_no: page number, forwarded unchanged to ``cols``.

    Returns:
        str: text of all single/left-column bands followed by the text of
        all right-column bands.
    """
    # One vectorised pass instead of a Python loop over every pixel.
    has_ink = (hsv[:row] != 255).any(axis=1)

    # Each entry is [rows since previous ink row, previous ink row, this ink row].
    gaps = []
    run = 0
    start = 0
    for i in range(row):
        run += 1
        if has_ink[i]:
            gaps.append([run, start, i])
            run = 0
            start = i
    # A run of 1 means two adjacent ink rows, i.e. no whitespace between them.
    gaps = [gap for gap in gaps if gap[0] != 1]

    f_st1 = ""
    f_st2 = ""
    for current_gap, next_gap in zip(gaps, gaps[1:]):
        sp = current_gap[2]  # first ink row after the current gap
        ep = next_gap[1]     # last ink row before the next gap (exclusive end)
        band = hsv[sp:ep]
        band_img = Img.fromarray(band)
        width, height = band_img.size
        if height > 0 and width > 0:
            st1, st2 = cols(band_img, band, height, width, pg_no)
            f_st1 = f_st1 + st1  # single-column and left-column text
            f_st2 = f_st2 + st2  # right-column text only
    # Left/single column text first, then everything from the right column.
    return f_st1 + f_st2

In [None]:
# function to segment columns
def cols(hsv_img,hsv,height,width,pg_no):
    """Detect a two-column layout in one text band and OCR it.

    Starting at the horizontal midpoint, the nearest pixel column containing
    ink (any value != 255) is located on each side. If the blank strip
    between those two columns is wider than 40 pixels the band is treated as
    two text columns, which are OCR'd separately; otherwise the whole band
    is OCR'd as a single column.

    Args:
        hsv_img: PIL image of the band (used for the single-column case).
        hsv: 2-D grayscale numpy array of the same band.
        height: band height in pixels; not used here.
        width: band width in pixels.
        pg_no: page number; not used here.

    Returns:
        tuple[str, str]: ``(left_or_single_text, right_text)``; the second
        item is ``""`` for single-column bands.
    """
    mid = int(width/2)
    # One boolean per pixel column, instead of looping over every pixel.
    col_has_ink = (hsv != 255).any(axis=0)

    # Offsets i in [0, mid) such that column mid-i / mid+i contains ink.
    left_hits = np.flatnonzero(col_has_ink[mid:0:-1])
    right_hits = np.flatnonzero(col_has_ink[mid:2 * mid])
    sp = mid - int(left_hits[0]) if left_hits.size else mid
    ep = mid + int(right_hits[0]) if right_hits.size else mid

    st1 = ""
    st2 = ""
    if (ep-sp)>40:  # wide blank strip around the middle => two columns
        left_img = Img.fromarray(hsv[:,0:sp])       # crop for the left column
        right_img = Img.fromarray(hsv[:,ep:width])  # crop for the right column
        st1 = detect_text(left_img)
        st2 = detect_text(right_img)
    else:  # single-column band
        st1 = detect_text(hsv_img)
    return st1,st2

In [None]:
#detect text in passed image
def detect_text(hsv_img):
  """Run Tesseract OCR on a PIL image after enlarging it 4x in each dimension.

  Returns the recognised English text as a string.
  """
  enlarged_size = (hsv_img.width * 4, hsv_img.height * 4)
  enlarged = hsv_img.resize(enlarged_size)
  return pytesseract.image_to_string(enlarged, lang='eng',config= '--psm 3 --oem 1')

In [None]:
# Column-oriented store for extracted articles: one list per field,
# appended to in lock-step by text_extraction().
article_dict = {field: [] for field in ('id', 'text', 'title', 'abstract')}

In [None]:
text_corpora = ""

In [None]:
def text_extraction(pdf_dir='/content/drive/MyDrive/UG_ML/Dataset', max_files=1):
  """OCR the PDFs in ``pdf_dir`` and record their text in the notebook globals.

  Every page is rendered to an image, converted to grayscale and passed to
  ``rows`` for whitespace-based segmentation and OCR. The text of each PDF
  is appended to ``text_corpora`` and stored as a new entry in
  ``article_dict`` (title and abstract are placeholders: "testing").

  Args:
      pdf_dir: directory scanned (non-recursively) for ``*.pdf`` files.
      max_files: maximum number of PDFs to process. The default of 1 keeps
          the original "testing only" behaviour; pass ``None`` for no limit.
  """
  global article_dict, text_corpora
  processed = 0
  for filename in os.listdir(pdf_dir):
    if max_files is not None and processed >= max_files:
      break
    if not filename.endswith('.pdf'):
      continue
    pdf_file = os.path.join(pdf_dir, filename)
    print(pdf_file)
    page_texts = []
    for pg_no, image in enumerate(convert_from_path(pdf_file), start=1):
      # Despite the name, `hsv` is a single-channel grayscale array.
      hsv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
      hsv_img = Img.fromarray(hsv)
      row = np.size(hsv, axis=0)  # number of pixel rows on the page
      page_texts.append(rows(hsv,row,hsv_img,pg_no))  # page-wise text extraction
    text = "".join(page_texts)
    text_corpora = text_corpora + text
    processed += 1
    article_dict['id'].append(processed)
    article_dict['text'].append(text)
    article_dict['title'].append("testing")
    article_dict['abstract'].append("testing")

In [None]:
# Run the OCR pipeline; fills the globals article_dict and text_corpora.
text_extraction()

/content/drive/MyDrive/UG_ML/Dataset/Hydrolytic degradation of composites of poly(L-lactide-co-ɛ-caprolactone) 70-30 and β-tricalcium phosphate.pdf


In [None]:
text_corpora

'Journal of Biomaterials “Applications\n\nhttp://jba.sagepub.com/\n\x0c\x0cHydrolytic degradation of composites of poly(L-lactide-co-¢-caprolactone) 70/30 and £-tricalcium\nphosphate\n\x0cNiina Ahola, Minna Veiranto, Jaana Rich, Alexander Efimov, Markus Hannula, Jukka Seppala and Minna Kellomaki\nJ Biomater App! published online 9 October 2012\n\x0c\x0c\x0chttp://jba.sagepub.com/content/early/2012/09/14/0885328212462258\n\x0cA more recent version of this article was published on - Oct 11, 2012\n\x0c\x0cPublished b\n\n6SAGE\n\x0chtto://www.sagepublications.com\n\x0cAdditional services and information for Journal of Biomaterials Applications can be found at:\n\x0cEmail Alerts: http://jba.sagepub.com/cagji/alerts\n\x0cSubscriptions: http://jba.sagepub.com/subscriptions\n\x0cReprints: http:/Awww.sagepub.com/journalsReprints.nav\n\x0cPermissions: http://www.sagepub.com/journalsPermissions.nav\n\x0cOnlineFirst Version of Record - Oct 11, 2012\n\x0c\x0c\x0c\x0c\x0cArticle\n\x0c\x0c\x0c(C) The

In [None]:
article_dict

# Corpus Generation

In [None]:
# Word list of the NLTK Brown corpus.
words = brown.words()

# Preview the first 100 words only.
print(words[:100])
# Sentence view of the same corpus.
sentences = brown.sents()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]


In [None]:
_BROWN_VOCAB = None


def _brown_vocab():
    """Return the set of Brown-corpus words, built once and then cached."""
    global _BROWN_VOCAB
    if _BROWN_VOCAB is None:
        _BROWN_VOCAB = frozenset(brown.words())
    return _BROWN_VOCAB


def correct_text(input_text):
    """Replace every token that is not a Brown-corpus word with "[ERR]".

    The input is lower-cased and tokenised with NLTK; tokens are re-joined
    with single spaces.

    Args:
        input_text: raw text to check.

    Returns:
        str: the space-joined tokens, unknown ones replaced by "[ERR]".
    """
    # Membership is tested against a cached set: testing against
    # brown.words() directly scans the corpus once per token.
    vocab = _brown_vocab()
    # NOTE(review): tokens are lower-cased but the vocabulary keeps the
    # corpus' original casing, so words that only occur capitalised in Brown
    # are reported as errors. Kept as in the original; confirm the intent.
    tokens = nltk.word_tokenize(input_text.lower())
    return " ".join(tok if tok in vocab else "[ERR]" for tok in tokens)

In [None]:
def is_number_token(token):
    """Return True when ``float(token)`` succeeds, False on ValueError."""
    try:
        float(token)
    except ValueError:
        return False
    else:
        return True

In [None]:
def preprocess(text):
  """Tokenise ``text`` and return cleaned, lemmatised tokens.

  Steps: normalise curly quotes, drop bracketed numeric citations such as
  [1] or [1,2,3], tokenise, then discard stopwords, single punctuation
  characters, numeric tokens and one-character tokens, and lemmatise what
  remains. Filtering is case-insensitive, but the returned tokens keep
  their original casing.

  Note: a later cell defines another ``preprocess`` with a different return
  value; after that cell runs, this definition is shadowed.

  Args:
      text: raw text.

  Returns:
      list[str]: lemmatised tokens.
  """
  text = text.replace('“', '"').replace('”', '"').replace('‘', "'").replace('’', "'")

  # Remove string expressions like [1], [1,2,3,4], etc.
  cleaned_text = re.sub(r'\[\d+(,\s*\d+)*\]', '', text)

  # Tokenize the cleaned text into words
  tokens = word_tokenize(cleaned_text)

  stop_words = set(stopwords.words('english'))
  punctuations = set(string.punctuation)

  # Remove stopwords, punctuation-only tokens, numbers and 1-char tokens
  filtered_tokens = []
  for token in tokens:
    lowered = token.lower()
    if lowered in stop_words or lowered in punctuations:
      continue
    if is_number_token(token) or len(token) <= 1:
      continue
    filtered_tokens.append(token)

  # Lemmatize the tokens
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(token) for token in filtered_tokens]

In [None]:
# Tokens of the OCR'd PDF text accumulated in text_corpora.
final_tokens0 = preprocess(text_corpora)
print(final_tokens0)

['Journal', 'Biomaterials', '``', 'Applications', 'http', '//jba.sagepub.com/', 'Hydrolytic', 'degradation', 'composite', 'poly', 'L-lactide-co-¢-caprolactone', '70/30', '£-tricalcium', 'phosphate', 'Niina', 'Ahola', 'Minna', 'Veiranto', 'Jaana', 'Rich', 'Alexander', 'Efimov', 'Markus', 'Hannula', 'Jukka', 'Seppala', 'Minna', 'Kellomaki', 'Biomater', 'App', 'published', 'online', 'October', 'http', '//jba.sagepub.com/content/early/2012/09/14/0885328212462258', 'recent', 'version', 'article', 'published', 'Oct', 'Published', '6SAGE', 'htto', '//www.sagepublications.com', 'Additional', 'service', 'information', 'Journal', 'Biomaterials', 'Applications', 'found', 'Email', 'Alerts', 'http', '//jba.sagepub.com/cagji/alerts', 'Subscriptions', 'http', '//jba.sagepub.com/subscriptions', 'Reprints', 'http', '/Awww.sagepub.com/journalsReprints.nav', 'Permissions', 'http', '//www.sagepub.com/journalsPermissions.nav', 'OnlineFirst', 'Version', 'Record', 'Oct', 'Article', 'Author', 'Hydrolytic', 'd

In [None]:
# Tokenise a local UTF-8 text file (path is relative to the working directory).
file_path = 'Material_properties.txt'  # Replace with the actual path of your text file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()
final_tokens1 = preprocess(text)
print(final_tokens1)

In [None]:
# Tokenise a second local UTF-8 text file; reuses (overwrites) file_path and text.
file_path = 'Biomaterials_corpus.txt'  # Replace with the actual path of your text file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()
final_tokens2 = preprocess(text)
print(final_tokens2)

In [None]:
# The glossary is published as one page per letter (…-glossary-a/ to …-glossary-z/).
# Building the list from the alphabet requests each page exactly once.
glossary_url = "https://www.dierk-raabe.com/glossary-of-materials-science/materials-science-glossary-{}/"
urls = [glossary_url.format(letter) for letter in string.ascii_lowercase]

# Variable to store the scraped text
scraped_text = ""

# Iterate over the URLs
for url in urls:
    # Send an HTTP GET request; the timeout keeps one stalled page from hanging the cell
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print("Failed to retrieve the webpage.", url, exc)
        continue

    # Check if the request was successful
    if response.status_code == 200:
        # Parse the HTML content of the webpage
        soup = BeautifulSoup(response.content, "html.parser")
        paragraphs = soup.find_all("p")

        # Keep the text of every <p> element, one per line
        scraped_text += "\n".join([p.get_text() for p in paragraphs]) + "\n"
    else:
        # Print an error message if the request was unsuccessful
        print("Failed to retrieve the webpage. Status code:", response.status_code, url)

In [None]:
# Tokens of the scraped glossary paragraphs.
final_tokens3 = preprocess(scraped_text)
print(final_tokens3)

['Ab-initio', 'lat', '``', 'beginning', "''", 'parameter', 'free', 'method', 'based', 'quantum', 'mechanic', 'electrodynamics', 'solely', 'universal', 'fundamental', 'physical', 'constant', 'needed', 'term', 'ab', 'initio', 'may', 'used', 'different', 'context', 'natural', 'science', 'technical', 'point', 'view', 'following', 'calculation', 'said', '``', 'ab', 'initio', "''", '``', 'first', 'principle', "''", 'relies', 'basic', 'established', 'law', 'nature', 'without', 'additional', 'assumption', 'special', 'model', 'contrast', 'practice', 'according', 'measurement', 'difference', 'ab', 'initio', 'calculation', 'experimental', 'result', 'may', 'show', 'mistake', 'experimental', 'set', 'may', 'deliver', 'accurate', 'determination', 'influencing', 'parameter', 'display', 'far', 'unknown', 'effect', 'use', 'measured', 'value', 'called', 'semi-empirical', 'calculation', 'chemistry', 'two', 'field', 'use', 'term', 'ab', 'initio', 'synthesis', 'mean', 'production', 'chemical', 'compound', '

# Relevance Score using BERT

In [None]:
def test_function(folder_path='/content/drive/MyDrive/UG_ML/Dataset', max_pdfs=1):
  """Extract the abstract of PDFs in ``folder_path`` via OCR.

  For each PDF the first two pages are rendered and OCR'd in order; the
  abstract is the text between the first "Abstract" and the first
  "Introduction" on the first page that contains both markers. When neither
  page qualifies the abstract is the string 'NaN'.

  Args:
      folder_path: directory scanned (non-recursively) for ``*.pdf`` files.
      max_pdfs: maximum number of PDFs to process. The default of 1 keeps
          the original behaviour of returning after the first PDF; pass
          ``None`` to process all of them.

  Returns:
      pandas.DataFrame with columns 'PDF' (file name) and 'Abstract'
      (empty when the folder holds no PDF).
  """
  def _abstract_from(page_text):
      # Test the raw find() results: once len('Abstract') has been added,
      # a missing marker can no longer be recognised as -1.
      marker = page_text.find('Abstract')
      end_index = page_text.find('Introduction')
      if marker == -1 or end_index == -1:
          return None
      return page_text[marker + len('Abstract'):end_index].strip()

  records = []
  for filename in os.listdir(folder_path):
      if max_pdfs is not None and len(records) >= max_pdfs:
          break
      if not filename.endswith('.pdf'):
          continue
      pdf_path = os.path.join(folder_path, filename)

      # Render at most the first two pages; one-page PDFs yield one image.
      images = convert_from_path(pdf_path, first_page=1, last_page=2)

      abstract = 'NaN'
      for page_image in images[:2]:
          # OCR the rendered page directly (no temporary JPEG on disk).
          found = _abstract_from(pytesseract.image_to_string(page_image))
          if found is not None:
              abstract = found
              break

      records.append({'PDF': filename, 'Abstract': abstract})

  # Build the frame once; DataFrame.append is deprecated.
  return pd.DataFrame(records, columns=['PDF', 'Abstract'])

In [None]:
# One row per processed PDF: file name and OCR-extracted abstract.
data_df = test_function()
#data_df = pd.DataFrame({'PDF': article_dict['title'], 'Abstract': article_dict['abstract']})

  df = df.append({'PDF': filename, 'Abstract': abstract}, ignore_index=True)


In [None]:
# These tags are taken from the corpus generation part
list_of_tags = ['Increasing', 'importance', 'placed', 'hydroxyapatite-derived', 'scaffold', 'bone', 'tissue', 'regeneration', 'application', 'alternative', 'bone', 'graft', 'Insufficient', 'worldwide', 'donor', 'potential', 'risk', 'disease', 'transmission', 'affirm', 'autograft', 'allograft', 'sustainable', 'approach', 'bone', 'substitute', 'Furthermore', 'hydroxyapatite', 'biological', 'similarity', 'bone', 'tissue', 'well', 'abundantly', 'available', 'offering', 'environmentally', 'friendly', 'solution', 'biomedical', 'chemical', 'material', 'ceramic', 'metal', 'polymer', 'bio', 'reaction', 'composite']

In [None]:
labeled_data = []

In [None]:
# Weakly label every abstract sentence: 1 if it contains any tag from
# list_of_tags (case-sensitive substring match), else 0, and encode it
# as a fixed-length list of BERT token ids.

# Load the tokenizer once; it is reused by later cells under this name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Start from an empty list so that re-running this cell does not duplicate samples.
labeled_data = []
for counter, abstract in enumerate(data_df['Abstract']):
  print(counter)
  sentences = sent_tokenize(abstract)
  for sentence in sentences:
    label = 1 if any(word in sentence for word in list_of_tags) else 0

    # Pad/truncate explicitly to 31 tokens so all rows stack into one tensor.
    input_ids = tokenizer.encode(
        sentence,
        add_special_tokens=True,
        max_length=31,
        padding='max_length',
        truncation=True,
    )
    labeled_data.append((input_ids, label))

0


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

In [None]:
data_df

Unnamed: 0,PDF,Abstract
0,Hydrolytic degradation of composites of poly(L...,There is an increasing need for synthetic bone...


In [None]:
# Turn the (token ids, label) pairs into tensors and wrap them in a
# randomly sampled DataLoader with batches of 32.
token_id_rows = [ids for ids, _ in labeled_data]
label_values = [lab for _, lab in labeled_data]
inputs = torch.tensor(token_id_rows)
labels = torch.tensor(label_values)
dataset = TensorDataset(inputs, labels)
sampler = RandomSampler(dataset)
dataloader = DataLoader(
    dataset,
    sampler=sampler,
    batch_size=32,
)

In [None]:
# Pretrained BERT encoder with a new 2-class classification head;
# attention weights are returned alongside the logits.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=True,
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tune the classifier for 10 epochs on the weakly labelled sentences.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
pad_token_id = tokenizer.pad_token_id

# from_pretrained() returns the model in eval mode; enable dropout for training.
model.train()
for epoch in range(10):
    running_loss = 0.0
    for input_ids, targets in dataloader:
        optimizer.zero_grad()

        # Inputs are padded to a fixed length, so mask the padding
        # positions out of self-attention.
        attention_mask = (input_ids != pad_token_id).long()
        loss = model(input_ids, attention_mask=attention_mask, labels=targets)[0]

        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # max(..., 1) avoids a ZeroDivisionError when the dataloader is empty.
    print(f'Epoch {epoch + 1}, Loss: {running_loss / max(len(dataloader), 1)}')
# Back to eval mode so that later inference cells are deterministic.
model.eval()

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch 1, Loss: 1.1492286920547485
Epoch 2, Loss: 0.6559097170829773
Epoch 3, Loss: 0.347515732049942
Epoch 4, Loss: 0.18420419096946716
Epoch 5, Loss: 0.12971243262290955
Epoch 6, Loss: 0.11232143640518188
Epoch 7, Loss: 0.09312210232019424
Epoch 8, Loss: 0.08443690091371536
Epoch 9, Loss: 0.08028579503297806
Epoch 10, Loss: 0.07591656595468521


In [None]:
# Score every abstract as the fraction of its sentences classified as 1.
prediction_list = []   # per abstract: predicted label of each sentence
prediction_score = []  # per abstract: share of sentences predicted as 1
model.eval()  # inference: disable dropout
for abstract in data_df['Abstract']:
  predicted_labels = []
  for sentence in sent_tokenize(abstract):
    # Truncate to BERT's 512-token limit so long sentences cannot crash the model.
    input_ids = tokenizer.encode(sentence, add_special_tokens=True, truncation=True, max_length=512)
    inputs = torch.tensor(input_ids).unsqueeze(0)  # batch of one

    with torch.no_grad():
        outputs = model(inputs)
        _, predicted_label = torch.max(outputs[0], 1)

    predicted_labels.append(predicted_label.item())

  t_sum = predicted_labels.count(1)
  f_sum = len(predicted_labels) - t_sum
  # An abstract without sentences gets 0.0 instead of a ZeroDivisionError.
  score = t_sum / (t_sum + f_sum) if predicted_labels else 0.0
  prediction_score.append(score)
  prediction_list.append(predicted_labels)

In [None]:
# Attach the classifier score; prediction_score must hold one value per row.
data_df['p_score'] = prediction_score

In [None]:
data_df

Unnamed: 0,PDF,Abstract,p_score
0,Hydrolytic degradation of composites of poly(L...,There is an increasing need for synthetic bone...,1.0


In [None]:
# Embed the PDF file names (used as titles) with the sent2vec vectorizer.
vectorizer.run(list(data_df['PDF']))
vectors = vectorizer.vectors

In [None]:
# Working frame that pairs each PDF and abstract with its title embedding.
df4 = pd.DataFrame({'PDF': data_df['PDF'], 'Abstract': data_df['Abstract'], 'title_embedding': vectors})

In [None]:
df4

Unnamed: 0,PDF,Abstract,title_embedding
0,Hydrolytic degradation of composites of poly(L...,There is an increasing need for synthetic bone...,"[-0.8789095, -0.047567263, -0.21365784, -0.260..."


In [None]:
# df4['title_embedding'] = df4['title_embedding'].apply(eval).apply(np.array)

In [None]:
# Interactive query; every similarity score computed below depends on this value.
search_term = input('Enter a search term: ')

Enter a search term: hydroxyapatite scaffold


In [None]:
# Embed the query as ONE sentence. list(search_term) would split the string
# into single characters and embed each character separately.
vectorizer.run([search_term])
search_vector = vectorizer.vectors

In [None]:
# The last vector produced by the vectorizer is used as the query embedding.
search_embeddings = search_vector[-1]

In [None]:
from numpy.linalg import norm

# Cosine similarity between each title embedding and the query embedding.
query_norm = norm(search_embeddings)
df4["similarities"] = df4['title_embedding'].apply(
    lambda title_vec: np.dot(title_vec, search_embeddings) / (norm(title_vec) * query_norm)
)
data_df['sim_score'] = df4['similarities']

In [None]:
data_df

Unnamed: 0,PDF,Abstract,p_score,sim_score
0,Hydrolytic degradation of composites of poly(L...,There is an increasing need for synthetic bone...,1.0,0.696043


In [None]:
# Mean cosine similarity between the query and each sentence of an abstract.
abstract_scores = []
for abstract in df4['Abstract']:
  sentences = sent_tokenize(abstract)
  if not sentences:
    # No sentences to compare: score 0.0 instead of dividing by zero.
    abstract_scores.append(0.0)
    continue
  # Use the fresh vectorizer for this abstract (the original created it but
  # then called the shared `vectorizer`), so the vectors scored below belong
  # to these sentences only, whether run() replaces or extends `.vectors`.
  vectorizer2 = Vectorizer()
  vectorizer2.run(sentences)
  sentence_vectors = vectorizer2.vectors
  abstract_score = [np.dot(x,search_embeddings)/(norm(x)*norm(search_embeddings)) for x in sentence_vectors]
  # Named mean_score so the builtin sum() is not shadowed.
  mean_score = math.fsum(abstract_score)/len(abstract_score)
  abstract_scores.append(mean_score)

Initializing Bert distilbert-base-uncased
Vectorization done on cpu


In [None]:
# Attach the per-abstract mean query similarity.
data_df['abs_score'] = abstract_scores

In [None]:
# Weights of the three components in final_score.
ratio_1 = 1    # weight of p_score
ratio_2 = 4.5  # weight of sim_score
ratio_3 = 4.5  # weight of abs_score

In [None]:
# Weighted sum of classifier score, title similarity and abstract similarity.
data_df['final_score'] = ratio_1*data_df['p_score'] + ratio_2*data_df['sim_score'] + ratio_3*data_df['abs_score']

In [None]:
# Rank papers by relevance: highest final_score first, so that head()
# shows the most relevant papers rather than the least relevant ones.
sorted_df = data_df.sort_values(by=['final_score'], ascending=False)
sorted_df.head()

Unnamed: 0,PDF,Abstract,p_score,sim_score,abs_score,final_score
0,Hydrolytic degradation of composites of poly(L...,There is an increasing need for synthetic bone...,1.0,0.696043,0.904119,8.200729


# Extracting testing data from relevant papers

In [None]:
def taging(lemm):
  """Return NLTK part-of-speech tags as a list of (token, tag) pairs."""
  return nltk.pos_tag(lemm)

In [None]:
#creating sentence tokens
def sen_tok(sents):
  """Tokenise text and separate numeric sentences from the reference section.

  Returns a tuple ``(ref, final, cd_sent)``:
    ref     - all sentences from the first one containing the token
              'References' onwards, concatenated without separator;
    final   - the word tokens of every sentence, in order;
    cd_sent - sentences before the reference section that contain a token
              tagged 'CD' (cardinal number), concatenated without separator.
  """
  in_references = False
  reference_parts = []
  cardinal_parts = []
  all_words = []
  for sentence in sent_tokenize(sents):
    tokens = word_tokenize(sentence)
    pos_tags = {pos for _, pos in taging(tokens)}
    # Checked before the flag is updated, so the sentence that opens the
    # reference section can still count as a numeric sentence.
    if not in_references and 'CD' in pos_tags:
      cardinal_parts.append(sentence)
    if in_references or 'References' in tokens:
      in_references = True
      reference_parts.append(sentence)
    all_words.extend(tokens)
  return "".join(reference_parts), all_words, "".join(cardinal_parts)

In [None]:
#lemmatization
def lemmat(filtered):
  """Lemmatise each word as a verb (pos="v") with WordNet; returns a list."""
  verb_lemmatizer = WordNetLemmatizer()
  lemm = []
  for word in filtered:
    lemm.append(verb_lemmatizer.lemmatize(word, pos="v"))
  return lemm

In [None]:
#removing stop words, lemmatizing filtered text
def preprocess(txt):
  """Tokenise ``txt``, drop English stopwords and lemmatise the rest as verbs.

  Note: this redefines (shadows) the ``preprocess`` from the corpus
  generation section, which returned a single token list.

  Args:
      txt: raw text.

  Returns:
      tuple ``(lemm, ref, cd_sent)``: the lemmatised non-stopword tokens, the
      reference-section text and the numeric sentences, the latter two as
      produced by ``sen_tok``.
  """
  stop_words = set(stopwords.words("english"))
  ref, sent_words, cd_sent = sen_tok(txt)
  # The debug prints of the stop-word set and of every token were removed;
  # they flooded the cell output.
  filtered = [word for word in sent_words if word.casefold() not in stop_words]
  lemm = lemmat(filtered)
  return lemm,ref,cd_sent

In [None]:
#filter lemmatized text from punctuations
def filter(txt):
  """Drop tokens that are a single punctuation character.

  The removed characters are those of ``string.punctuation`` plus '±',
  except '%', which is kept. Note that this function shadows the builtin
  ``filter``.
  """
  import string
  dropped = tuple(ch for ch in string.punctuation + '±' if ch != '%')
  kept = []
  for tok in txt:
    if tok in dropped:
      continue
    kept.append(tok)
  return kept

In [None]:
#take input and create filtered text for analysis
# Build the analysis text from the first (up to) six extracted articles and filter it.
# NOTE(review): the name says "top5" but six articles are taken — confirm which is intended.
top5_input = "".join(article_dict['text'][:6])
pgtxt,ref,cd_sent = preprocess(top5_input)
text = list(pgtxt)
t_card = cd_sent
references = ref
f_text = filter(text)
print(pgtxt)
print(len(f_text))

{'your', 'ourselves', 'an', 'she', 'ma', 'hers', 'were', 'doesn', "mustn't", 'above', "hadn't", 'its', 'here', 'i', 'does', 'same', "couldn't", 'as', 'of', 'did', 'too', 'by', 'her', 'own', 'at', 'm', 'on', 'with', 'a', 'mustn', 'yours', 'all', 'them', 'more', 'is', 'o', 'shouldn', 'why', 'if', 'during', 'be', 'll', "aren't", 'ours', 'not', "should've", "you've", 'me', 'wouldn', 'he', 'few', "wasn't", 'having', 'and', "it's", "shan't", 'yourselves', 'because', 'up', 'over', 've', 'haven', 'mightn', "mightn't", 'those', "she's", 'for', 'whom', 'it', 'from', 'between', 'wasn', 'weren', 'the', 'about', 'aren', 'y', "needn't", 'what', 'then', 'other', "wouldn't", 'should', 'how', 'nor', 'himself', 'doing', 'themselves', 'him', 'have', 'you', 'won', "didn't", 'than', 'which', "shouldn't", 'this', 'are', 'don', 'itself', "haven't", 'herself', 'while', 'any', 'out', 'until', 'again', 'who', 'will', 'd', 'didn', 'further', 'very', 'been', 'so', 're', 'that', 's', 'theirs', 'am', 'most', 'hasn'

In [None]:
#craeting frequency distribution and displaying 30 most frequent words
import matplotlib.pyplot as plt
from nltk import FreqDist
# Frequency distribution of the filtered tokens; keep the 30 most common.
frequency_distribution = FreqDist(f_text)
mc = frequency_distribution.most_common(30)
mc_n = [word for word, _ in mc]
tags = taging(mc_n)
# Drop tokens shorter than 3 characters and tokens tagged as cardinal numbers.
f_mc = [word for word, pos in tags if len(word) >= 3 and pos != 'CD']
freq = [occurrences for word, occurrences in mc if word in f_mc]
print(f_mc,"\n",freq)
plt.figure(figsize=(10, 10))
plt.bar(f_mc, freq, color ='blue', width = 0.6)
plt.show()

['TCP', 'sample', 'PLCL', 'composites', 'degradation', 'weeks', 'test', 'vitro', 'use', 'polymer', 'phosphate', 'copolymer', 'time', 'poly', 'study', 'show', 'increase', 'content', 'water', 'Figure', 'B-TCP', 'mass'] 
 [66, 63, 61, 60, 58, 55, 48, 44, 43, 43, 39, 36, 34, 30, 29, 27, 26, 26, 25, 25, 24, 24]


In [None]:
#checkboxes for user to choose desired words
from ipywidgets import widgets, Layout
# One checkbox per frequent word, shown in a three-column grid.
checkboxes = [widgets.Checkbox(description=word) for word in f_mc]
grid_layout = Layout(
    width='50%',
    grid_template_rows='auto auto',
    grid_template_columns='25% 30% 25%',
)
widgets.GridBox(checkboxes, layout=grid_layout)

GridBox(children=(Checkbox(value=False, description='TCP'), Checkbox(value=False, description='sample'), Check…

In [None]:
#extract output obtained from checkboxes
# Collect the descriptions (words) of all ticked checkboxes.
search = [box.description for box in checkboxes if box.value]
print(search)

['TCP', 'sample', 'PLCL', 'composites', 'degradation', 'test', 'vitro', 'polymer', 'phosphate', 'copolymer', 'time', 'water', 'B-TCP', 'mass']


In [None]:
#extract test conditions based on checkboxes
# For every selected keyword, scan a window of about 50 tokens on either
# side and print each "<number> <unit>" pair found, with two tokens of
# context on each side where available.
count = 0
w = -50  # 1-based position of the previous keyword hit
l = len(f_text)
units_sear = ['kg','g','kgf','gf','N','kN','mg','°C','μL','ns','g/cm3','kg/m3','K','%','nm','cm','°','h','M','ml','rpm','°/min','mg/ml']
pos_cache = {}  # token -> POS tag, so each distinct token is tagged only once
for tok in f_text:
  count+=1
  minimum = max(0,count-50)
  maximum = min(count+50,l)
  if tok in search:
    if count-w<49:
      # Windows overlap: skip the part already scanned for the previous hit.
      minimum = w+49
    w=count
    # Stop at l-1 so that f_text[j+1] always exists (the last token has no unit after it).
    for j in range(minimum,min(maximum,l-1)):
      candidate = f_text[j]
      if candidate not in pos_cache:
        pos_cache[candidate] = taging([candidate])[0][1]
      pos = pos_cache[candidate]
      if pos == 'CD' and (f_text[j+1]) in units_sear:
        if j==l-2:
          # Unit is the last token: no trailing context available.
          print(tok, ":", f_text[j], f_text[j+1])
        elif j>=2:
          print(tok, ":", f_text[j-2], f_text[j-1], f_text[j], f_text[j+1], f_text[j+2])
        else:
          # Too close to the start for leading context.
          print(tok, ":", f_text[j], f_text[j+1], f_text[j+2])

composites : 20 35 50 % -tricalcium
B-TCP : et al. 60wt % 8-TCP
composites : glass bottle 20 ml Sorensen
vitro : weigh approximately 300 mg total
phosphate : measure resolution 0.02 % Merck
sample : point weigh 160 mg test
time : detection limit 0.02wt % e¢-caprolactone
phosphate : detection limit 0.02wt % e¢-caprolactone
degradation : vitro PLCL 50 % TCP
polymer : polymer decrease 14 % decrease
polymer : % decrease 29 % signal
polymer : composite contain 50 % £
B-TCP : composite contain 50 % £
polymer : vitro PLCL+ 50 % TCP
copolymer : vitro PLCL+ 50 % TCP
vitro : vitro PLCL+ 50 % TCP
degradation : decrease approximately 96 % initial
test : weeks decrease 99 % mer
composites : ®PLCL PLCL 10 % TCP
composites : w~ PLCL 20 % TCP
composites : ® PLCL 35 % TCP
composites : TCP O-PLCL 50 % TCP
polymer : L-lactide-co-¢-caprolactone PLCL 20 % tricalcium
time : 2.8 PLCL 10 % TCP
water : TCP PLCL 20 % TCP
mass : TCP PLCL 35 % TCP
test : TCP PLCL 50 % TCP
PLCL : absorption -PLCL 10 % TCP
PLCL : a

In [None]:
!pip install pandas python-docx

Collecting python-docx
  Downloading python-docx-0.8.11.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: python-docx
  Building wheel for python-docx (setup.py) ... [?25l[?25hdone
  Created wheel for python-docx: filename=python_docx-0.8.11-py3-none-any.whl size=184487 sha256=6927714091e412607edba64acf12e7f0406e0c4e045194da0b01b8eadf2da863
  Stored in directory: /root/.cache/pip/wheels/80/27/06/837436d4c3bd989b957a91679966f207bfd71d358d63a8194d
Successfully built python-docx
Installing collected packages: python-docx
Successfully installed python-docx-0.8.11


In [None]:
import pandas as pd
from docx import Document

# Write the extracted "<number> <unit>" snippets to a Word document.
# NOTE(review): all headings are written first and all snippets follow the
# last heading, because f_text is the concatenation of all papers — confirm
# whether per-paper sections are wanted.
titles = data_df['PDF']
doc = Document()
for title in titles:
    # Strip the file extension including the dot ("paper.pdf" -> "paper").
    doc.add_heading(os.path.splitext(title)[0], level=1)
count = 0
w = -50  # 1-based position of the previous keyword hit
l = len(f_text)
units_sear = ['kg','g','kgf','gf','N','kN','mg','°C','μL','ns','g/cm3','kg/m3','K','%','nm','cm','°','h','M','ml','rpm','°/min','mg/ml']
pos_cache = {}  # token -> POS tag, so each distinct token is tagged only once
for tok in f_text:
    count += 1
    minimum = max(0, count - 50)
    maximum = min(count + 50, l)

    if tok in search:
        if count - w < 49:
            # Windows overlap: skip the part already scanned for the previous hit.
            minimum = w + 49
        w = count
        # Stop at l-1 so that f_text[j + 1] always exists.
        for j in range(minimum, min(maximum, l - 1)):
            candidate = f_text[j]
            if candidate not in pos_cache:
                pos_cache[candidate] = taging([candidate])[0][1]
            pos = pos_cache[candidate]

            if pos == 'CD' and (f_text[j + 1]) in units_sear:
                if j == l - 2:
                    # Unit is the last token: no trailing context available.
                    snippet = f"{tok} : {f_text[j]} {f_text[j + 1]}"
                elif j >= 2:
                    snippet = f"{tok} : {f_text[j - 2]} {f_text[j - 1]} {f_text[j]} {f_text[j + 1]} {f_text[j + 2]}"
                else:
                    # Too close to the start for leading context.
                    snippet = f"{tok} : {f_text[j]} {f_text[j + 1]} {f_text[j + 2]}"

                # Add the text below the header
                doc.add_paragraph(snippet)

doc.save('extraction.docx')