In [1]:
import tkinter as tk
from tkinter import ttk
import torch.nn as nn
import torch.optim as optim
import torch
import pandas as pd
import numpy as np

In [2]:
import nltk

# Request the same four NLTK resources, in the same order, with one call per package.
for nltk_package in ('punkt', 'words', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Sentence splitter used later when an abstract is processed through the GUI.
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\muhammadawais.naeem\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\muhammadawais.naeem\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\muhammadawais.naeem\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\muhammadawais.naeem\AppData\Roaming\nltk_data
[nltk_data]     ...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Load the training split and preview its first five rows.
train_csv_path = 'Dataset/train.csv'
train_data = pd.read_csv(train_csv_path)
train_data.head(5)

Unnamed: 0,Abstract Name,Text,Label
0,###24491034,The emergence of HIV as a chronic condition me...,BACKGROUND
1,###24491034,This paper describes the design and evaluation...,BACKGROUND
2,###24491034,This study is designed as a randomised control...,METHODS
3,###24491034,The intervention group will participate in the...,METHODS
4,###24491034,The program is based on self-efficacy theory a...,METHODS


In [4]:
# Vocabulary: every whitespace-separated token in the training text gets the next free
# integer id, in order of first appearance (ids start at 0).
word_to_idx = {}
# Longest sentence (in tokens) seen so far; later used as the pad/truncate length.
max_sequence_length = 0

for sent in train_data['Text']:
    tokens = sent.split()
    max_sequence_length = max(max_sequence_length, len(tokens))

    for token in tokens:
        # setdefault only inserts when the token is new, so existing ids never change
        word_to_idx.setdefault(token, len(word_to_idx))

# Next unused id == number of distinct tokens
idx = len(word_to_idx)

# Record the total input size to be length of the indexed dictionary
input_size = idx + 2 # to be on the safer side
print(f"Input Size: {input_size}")
print(f"Max Sequence Length: {max_sequence_length}")

Input Size: 299826
Max Sequence Length: 338


In [5]:
def pad_or_truncate(sequence, length):
    """Force a 1-D index tensor to `length` entries.

    Args:
        sequence: tensor to adjust (callers in this notebook pass a ``torch.long`` tensor).
        length: target number of entries.

    Returns:
        `sequence` followed by ``torch.long`` zeros when it is shorter than `length`,
        otherwise its first `length` entries.
    """
    missing = length - len(sequence)
    if missing > 0:
        filler = torch.zeros(missing, dtype=torch.long)
        return torch.cat([sequence, filler])
    return sequence[:length]

In [6]:
# The transform function will skip any word which is not present in the initial dictionary
def text_transform_func(text):
    """Encode `text` as a fixed-length tensor of vocabulary indices.

    The text is split on whitespace; tokens missing from `word_to_idx` are dropped.
    The remaining indices are padded with zeros or truncated to `max_sequence_length`.

    Args:
        text: sentence to encode.

    Returns:
        1-D ``torch.long`` tensor of length `max_sequence_length`.
    """
    known_indices = []
    for token in text.split():
        if token in word_to_idx:
            known_indices.append(word_to_idx[token])

    index_tensor = torch.tensor(known_indices, dtype=torch.long)
    return pad_or_truncate(index_tensor, max_sequence_length)

In [7]:
class AbstractsModel(nn.Module):
    """Sentence classifier: embedding -> GRU -> mean over time -> three linear layers.

    Args:
        input_size: number of rows in the embedding table (vocabulary size).
        embedding_size: width of each embedding vector.
        hidden_size: GRU hidden width.
        num_layers: number of stacked GRU layers.
        num_classes: size of the output score vector (default 5).
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, num_classes=5):
        super().__init__()
        # Token ids -> dense vectors, then a batch-first GRU over the sequence
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, num_layers, batch_first=True)
        # Linear head: hidden_size -> 32 -> 16 -> num_classes (no activations in between)
        self.fc1 = nn.Linear(hidden_size, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, num_classes)

    def forward(self, x):
        """Return unnormalised class scores for a batch of index sequences.

        Args:
            x: integer index tensor; the callers in this notebook pass shape (1, seq_len).

        Returns:
            Tensor with one row of `num_classes` scores per input sequence.
        """
        embedded = self.embedding(x)
        gru_outputs, _ = self.gru(embedded)
        # Average the GRU outputs over the sequence axis (dim 1, since batch_first=True)
        pooled = gru_outputs.mean(dim=1)
        # Stacked linear layers produce the final scores
        return self.fc3(self.fc2(self.fc1(pooled)))

In [8]:
# Initializing the Model Settings
embedding_size = 200
hidden_size = 64
num_layers = 2
# Embedding-table size used to build the model that receives the checkpoint below.
# NOTE(review): this overrides the `input_size` computed from train.csv in the vocabulary
# cell (a different value). Presumably the checkpoint was trained with a larger vocabulary;
# confirm that `word_to_idx` built here assigns the same ids as at training time.
input_size = 468389

# Create an instance of the abstract model and load the saved model
abstract_model = AbstractsModel(input_size, embedding_size, hidden_size, num_layers)

# This notebook only runs inference, so put the model in evaluation mode. It is done
# before loading so that `load_state_dict` stays the last expression and the cell still
# displays the key-matching report.
abstract_model.eval()

# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source
# (consider `weights_only=True` if the installed PyTorch version supports it).
abstract_model.load_state_dict(torch.load('abstract_model.pth', map_location=torch.device('cpu')))

<All keys matched successfully>

In [9]:
# Sample abstract used to try the model without the GUI
example_abstract = '''
The aim of this paper is to map the scientific landscape related to cancer research worldwide between 2012 and 2017. We use scientific publication data from Web of Science Core Collection and combine bibliometrics and social network analysis techniques to identify the most relevant journals, research areas, countries and research organizations in cancer scientific landscape. The results show: Oncotarget as the journal with most publications; a significant increase in China’s publications, reaching United States’ publications in 2017; MD Cancer Center, University of California and Harvard University as organizations with most publications; cell biology as the most frequent research area; breast, lung and colorectal cancer as the most frequent keywords; high density of co-authorship between organizations in the West, especially in the US, and low density between organizations in Asian and lower and medium income countries. Our findings can be used to guide a global knowledge platform guiding policy, planning and funding decisions as well as to establish new institutional collaborations.
'''

# Cut the abstract at every period (the periods themselves are dropped), strip newline
# characters and surrounding spaces from each piece, and keep only the non-empty pieces.
raw_pieces = example_abstract.split('.')
cleaned_pieces = (piece.replace('\n', '').strip() for piece in raw_pieces)
abstract_lines = [piece for piece in cleaned_pieces if piece]

In [10]:
# One bucket of sentences per predicted class
background = []
objective = []
methods = []
results = []
conclusions = []

# Position i holds the bucket for predicted label i (0 = background ... 4 = conclusions)
section_buckets = [background, objective, methods, results, conclusions]

# Inference only: no_grad avoids building the autograd graph for every sentence
with torch.no_grad():
    for text in abstract_lines:
        # Encode the sentence and add a leading batch axis of size 1
        text_tensor = text_transform_func(text).reshape(1, -1)
        output = abstract_model(text_tensor)
        # argmax over the class axis gives a length-1 array; take its element as a plain int
        label = int(np.argmax(output.cpu().numpy(), axis=1)[0])
        section_buckets[label].append(text)

print(f'Background: {background}')
print(f'Objective: {objective}')
print(f'Methods: {methods}')
print(f'Results: {results}')
print(f'Conclusions: {conclusions}')
        

Background: ['The aim of this paper is to map the scientific landscape related to cancer research worldwide between 2012 and 2017', 'We use scientific publication data from Web of Science Core Collection and combine bibliometrics and social network analysis techniques to identify the most relevant journals, research areas, countries and research organizations in cancer scientific landscape']
Objective: []
Methods: []
Results: []
Conclusions: ['The results show: Oncotarget as the journal with most publications; a significant increase in China’s publications, reaching United States’ publications in 2017; MD Cancer Center, University of California and Harvard University as organizations with most publications; cell biology as the most frequent research area; breast, lung and colorectal cancer as the most frequent keywords; high density of co-authorship between organizations in the West, especially in the US, and low density between organizations in Asian and lower and medium income countr

In [11]:
def clear_all_fields():
    """Reset the GUI: empty the abstract box, the status label and all five result boxes.

    Used as the callback of the "Reset" button. Returns None.
    """
    abstract_text.delete(1.0, tk.END)
    abstract_status_label["text"] = ""

    result_boxes = (background_text, objective_text, methods_text, results_text, conclusions_text)
    for box in result_boxes:
        box.delete(1.0, tk.END)

    return None

In [12]:
def evaluate_model():
    """Classify each sentence of the abstract in the GUI and fill the five result boxes.

    Used as the callback of the "Process" button. The status label and result boxes are
    cleared first. If the abstract box is empty (after stripping whitespace) the status
    label shows "Invalid Abstract" in red and nothing else happens. Otherwise the text is
    split into sentences with NLTK's `sent_tokenize`, each sentence is classified by
    `abstract_model`, and the sentences of each class are written, space-separated, into
    the matching box (0=Background, 1=Objective, 2=Methods, 3=Results, 4=Conclusions).

    Returns:
        None
    """
    # Result boxes ordered by class label, so a label can be used as an index
    result_boxes = (background_text, objective_text, methods_text, results_text, conclusions_text)

    # Firstly clear all the result text entries
    abstract_status_label["text"] = ""
    for box in result_boxes:
        box.delete(1.0, tk.END)

    # Read the text in abstract_text ("end-1c" drops the trailing newline Tk appends)
    abstract_text_value = abstract_text.get("1.0", "end-1c").strip()

    if not abstract_text_value:  # there is no valid text available in the box
        abstract_status_label["foreground"] = "red"
        abstract_status_label["text"] = "Invalid Abstract"
        return None

    # abstract has some valid text in the box
    abstract_status_label["foreground"] = "green"
    abstract_status_label["text"] = "Valid Abstract"

    # Split the abstract into sentences with NLTK's sentence tokenizer
    abstract_lines = sent_tokenize(abstract_text_value)

    # Collapse every run of whitespace (including line breaks) into one space. Simply
    # deleting '\n' would glue together the last word of one line and the first of the next.
    abstract_lines = [' '.join(line.split()) for line in abstract_lines]

    # Remove the empty lines
    abstract_lines = [line for line in abstract_lines if line]

    # Evaluate the model: accumulate the text of each class, indexed by label
    section_texts = [''] * len(result_boxes)

    # Inference only: no_grad avoids building the autograd graph for every sentence
    with torch.no_grad():
        for text in abstract_lines:
            # Encode the sentence and add a leading batch axis of size 1
            text_tensor = text_transform_func(text).reshape(1, -1)
            output = abstract_model(text_tensor)
            # argmax over the class axis gives a length-1 array; take its element as an int
            label = int(np.argmax(output.cpu().numpy(), axis=1)[0])
            section_texts[label] += text + ' '

    for box, section in zip(result_boxes, section_texts):
        box.insert(tk.END, section)

    return None

In [13]:
# ---- Main window ----
window = tk.Tk()
window.title("Abstract Segmentation System")
window.geometry("1200x700")
window.rowconfigure(0, weight=1)
window.columnconfigure(0, weight=1)

# ---- Layout frame: 3 columns (weights 1, 1, 2) and 11 rows ----
frame_update = tk.Frame(master=window)
frame_update.grid(row=0, column=0, sticky='news')
for column_index, column_weight in enumerate((1, 1, 2)):
    frame_update.columnconfigure(column_index, weight=column_weight, uniform='x')
# Even rows (0, 2, ..., 10) get weight 4, odd rows (1, 3, ..., 9) get weight 1
for row_index in range(11):
    frame_update.rowconfigure(row_index, weight=4 if row_index % 2 == 0 else 1, uniform='x')


def _add_result_section(title, label_row):
    """Create a purple heading on `label_row` and a 4-line text box on the row below it,
    both in column 2 of `frame_update`; return (label, text_box)."""
    section_label = tk.Label(master=frame_update, text=title, font=('Open Sans', 18), foreground="purple")
    section_label.grid(row=label_row, column=2, sticky='w')
    section_box = tk.Text(master=frame_update, height=4, width=70, font=('Open Sans', 10), wrap='word')
    section_box.grid(row=label_row + 1, column=2, sticky='w')
    return section_label, section_box


# ---- Title banner across all three columns ----
heading_label = tk.Label(
    master=frame_update,
    text="ABSTRACT SEGMENTATION SYSTEM",
    font=('Open Sans', 32, 'bold'),
    foreground="green",
)
heading_label.grid(row=0, column=0, columnspan=3, sticky='news')

# ---- Left side: abstract input, Reset / Process buttons, status label ----
abstract_label = tk.Label(master=frame_update, text="Abstract", font=('Open Sans', 24), foreground="purple")
abstract_label.grid(row=2, column=0, sticky='nw', padx=30)

reset_button = tk.Button(
    master=frame_update, text="Reset", font=('Open Sans', 18), width=10, command=clear_all_fields
)
reset_button.grid(row=2, column=1, sticky="nw", padx=5)

abstract_text = tk.Text(master=frame_update, height=30, width=80, font=('Open Sans', 10), wrap='word')
abstract_text.grid(row=3, rowspan=6, column=0, columnspan=2, sticky='w', padx=30)

process_button = tk.Button(
    master=frame_update, text="Process", font=('Open Sans', 18), width=10, command=evaluate_model
)
process_button.grid(row=10, column=0, sticky="w", padx=30)

abstract_status_label = tk.Label(master=frame_update, text="Upload Abstract", font=('Open Sans', 16))
# NOTE(review): row 12 lies outside the rows configured above (0-10) - confirm this is intended
abstract_status_label.grid(row=12, column=1, sticky='w')

# ---- Right side: one heading + output box per predicted section ----
background_label, background_text = _add_result_section("Background", 1)
objective_label, objective_text = _add_result_section("Objective", 3)
methods_label, methods_text = _add_result_section("Methods", 5)
results_label, results_text = _add_result_section("Results", 7)
conclusions_label, conclusions_text = _add_result_section("Conclusions", 9)

# Clearing up all the text fields at the start (same steps as the Reset button)
clear_all_fields()

frame_update.tkraise()
window.mainloop()