In [1]:
# INTERACTIVE DOCUMENT CREATION RELATED IMPORTS
import json
import sys
import os 
import subprocess
import spacy
import nltk
from nltk.stem import WordNetLemmatizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from pybtex.database.input import bibtex
import urllib.request
from bs4 import BeautifulSoup

In [2]:
# CLASS RELATED IMPORTS
from abc import ABC, abstractmethod
from pydantic import BaseModel
from typing import List, Any
import unittest
import math
import random
from tqdm import tqdm

In [3]:
# MAP CREATION RELATED IMPORTS
from sentence_transformers import SentenceTransformer, util
from nltk.stem import WordNetLemmatizer
import numpy as np
import pandas as pd
import torch
import spacy
import umap 
import hdbscan
import matplotlib.pyplot as plt
from bertopic import BERTopic
import transformers
# IMPORTANT: ERROR FIX FOR huggingface transformers -> disable iprogress logging -> https://stackoverflow.com/questions/66644432/use-huggingface-transformers-without-ipywidgets
import logging
transformers.logging.get_verbosity = lambda: logging.NOTSET
from transformers import pipeline
from transformers import AutoTokenizer

# Core Variables

In [4]:
# INTERACTIVE DOCUMENT CREATION
# Configuration for turning a folder of PDFs into interactive documents.
# Folder that contains the PDF files to parse (string ends with a path separator).
pdf_files_folder =  './dataset/' 
# Path of the JSON file that receives the parsed PDFs -> parsed by the AllenAI science parser library: https://github.com/allenai/science-parse/tree/master/cli
allen_ai_parsed_output_json_file_path =  'dataset.json' 
# Console command that runs the AllenAI science parser jar on `pdf_files_folder` and passes the output path via `-f`.
# The command is assembled once, here, from the two values above; changing them later does not update this string.
allen_ai_jar_command = 'java -Xmx6g -jar ./pip_stuff/science-parse-cli-assembly-2.0.3.jar' + ' ' + pdf_files_folder + ' ' + '-f' + ' ' + allen_ai_parsed_output_json_file_path
# spaCy English pipeline shared at module level
nlp = spacy.load('en_core_web_sm')
# Fixed number of keywords per document: OUT OF USE, a percentage is used instead so the count adapts to the document.
#number_of_keywords_per_document = 20
# Percentage used to derive the number of keywords extracted per document (e.g. 5 => 5 percent).
# NOTE(review): an earlier version of this comment said the percentage is applied to the document LENGTH in words,
# but `extract_keywords` applies it to `len(tr4w.vocab)`, i.e. the number of distinct candidate words - confirm which is intended.
keyword_percentage_per_document = 5 # 5

In [5]:
# MAP CREATION 
# Directories of the locally stored models; both are empty placeholders and must be set before use.
allenai_specter_model_dir = "" # SET YOUR PATH HERE
allenai_scibert_model_dir = "" # SET YOUR PATH HERE

# Maximum paragraph length (unit not visible in this part of the notebook - TODO confirm words vs. tokens vs. characters)
paragraph_max_length = 350 
# preprocessed interactive document corpus file path for MAP CREATION (full version)
interactive_document_corpus_full_file_path = 'corpus_full.txt'
# preprocessed interactive document corpus file path for FRONTEND USAGE
interactive_document_corpus_file_path = 'corpus.txt' 
# path for the base map points
interactive_document_corpus_base_map_points_file_path =  'corpus_base_map_points.txt'
# path for the extracted points of the document corpus 
interactive_document_corpus_corpus_points_file_path = 'corpus_points.txt'
# path for the topic corpus, which is the extracted topic model
interactive_document_corpus_topic_corpus_file_path = 'topic_corpus.txt'
# DEPRECATED -> the two paths below are no longer used, everything is stored in the main corpus.txt files
# path for the extracted points for the paragraph vectors
#interactive_document_corpus_paragraph_map_points_file_path = 'paragraph_points.txt'
# path for the topic points of the topic map
#interactive_document_corpus_topic_map_points_file_path =  'topic_map_points.txt'

In [6]:
# CUSTOM VARIABLES
# ReCOVery news data set for COVID-19 news comparison, from here: https://github.com/apurvamulay/ReCOVery
recovery_news_data_csv_file_path = 'datasets/covid_news_dataset/recovery-news-data.csv' 

# STOPWORDS LIST
# Extra stopwords specific to scientific documents; passed as `stopwords` to TextRank4Keyword.analyze by `extract_keywords`.
STOPWORDS_LIST = ["et", "al", ".", "k", "t", "m", "r", "p", "qa"] # scientific document specific stopwords to be ignored in keyword extraction

# Classes

## Interactive Document

In [7]:
# ENTITY
class InteractiveDocument(BaseModel):
    '''
    InteractiveDocument: data object describing one document of the corpus.

    The parsers in this notebook fill the fields by plain attribute assignment after
    constructing an empty instance. Several assigned values do not match the declared
    annotations (see the per-field notes) - NOTE(review): confirm whether the
    annotations or the assignments should change.
    '''
    # Object Attributes
    # identifier of the document; the parsers assign the integer loop index although the field is declared as str
    uri: str = None
    # document title; the parsers fall back to 'empty' when the source has none
    title: str = None
    # publication year / publish date as delivered by the source, or 'empty'
    timestamp: str = None
    # rating; the parsers initialise it with the float 0.0 although the field is declared as str
    rating: str = None
    # free-text annotation; initialised with "" by the parsers
    annotation: str = None
    # tags; initialised with [] by the parsers
    tags: List[str] = None
    # author list as delivered by the source, or []
    authors: List[str] = None
    # abstract (JSON parser) or cleaned, truncated body text (CSV parser)
    summary: str = None
    # extracted keywords, set by create_keywords_in_corpus
    keywords: List[str] = None
    # document vector; initialised with [0,0] by the parsers
    document_vector: List[int] = None
    # paragraphs; initialised with [] by the parsers
    paragraphs: List[Any] = []
    # text payload; the keys used in this notebook are 'raw_text' and 'preprocessed_text'
    content: dict = {}
    # additional specific properties
    # DOI taken from an optional DOI list, or 'empty' (JSON parser only)
    doi: str = None
        
# REPOSITORY
class InteractiveDocumentRepository(ABC):
    '''
    InteractiveDocumentRepository: abstract interface of the interactive document repository.
    Concrete repositories must implement every abstract method before they can be instantiated.
    '''

    @abstractmethod
    def test(self):
        '''Placeholder operation; the abstract version does nothing and returns None.'''
        return None
    
# REPOSITORY IMPLEMENTATION
class InteractiveDocumentRepositoryImpl(InteractiveDocumentRepository):
    '''
    InteractiveDocumentRepositoryImpl: concrete repository wrapping one InteractiveDocument data object.
    '''

    def __init__(self, interactive_document_data_object: InteractiveDocument):
        '''Store the wrapped data object and announce the construction on stdout.'''
        self.data_object = interactive_document_data_object
        print(f'{self.__class__.__name__} initialized.')

    def test(self):
        '''Placeholder operation: only prints a marker line.'''
        print('test called.')

# TEST
class TestInteractiveDocument(unittest.TestCase):
    '''Unit tests for the InteractiveDocument entity and its repository implementation.'''

    @classmethod
    def setUpClass(cls):
        # set all things up for the test series here
        pass

    @classmethod
    def tearDownClass(cls):
        # tear everything down after testing here
        pass

    def test_class_setup_and_serialization(self):
        '''A freshly constructed document is wrapped unchanged and serializes to its declared defaults.'''
        # given
        interactive_document_data_object = InteractiveDocument()
        interactive_document_repository_impl = InteractiveDocumentRepositoryImpl(interactive_document_data_object)
        # when
        interactive_document_repository_impl.test()
        serialized_document = interactive_document_repository_impl.data_object.dict()
        print(serialized_document)
        # then: check the real serialization instead of a constant that always compares equal
        self.assertIs(interactive_document_repository_impl.data_object, interactive_document_data_object)
        self.assertIsInstance(serialized_document, dict)
        self.assertIsNone(serialized_document['uri'])
        self.assertEqual(serialized_document['paragraphs'], [])
        self.assertEqual(serialized_document['content'], {})

## Preprocessing

In [8]:
# ENTITY ---------------------------------------------------------------------------
class PreprocessingManager(BaseModel):
    '''
    PreprocessingManager: class that enables all preprocessing 
    '''
    def __init__(self, **data):
        '''
        Construct the manager and announce it on stdout.

        The pydantic base initializer has to run first: it sets up the model's internal
        state, which the model's own methods (e.g. serialization) may rely on.
        `**data` is forwarded unchanged, so calling the constructor without arguments
        keeps working exactly as before.
        '''
        super().__init__(**data)
        print('preprocessing manager constructed.')

# REPOSITORY ----------------------------------------------------------------------
class PreprocessingManagerRepository(ABC):
    '''
    PreprocessingManagerRepository: abstract interface of the preprocessing repository.

    Every method is an instance method (takes `self`), so an implementation may delegate
    to it via `super()`. The abstract versions do nothing and return None.
    '''
    @abstractmethod
    def parse_pdfs_to_interactive_documents(self, pdf_folder_path):
        '''Parse the PDFs found in `pdf_folder_path` into interactive documents.'''
        pass
    @abstractmethod
    def create_summary(self, interactive_document):
        '''Create a summary for one interactive document.'''
        pass
    @abstractmethod
    def extract_keywords(self, interactive_document, keyword_percentage_per_document, method="textrank", must_have_keyword=None):
        '''Extract keywords for one interactive document.'''
        pass
    @abstractmethod
    def preprocess_raw_text(self, interactive_document):
        '''Return the cleaned version of the raw text of one interactive document.'''
        pass
    @abstractmethod
    def write_interactive_document_corpus_to_file(self, interactive_document_corpus, file_path, remove_content=False):
        '''Serialize the corpus to `file_path`.'''
        pass
    
# REPOSITORY IMPLEMENTATION -------------------------------------------------------     
class PreprocessingManagerRepositoryImpl(PreprocessingManagerRepository):
    '''
    PreprocessingManagerRepositoryImpl: concrete preprocessing repository.

    Bundles the steps that turn source material (PDFs parsed by the AllenAI science
    parser, or a CSV file) into a list of InteractiveDocument objects, cleans their text,
    extracts keywords and writes the corpus to disk. Progress is reported with print().
    '''
    # punctuation tokens that survive text cleaning next to purely alphabetic tokens
    _KEPT_PUNCTUATION = frozenset(['.', ',', '?', ':', '-'])

    # Constructor / if needed initialize instance attributes here
    def __init__(self, preprocessing_manager_data_object: PreprocessingManager):
        '''Store the data object and load the spaCy pipeline used for text cleaning.'''
        self.data_object = preprocessing_manager_data_object
        self.text_processor = spacy.load('en_core_web_sm')
        print('{0} initialized.'.format(self.__class__.__name__))

    # override methods
    #@override
    def create_summary(self, interactive_document):
        '''Placeholder: summary creation is not implemented yet, only prints a marker line.'''
        print('test')

    #@override
    def extract_keywords(self, interactive_document, keyword_percentage_per_document, method="textrank", must_have_keyword=None):
        '''
        Extract keywords from the preprocessed text of one document.

        interactive_document: document whose content['preprocessed_text'] is analyzed.
        keyword_percentage_per_document: percentage of the document's vocabulary size
            (distinct candidate words found by TextRank) that is requested as keywords.
        method: "textrank" runs TextRank4Keyword; "rake" is not implemented and yields [].
        must_have_keyword: forwarded to TextRank4Keyword.get_keywords.

        Returns the list of keywords. An unknown method prints an error and returns []
        (previously the function failed on an unassigned local variable in that case).
        '''
        preprocessed_text = interactive_document.content['preprocessed_text']
        keywords = []
        if method == "textrank":
            tr4w = TextRank4Keyword()
            tr4w.analyze(preprocessed_text, candidate_pos=['NOUN', 'PROPN', 'VERB'], window_size=4, lower=True, stopwords=STOPWORDS_LIST) # window_size=5
            # the number of requested keywords scales with the number of different candidate words
            vocab_length = len(tr4w.vocab)
            number_of_keywords_for_document = int((vocab_length / 100) * keyword_percentage_per_document)
            keywords = tr4w.get_keywords(number_of_keywords_for_document, must_have_keyword)
        elif method == "rake":
            # TODO: find other implementation of RAKE algorithm; until then no keywords are produced
            pass
        else:
            print('ERROR: no valid method for keyword extraction provided.')
        return keywords

    #@override
    def preprocess_raw_text(self, interactive_document):
        '''Return the cleaned, lower-cased version of content['raw_text'] of the document.'''
        return self._clean_text(interactive_document.content['raw_text'])

    #@override
    def parse_pdfs_to_interactive_documents(self, pdf_files_folder_path):
        '''
        Run the science parser on a PDF folder and convert its output into interactive documents.

        Returns the list of InteractiveDocument objects, or [] when the parser exits with a
        non-zero code. (Previously the function always failed on an undefined name after
        running the parser.)
        '''
        # 1. parse plain pdfs from folder into json objects
        # The command is passed as an argument list so that paths containing spaces stay intact.
        # NOTE(review): the jar path here is '/pip_stuff/...' while the module-level
        # `allen_ai_jar_command` uses './pip_stuff/...' - confirm which location is correct.
        command = [
            'java', '-Xmx6g', '-jar', '/pip_stuff/science-parse-cli-assembly-2.0.3.jar',
            pdf_files_folder_path, '-f', allen_ai_parsed_output_json_file_path,
        ]
        exit_code = self.parse_pdfs_from_folder_into_json_objects(command)
        if exit_code != 0:
            return []
        # 2. load parsed pdf documents from json
        corpus_objects_list = self.load_parsed_pdf_documents_from_json(allen_ai_parsed_output_json_file_path)
        # 3. parse the data from the json objects into interactive document objects
        return self.parse_data_from_json_to_interactive_document_object(corpus_objects_list)

    #@override
    def write_interactive_document_corpus_to_file(self, interactive_document_corpus, file_path, remove_content=False):
        '''
        Serialize the corpus as one JSON list into `file_path`.

        remove_content: when True the 'content' entry (raw_text, preprocessed_text, ...) is
            written as {} to reduce overhead, e.g. for the frontend. Only the serialized copy
            is stripped; the documents passed in keep their content.
        '''
        print('...writing interactive document corpus to file.')
        print('remove_content: {0}'.format(remove_content))
        serialized_interactive_document_corpus = []
        for interactive_document in interactive_document_corpus:
            serialized_interactive_document = interactive_document.dict()
            if remove_content:
                serialized_interactive_document['content'] = {}
            serialized_interactive_document_corpus.append(serialized_interactive_document)
        with open(file_path, 'w') as output_file:
            json.dump(serialized_interactive_document_corpus, output_file)
        print('DONE: writing corpus objects list to file completed.')

    # utility methods
    @staticmethod
    def _value_or_default(value, default):
        '''Return `default` for missing values (None, the text 'null', or a float NaN), else `value`.'''
        if value is None or str(value) == 'null':
            return default
        if isinstance(value, float) and math.isnan(value):
            return default
        return value

    def _clean_text(self, text):
        '''
        Tokenize `text` with spaCy and keep only alphabetic tokens and basic punctuation.

        Characters that cannot be encoded as UTF-8 are dropped first, found here:
        https://stackoverflow.com/questions/59952915/in-python-3-how-do-you-remove-all-non-utf8-characters-from-a-string
        Returns the kept tokens lower-cased and joined by single spaces.
        '''
        text = text.encode('utf-8', 'ignore').decode('utf8')
        cleaned_text = []
        for token in self.text_processor(text):
            if token.is_space or token.is_bracket:
                continue
            lowered = token.text.lower()
            if token.is_alpha or lowered in self._KEPT_PUNCTUATION:
                cleaned_text.append(lowered)   #(token.lemma_.lower())
        return ' '.join(cleaned_text)

    def _get_field_list_from_bibtex_file(self, bibtex_file_path, field_name):
        '''
        Collect the values of one bibtex field over all entries of a bibtex file.

        field_name: lower-case field name, e.g. 'doi'; the upper-case spelling is tried as fallback.
        Entries without the field are skipped.
        '''
        print('...extracting {0} list from bibtex file.'.format(field_name))
        bib_database = bibtex.Parser().parse_file(bibtex_file_path)
        # convert the package specific entries mapping into a plain list of field mappings
        bib_object_list = [entry.fields for _entry_key, entry in bib_database.entries.items()]
        print(bibtex_file_path)
        print('len bib_object_list: ' + str(len(bib_object_list)))
        field_list = []
        for bib_object in bib_object_list:
            for key in (field_name.lower(), field_name.upper()):
                if key in bib_object:
                    field_list.append(bib_object[key])
                    break
        print('len {0}_list: '.format(field_name) + str(len(field_list)))
        print('DONE: {0} list successfully extracted.'.format(field_name))
        return field_list

    def _find_embedded_pdf_url(self, soup):
        '''
        Return the PDF url referenced by the <embed id="pdf"> or <iframe id="pdf"> tag, or None.

        The url fragment ('#...') is cut off and protocol-relative urls ('//host/...') are
        prefixed with 'https:' because urllib rejects them as unknown url type.
        '''
        for tag_name in ('embed', 'iframe'):
            tag = soup.find(tag_name, {'id': 'pdf'})
            if tag is None or 'src' not in tag.attrs:
                continue
            pdf_url = tag.attrs['src'].split('#')[0]
            print(pdf_url)
            if pdf_url.startswith('//'):
                pdf_url = 'https:' + pdf_url
            return pdf_url
        return None

    # find AllenAi science parser Repo here: https://github.com/allenai/science-parse/blob/master/cli/README.md
    def get_doi_list_from_bibtex_file(self, bibtex_file_path):
        '''Return the DOIs of all bibtex entries that have a 'doi' / 'DOI' field.'''
        return self._get_field_list_from_bibtex_file(bibtex_file_path, 'doi')

    def get_url_list_from_bibtex_file(self, bibtex_file_path):
        '''Return the urls of all bibtex entries that have a 'url' / 'URL' field.'''
        return self._get_field_list_from_bibtex_file(bibtex_file_path, 'url')

    def download_pdfs_from_doi_list(self, doi_list, output_directory):
        '''
        Download one PDF per DOI into `output_directory` as 'doi_<index>_.pdf'.

        Best effort: a DOI whose landing page cannot be loaded, contains no embedded PDF,
        or whose PDF cannot be downloaded is skipped and the loop continues.
        '''
        print('...downloading pdfs from doi list.')
        for index, doi in enumerate(tqdm(doi_list)):
            html_url = 'https://sci-hub.se/' + str(doi) # scihub alternatives: ...
            try:
                # the context manager closes the response even when reading or decoding fails
                with urllib.request.urlopen(html_url) as response:
                    response_html_string = response.read().decode('utf8')
            except Exception as e:
                print('DOI landing page not reachable at index ' + str(index))
                print(e)
                continue
            soup = BeautifulSoup(response_html_string, 'html.parser')
            pdf_url = self._find_embedded_pdf_url(soup)
            if pdf_url is None:
                continue
            paper_output_file_path = os.path.join(output_directory, 'doi_' + str(index) + '_' + '.pdf')
            try:
                urllib.request.urlretrieve(pdf_url, paper_output_file_path)
            except Exception as e:
                print('DOI paper link not reachable at index ' + str(index))
                print(e)
                continue
        print('DONE: pdfs from doi list successfully downloaded.')

    def download_pdfs_from_url_list(self, url_list, output_directory):
        '''
        Download '<url>.pdf' for every url into `output_directory` as 'url_paper_<index>.pdf'.

        Best effort: urls that cannot be loaded are reported and skipped.
        '''
        print('...downloading pdfs from url list.')
        for index, url in enumerate(tqdm(url_list)):
            pdf_url = url + '.pdf'
            paper_output_file_path = os.path.join(output_directory, 'url_paper_' + str(index) + '.pdf')
            try:
                urllib.request.urlretrieve(pdf_url, paper_output_file_path)
            except Exception:
                print('ERROR: ' + str(url) + ' could not be loaded.')
                continue
        print('DONE: pdfs from url list successfully downloaded.')

    def parse_pdfs_from_folder_into_json_objects(self, command):
        '''
        Run the science parser command and wait for it to finish.

        command: either one command string (split on whitespace) or a sequence of arguments.
        Returns the exit code of the process (0 means success).
        '''
        print('...parsing pdfs into json objects.')
        arguments = command.split() if isinstance(command, str) else list(command)
        # subprocess.run drains the stdout pipe while waiting; waiting on a process whose
        # piped output is never read can block forever once the pipe buffer is full
        completed_process = subprocess.run(arguments, stdout=subprocess.PIPE, universal_newlines=True)
        exit_code = completed_process.returncode
        print(exit_code)
        if exit_code == 0:
            print('DONE: pdf parsing SUCCESSFUL.')
        else:
            print('ERROR: pdf parsing FAILED.')
        return exit_code

    # objects are written linewise into the json file, thats why we have to grab them by line
    def load_parsed_pdf_documents_from_json(self, allen_ai_parsed_output_json_file_path):
        '''Load one JSON object per non-empty line of the parser output file and return them as a list.'''
        corpus_objects_list = []
        with open(allen_ai_parsed_output_json_file_path, encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    corpus_objects_list.append(json.loads(line))
        print('DONE: loading parsed documents from json completed.')
        return corpus_objects_list

    def parse_data_from_json_to_interactive_document_object(self, corpus_objects_list, corpus_doi_list=None):
        '''
        Convert parsed science parser objects into InteractiveDocument objects.

        corpus_objects_list: list of dicts with a 'metadata' entry (title, authors, year,
            abstractText, sections).
        corpus_doi_list: optional list of DOIs aligned by position with corpus_objects_list;
            documents without a matching position get the doi 'empty'.

        The raw text is the concatenation of the texts of all sections that have a heading;
        sections headed 'Acknowledgements' are left out unless they are the very first section.
        '''
        print('...parsing data from json to interactive document objects.')
        interactive_document_corpus = []
        for i, raw_source_object in enumerate(corpus_objects_list):
            metadata = raw_source_object['metadata']
            interactive_document = InteractiveDocument()
            # get additional information to source object
            interactive_document.uri = i # DOI_list[i], or in extra variable as seen below
            has_doi = corpus_doi_list is not None and i < len(corpus_doi_list)
            interactive_document.doi = corpus_doi_list[i] if has_doi else 'empty'
            interactive_document.title = self._value_or_default(metadata['title'], 'empty')
            interactive_document.authors = self._value_or_default(metadata['authors'], [])
            interactive_document.timestamp = self._value_or_default(metadata['year'], 'empty')
            interactive_document.rating = 0.0
            interactive_document.annotation = ""
            interactive_document.tags = []
            interactive_document.paragraphs = []
            interactive_document.document_vector = [0,0]
            interactive_document.summary = self._value_or_default(metadata['abstractText'], '')
            raw_text = ""
            sections = metadata['sections']
            # check if the source object contains text sections at all
            if sections is not None:
                for j, section in enumerate(sections):
                    if section['heading'] is None:
                        continue
                    if j == 0:
                        # add the title section without leading space
                        raw_text += section['text']
                    elif section['heading'] != 'Acknowledgements':
                        # filter out certain sections, e.g. acknowledgement section
                        raw_text = raw_text + " " + section['text']
            else:
                print('INFO: source {0} of json data has no raw text SECTIONS.'.format(i))
            interactive_document.content['raw_text'] = raw_text
            # add created interactive document to corpus
            interactive_document_corpus.append(interactive_document)
        print('DONE: parse_data_from_json_to_interactive_document_object completed.')
        return interactive_document_corpus

    def parse_data_from_csv_to_interactive_document_object(self, corpus_csv_file_path):
        '''
        Example CSV parser: convert the reliable rows of a news CSV into InteractiveDocument objects.

        Only rows with reliability > 0 are used. The columns read are 'reliability',
        'body_text', 'title', 'author' and 'publish_date'. Empty cells (NaN after loading)
        are treated as missing. The summary is the cleaned first 1800 characters of the body text.
        In future a generalized parser is needed to bring different data (e.g. pdfs, csvs,
        docs, ...) into the interactive document corpus format.
        '''
        print('...parsing data from csv to interactive document objects.')
        # load the dataset from csv
        covid_news_df = pd.read_csv(corpus_csv_file_path)
        # filter out only the reliable data sources
        reliable_covid_news_df = covid_news_df.loc[covid_news_df['reliability'] > 0].reset_index()
        # ALTERNATIVE: SHUFFLE DATA FRAME: e.g. needed when only a subset is needed for visualization => IMPORTANT: be aware: this breaks frontend procedures!
        #reliable_covid_news_df = reliable_covid_news_df.sample(frac=1).reset_index(drop=True)
        # iterate over the rows to fill the interactive document objects for the interactive document corpus
        interactive_document_corpus = []
        for index, row in enumerate(reliable_covid_news_df.to_dict('records')):
            body_text = self._value_or_default(row['body_text'], '')
            author = self._value_or_default(row['author'], None)
            interactive_document = InteractiveDocument()
            interactive_document.content['raw_text'] = body_text
            interactive_document.uri = index
            interactive_document.title = self._value_or_default(row['title'], 'empty')
            # the author column holds a stringified list like "['a', 'b']"
            interactive_document.authors = str(author).replace('\'', '').strip('][').split(', ') if author is not None else []
            interactive_document.timestamp = self._value_or_default(row['publish_date'], 'empty')
            interactive_document.rating = 0.0
            interactive_document.annotation = ""
            interactive_document.tags = []
            interactive_document.paragraphs = []
            interactive_document.document_vector = [0,0]
            # CREATE SUMMARIES
            # we take the first 1800 characters as the abstract of each document
            summary = (body_text[:1800] + ' .') if len(body_text) > 1800 else body_text
            # we have to preprocess the summary to make sure it is clean
            interactive_document.summary = self._clean_text(summary)
            interactive_document_corpus.append(interactive_document)
        print('DONE: parse_data_from_csv_to_interactive_document_object completed.')
        return interactive_document_corpus

    def parse_custom_data_to_interactive_document_object(self, custom_data_file_path):
        '''Placeholder for further data sets; not implemented yet, only prints a marker line.'''
        print('test')
        # 1. Load custom dataset
        # a) meaning shift data set
        # b) 
        # c) 20 newsgroups data set 
        # d) enron dataset -> emails
        # f) google blogger corpus -> blogs
        # g) 

    def preprocess_raw_text_in_corpus(self, interactive_document_corpus):
        '''Store the cleaned raw text of every document in content['preprocessed_text'] and return the corpus.'''
        print('... preprocessing raw text of whole corpus.')
        for interactive_document in tqdm(interactive_document_corpus):
            preprocessed_text = self.preprocess_raw_text(interactive_document)
            interactive_document.content['preprocessed_text'] = preprocessed_text
        print('DONE: preprocessing raw text of whole corpus completed.')
        return interactive_document_corpus

    def create_keywords_in_corpus(self, interactive_document_corpus, must_have_keyword=None):
        '''
        Extract and store the keywords of every document and return the corpus.

        Uses the module-level `keyword_percentage_per_document` setting.
        '''
        print('...creating_keywords in whole corpus.')
        for interactive_document in tqdm(interactive_document_corpus):
            keywords = self.extract_keywords(interactive_document, keyword_percentage_per_document, must_have_keyword=must_have_keyword)
            interactive_document.keywords = keywords
        print('DONE: keywords in corpus successfully created.')
        return interactive_document_corpus

    @staticmethod
    def create_summaries_in_corpus():
        '''
        Placeholder: summary creation for the whole corpus is not implemented yet.

        Declared static because it takes no arguments; this way it can be called on the
        class as before and also on an instance (which previously raised a TypeError).
        '''
        pass
        
# TEST ----------------------------------------------------------------------------
class TestPreprocessingManager(unittest.TestCase):
    '''Unit tests for the PreprocessingManager entity and its repository implementation.'''

    @classmethod
    def setUpClass(cls):
        # set all things up for the test series here
        pass

    @classmethod
    def tearDownClass(cls):
        # tear everything down after testing here
        pass

    def test_class_setup_and_serialization(self):
        '''The repository wraps the given data object and create_summary accepts a document.'''
        # given
        preprocessing_manager_data_object = PreprocessingManager()
        preprocessing_manager_repository_impl = PreprocessingManagerRepositoryImpl(preprocessing_manager_data_object)
        # when
        # create_summary requires the document to summarize; calling it without one raises a TypeError
        preprocessing_manager_repository_impl.create_summary(InteractiveDocument())
        print(preprocessing_manager_repository_impl.data_object.dict())
        # then
        self.assertIs(preprocessing_manager_repository_impl.data_object, preprocessing_manager_data_object)

## Keyword Extraction

In [9]:
# find on github here: https://gist.github.com/BrambleXu/3d47bbdbd1ee4e6fc695b0ddb88cbf99
# find tutorial here: https://towardsdatascience.com/textrank-for-keyword-extraction-by-python-c0bae21bcec0
from collections import OrderedDict
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Module-level spaCy pipeline used by TextRank4Keyword (stop word flags are set on its vocabulary
# and the text is parsed with it). This rebinds the `nlp` name that the configuration cell already defined.
nlp = spacy.load('en_core_web_sm')

class TextRank4Keyword():
    """Extract keywords from text with the TextRank algorithm.

    Usage: call `analyze(text, ...)` first, then `get_keywords(number)`.
    After `analyze`, `vocab` maps every candidate word to its matrix index and
    `node_weight` maps every candidate word to its rank value.
    """

    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight
        self.vocab = None # word -> index mapping of the last analyzed text

    def set_stopwords(self, stopwords):
        """Flag spaCy's default stop words plus `stopwords` as stop words in the shared `nlp` vocabulary."""
        for word in STOP_WORDS.union(set(stopwords)):
            nlp.vocab[word].is_stop = True

    def sentence_segment(self, doc, candidate_pos, lower):
        """Return one word list per sentence, keeping only non-stop-words whose POS tag is in candidate_pos.

        Words are lower-cased only when `lower` is exactly True.
        """
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                if token.pos_ not in candidate_pos or token.is_stop is not False:
                    continue
                selected_words.append(token.text.lower() if lower is True else token.text)
            sentences.append(selected_words)
        return sentences

    def get_vocab(self, sentences):
        """Map every distinct word to a running index in order of first appearance."""
        vocab = OrderedDict()
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    def get_token_pairs(self, window_size, sentences):
        """Build the distinct (word, following word) pairs within a sliding window per sentence.

        Each word is paired with up to `window_size - 1` following words of the same
        sentence. Pairs are returned in order of first occurrence.
        """
        token_pairs = []
        seen_pairs = set() # constant-time duplicate check instead of scanning the list
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i + 1, min(i + window_size, len(sentence))):
                    pair = (word, sentence[j])
                    if pair not in seen_pairs:
                        seen_pairs.add(pair)
                        token_pairs.append(pair)
        return token_pairs

    def symmetrize(self, a):
        """Mirror the matrix across its diagonal (a + a.T without doubling the diagonal)."""
        return a + a.T - np.diag(a.diagonal())

    def get_matrix(self, vocab, token_pairs):
        """Return the symmetric co-occurrence matrix with every column normalized to sum 1.

        Columns of words without any co-occurrence stay all zero.
        """
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            g[vocab[word1]][vocab[word2]] = 1

        # Get symmetric matrix
        g = self.symmetrize(g)

        # Normalize matrix by column. `out` is pre-filled with zeros because positions
        # skipped by `where` are otherwise left uninitialized.
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, out=np.zeros_like(g), where=norm != 0)

        return g_norm

    def get_keywords(self, number=10, must_have_keyword=None):
        """Return the `number` highest ranked keywords, best first.

        must_have_keyword: if given, known to the analysis and not among the top
            keywords, it is appended as an additional last element.
        Returns [] when `analyze` has not been run yet.
        """
        if self.node_weight is None:
            return []
        ranked_words = [
            word for word, _weight in sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True)
        ]
        keywords = ranked_words[:max(number, 0)]
        # check if must_have_keyword in keywords, if not -> add it
        if must_have_keyword is not None:
            wanted = str(must_have_keyword)
            if wanted not in keywords:
                for word in ranked_words:
                    if str(word) == wanted:
                        keywords.append(word)
                        break
        return keywords

    def analyze(self, text,
                candidate_pos=('NOUN', 'PROPN'),
                window_size=4, lower=False, stopwords=()):
        """Main function to analyze text.

        text: the text to rank.
        candidate_pos: POS tags of the words that may become keywords.
        window_size: co-occurrence window, see get_token_pairs.
        lower: lower-case the candidate words when exactly True.
        stopwords: additional stop words on top of spaCy's defaults.

        Stores the results in `self.vocab` and `self.node_weight`.
        """
        # Set stop words
        self.set_stopwords(stopwords)

        # Parse text by spaCy
        doc = nlp(text)

        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words

        # Build vocabulary and save it to the object
        vocab = self.get_vocab(sentences)
        self.vocab = vocab

        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)

        # Get normalized matrix
        g = self.get_matrix(vocab, token_pairs)

        # Initialization for weight (pagerank value)
        pr = np.ones(len(vocab))

        # Iterate until the total weight changes by less than min_diff or the step limit is hit
        previous_total = 0
        for _epoch in range(self.steps):
            pr = (1 - self.d) + self.d * np.dot(g, pr)
            current_total = pr.sum()
            if abs(previous_total - current_total) < self.min_diff:
                break
            previous_total = current_total

        # Get weight for each node
        self.node_weight = {word: pr[index] for word, index in vocab.items()}
        
# Usage
#text = '''
#The Wandering Earth, described as China’s first big-budget science fiction thriller, quietly made it onto screens at AMC theaters in North America this weekend, and it shows a new side of Chinese filmmaking — one focused toward futuristic spectacles rather than China’s traditionally grand, massive historical epics. At the same time, The Wandering Earth feels like a throwback to a few familiar eras of American filmmaking. While the film’s cast, setting, and tone are all Chinese, longtime science fiction fans are going to see a lot on the screen that reminds them of other movies, for better or worse.
#'''

#tr4w = TextRank4Keyword()
#tr4w.analyze(text, candidate_pos = ['NOUN', 'PROPN', 'VERB'], window_size=5, lower=True)
#tr4w.get_keywords(10)

In [10]:
# LexRank implementation, from: https://github.com/UKPLab/sentence-transformers/blob/8a87467870a43b5662372366d8433f8a1f017417/examples/applications/text-summarization/LexRank.py
import numpy as np
from scipy.sparse.csgraph import connected_components

def degree_centrality_scores(
    similarity_matrix,
    threshold=None,
    increase_power=True,
):
    """Compute LexRank centrality scores from a similarity matrix.

    Args:
        similarity_matrix: square matrix of pairwise similarities.
        threshold: ``None`` to use the similarities as continuous weights, or
            a ``float`` in ``[0, 1)`` to binarize them first (entries greater
            than or equal to the threshold become 1, all others 0).
        increase_power: forwarded to ``stationary_distribution``.

    Returns:
        The un-normalized stationary distribution, one score per row.

    Raises:
        ValueError: if ``threshold`` is neither ``None`` nor a float in [0, 1).
    """
    if threshold is None:
        markov_matrix = create_markov_matrix(similarity_matrix)
    else:
        # Only real Python floats inside [0, 1) are accepted as thresholds.
        threshold_is_valid = isinstance(threshold, float) and 0 <= threshold < 1
        if not threshold_is_valid:
            raise ValueError(
                "'threshold' should be a floating-point number "
                "from the interval [0, 1) or None",
            )
        markov_matrix = create_markov_matrix_discrete(
            similarity_matrix,
            threshold,
        )

    return stationary_distribution(
        markov_matrix,
        increase_power=increase_power,
        normalized=False,
    )


def _power_method(transition_matrix, increase_power=True):
    eigenvector = np.ones(len(transition_matrix))

    if len(eigenvector) == 1:
        return eigenvector

    transition = transition_matrix.transpose()

    while True:
        eigenvector_next = np.dot(transition, eigenvector)

        if np.allclose(eigenvector_next, eigenvector):
            return eigenvector_next

        eigenvector = eigenvector_next

        if increase_power:
            transition = np.dot(transition, transition)


def connected_nodes(matrix):
    """Group the node indices of ``matrix`` by connected component.

    Returns:
        A list with one index array per component label, in ascending order
        of the labels assigned by ``connected_components``.
    """
    _, component_labels = connected_components(matrix)
    return [
        np.where(component_labels == label)[0]
        for label in np.unique(component_labels)
    ]


def create_markov_matrix(weights_matrix):
    """Row-normalize a square weight matrix.

    Every row is divided by its own sum.

    Raises:
        ValueError: if ``weights_matrix`` is not square.
    """
    row_count, column_count = weights_matrix.shape
    if row_count != column_count:
        raise ValueError("'weights_matrix' should be square")

    # keepdims keeps the sums as a column so the division broadcasts per row.
    # NOTE(review): a row whose weights sum to zero is divided by zero here.
    return weights_matrix / weights_matrix.sum(axis=1, keepdims=True)


def create_markov_matrix_discrete(weights_matrix, threshold):
    """Binarize the weights at ``threshold`` and row-normalize the result.

    Entries greater than or equal to ``threshold`` become 1, all others 0;
    the binary matrix is then handed to ``create_markov_matrix``.
    """
    binary_weights = np.where(weights_matrix >= threshold, 1.0, 0.0)
    return create_markov_matrix(binary_weights)


def graph_nodes_clusters(transition_matrix, increase_power=True):
    """Score the nodes of every connected component, largest component first.

    Returns:
        A tuple ``(clusters, centroid_scores)``: the index arrays of the
        connected components sorted by size (descending), and for each of
        them the power-method vector divided by the component size.
    """
    clusters = sorted(connected_nodes(transition_matrix), key=len, reverse=True)

    centroid_scores = []
    for members in clusters:
        # Restrict the transition matrix to the rows/columns of this component.
        component_matrix = transition_matrix[np.ix_(members, members)]
        component_vector = _power_method(component_matrix, increase_power=increase_power)
        centroid_scores.append(component_vector / len(members))

    return clusters, centroid_scores


def stationary_distribution(
    transition_matrix,
    increase_power=True,
    normalized=True,
):
    """Compute the power-method vector for every connected component.

    Args:
        transition_matrix: square transition matrix.
        increase_power: forwarded to ``_power_method``.
        normalized: if true, the result is divided by the number of nodes.

    Returns:
        A 1-D array with one value per node.

    Raises:
        ValueError: if ``transition_matrix`` is not square.
    """
    node_count, column_count = transition_matrix.shape
    if node_count != column_count:
        raise ValueError("'transition_matrix' should be square")

    distribution = np.zeros(node_count)

    # Each connected component is solved on its own sub-matrix and written
    # back to the positions of its member nodes.
    for members in connected_nodes(transition_matrix):
        component_matrix = transition_matrix[np.ix_(members, members)]
        distribution[members] = _power_method(component_matrix, increase_power=increase_power)

    if not normalized:
        return distribution
    return distribution / node_count

## Map Creator 

In [11]:
# ENTITY
class MapCreator(BaseModel):
    '''
    MapCreator: data container for the state of the map creation pipeline.

    It holds the interactive document corpus together with the data derived
    from it: the paragraph corpus, the document <-> paragraph index mappings
    and the vectors and labels on the contextualized-word, document, paragraph
    and topic level. MapCreatorRepositoryImpl fills most of these fields step
    by step; all of them default to None until the respective step has run.
    '''
    # Class Variables
    # corpus of interactive documents, set by load_interactive_document_corpus
    interactive_document_corpus: List[Any] = None
    # one text string per paragraph, over all documents
    paragraph_corpus: List[str] = None
    # doc_2_paragraph_index[document_id] -> list of paragraph indices
    doc_2_paragraph_index: Any
    # paragraph_2_doc_index[paragraph_id] -> document index
    paragraph_2_doc_index: Any
    # contextualized word level: vectors, one label per vector and the index of the source document
    contextualized_word_vectors: Any
    contextualized_word_labels: List[str] = None
    contextualized_word_2_doc_index: List[int] = None
    # NOTE(review): the *_map fields are not written in this part of the file - presumably filled by a later step
    contextualized_word_map: Any
    # document level: one vector per document summary, labelled with the document index as string
    document_vectors: Any
    document_labels: List[str] = None
    document_map: Any
    # paragraph level: one vector per paragraph
    # NOTE(review): create_paragraph_vectors stores the document indices as they are (not as strings) - confirm
    paragraph_vectors: Any
    paragraph_labels: List[str] = None
    paragraph_map: Any
    # topic level: topic embeddings and their labels ('-1', '0', '1', ...)
    topic_vectors: Any
    topic_labels: List[str] = None
    # NOTE(review): create_topic_model keeps its topic corpus on the repository object, not in this field - confirm
    topic_corpus: List[Any] = None
    semantic_map: Any 
    
        
# REPOSITORY
class MapCreatorRepository(ABC):
    """Abstract interface of a repository that works on a MapCreator data object."""

    # NOTE(review): MapCreatorRepositoryImpl declares these two parameters in the
    # opposite order (uri first, method second) - align one of the two.
    @abstractmethod
    def load_interactive_document_corpus(self, method:str, interactive_document_corpus_uri:str):
        """Load the interactive document corpus from the given uri with the given method."""

    @abstractmethod
    def doc_id_2_paragraph_ids(self, interactive_document_id:int):
        """Map a document id to the indices of its paragraphs."""

    @abstractmethod
    def paragraph_id_2_doc_id(self, paragraph_id:int):
        """Map a paragraph index to the index of its document."""

    @abstractmethod
    def create_paragraph_embedding(self, paragraph:str):
        """Create the embedding of a single paragraph."""

    @abstractmethod
    def create_word_embeddings(self, paragraph:str, keywords:List[str]):
        """Create the word embeddings for the keywords of a paragraph."""

    @abstractmethod
    def get_topic_vectors(self):
        """Return the topic vectors."""

    @abstractmethod
    def get_document_vectors(self):
        """Return the document vectors."""

    @abstractmethod
    def get_paragraph_vectors(self):
        """Return the paragraph vectors."""

    @abstractmethod
    def get_contextualized_word_vectors(self):
        """Return the contextualized word vectors."""

    # NOTE(review): unlike its siblings this method is not marked @abstractmethod,
    # so subclasses are not forced to implement it - confirm this is intended.
    def get_semantic_map(self):
        """Return the semantic map; this base version returns None."""
    # ... more to come ...
    
# REPOSITORY IMPLEMENTATION
class MapCreatorRepositoryImpl(MapCreatorRepository):
    # Constructor / if needed, initialize instance attributes here
    def __init__(self, map_creator_data_object: MapCreator, device: str = None):
        """Set up the text processor and the sentence embedding model.

        Args:
            map_creator_data_object: the MapCreator data object this repository
                reads from and writes to.
            device: device for the sentence transformer (e.g. "cuda" or "cpu").
                If None, "cuda" is used when available and "cpu" otherwise, so
                the repository can also be created on machines without a GPU.
        """
        self.data_object = map_creator_data_object
        # initialize text processor
        self.text_processor = spacy.load('en_core_web_sm')
        print('Textprocessor: SpaCy initialized.') 
        # pick the device: the previously hard-coded "cuda" fails without a GPU
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        # initialize the Sentence Transformer for the Embedding Model
        # (alternatives tried before: allenai_scibert_model_dir, 'all-mpnet-base-v2')
        self.sentence_model = SentenceTransformer(allenai_specter_model_dir, device=device)
        self.bert_tokenizer = AutoTokenizer.from_pretrained(allenai_specter_model_dir, do_lower_case=True)
        # change the maximum sequence length to 512
        self.sentence_model.max_seq_length = 512
        print("Sentence Transformer Max Sequence Length: ", self.sentence_model.max_seq_length)
        # change tokenizer to allen_ai_specter tokenizer
        self.sentence_model.tokenizer = self.bert_tokenizer
        print("Sentence Transformer Tokenizer: ", self.sentence_model.tokenizer)
        print('Sentence Transformer: allenai-specter initialized.')
        print('{0} initialized.'.format(self.__class__.__name__))
    
    # override methods
    #@override
    def load_interactive_document_corpus(self, interactive_document_corpus_uri:str, method:str):
        """Load the interactive document corpus and attach it to the data object.

        Args:
            interactive_document_corpus_uri: location of the corpus.
            method: 'load_from_file' loads the corpus from a file;
                'load_from_web_resource' is not implemented yet.

        Returns:
            The loaded corpus, or an empty list for an unimplemented or
            unknown method (a message is printed in both cases).
        """
        if method == 'load_from_web_resource':
            print('method not implemented yet.')
            return []
        if method != 'load_from_file':
            print('ERROR: No valid method provided.')
            return []

        print('... loading interactive_document_corpus from: {0}'.format(interactive_document_corpus_uri))
        loaded_corpus = self.load_interactive_document_corpus_from_file(interactive_document_corpus_uri)
        # keep a reference to the corpus on the map creator data object
        self.data_object.interactive_document_corpus = loaded_corpus
        return loaded_corpus
    
    #@override
    def doc_id_2_paragraph_ids(self, interactive_document_id:int):
        '''Return the list of paragraph indices that belong to the given document id.'''
        document_to_paragraphs = self.data_object.doc_2_paragraph_index
        return document_to_paragraphs[interactive_document_id]
    #@override
    def paragraph_id_2_doc_id(self, paragraph_id:int):
        '''Return the index of the document that contains the given paragraph index.'''
        paragraph_to_document = self.data_object.paragraph_2_doc_index
        return paragraph_to_document[paragraph_id]
    #@override
    def create_paragraph_embedding(self, paragraph:str):
        '''Placeholder: only prints an empty line and returns None.'''
        # stub kept to satisfy the abstract repository interface
        print('')
    #@override 
    def create_word_embeddings(self, paragraph:str, keywords:List[str]):
        '''Placeholder: only prints an empty line and returns None.'''
        # stub kept to satisfy the abstract repository interface
        print('')
    #@override
    def get_topic_vectors(self):
        '''Return the topic vectors stored on the data object.'''
        data_object = self.data_object
        return data_object.topic_vectors
    
    #@override
    def get_contextualized_word_vectors(self):
        '''Return the contextualized word vectors stored on the data object.'''
        data_object = self.data_object
        return data_object.contextualized_word_vectors
    
    #@override
    def get_document_vectors(self):
        '''Return the document vectors stored on the data object.'''
        data_object = self.data_object
        return data_object.document_vectors
    
    #@override
    def get_paragraph_vectors(self):
        '''Return the paragraph vectors stored on the data object.'''
        data_object = self.data_object
        return data_object.paragraph_vectors
    
    #@override
    def get_semantic_map(self):
        '''Return the semantic map stored on the data object.'''
        data_object = self.data_object
        return data_object.semantic_map
        
    # utility methods
    def load_interactive_document_corpus_from_file(self, corpus_txt_file_path):
        '''Read a JSON file and parse every entry into an InteractiveDocument.

        Returns the list of parsed documents, in file order.
        '''
        # load the raw objects from the json file
        with open(corpus_txt_file_path) as json_file:
            raw_documents = json.load(json_file)
        # parse every raw entry into an InteractiveDocument object
        interactive_document_corpus = [
            InteractiveDocument.parse_obj(raw_document) for raw_document in raw_documents
        ]
        print('loading interactive_document_corpus completed.')
        return interactive_document_corpus
    
    def get_paragraphs_from_doc(self, interactive_document:InteractiveDocument, paragraph_length:int):
        '''Split the preprocessed text of a document into paragraphs of tokens.

        The text is tokenized with self.text_processor, cut into chunks of at
        most paragraph_length tokens and every chunk is joined back into a
        string with single spaces.

        Returns a list of paragraph strings. It is never empty: a document
        without tokens yields one empty paragraph so that paragraph counting
        stays consistent downstream.
        '''
        parsed_text = self.text_processor(interactive_document.content['preprocessed_text'])
        # transform the parsed text into a plain list of token strings
        token_texts = [token.text for token in parsed_text]
        # cut the token list into chunks and turn each chunk back into a string
        paragraphs = [
            ' '.join(token_chunk)
            for token_chunk in self.divide_list_into_sublists(token_texts, paragraph_length)
        ]
        if not paragraphs:
            print('WARNING: no text paragraphs detected in document!')
            paragraphs.append('')
        return paragraphs
    
    def divide_list_into_sublists(self, input_list, sublist_length):
        '''Yield consecutive slices of input_list with at most sublist_length items each.'''
        chunk_starts = range(0, len(input_list), sublist_length)
        yield from (input_list[start:start + sublist_length] for start in chunk_starts)
    
    def create_paragraph_corpus_and_doc_2_par_indices(self, interactive_document_corpus, paragraph_length:int):
        '''Build the paragraph corpus and both document <-> paragraph indices.

        Args:
            interactive_document_corpus: iterable of interactive documents.
            paragraph_length: maximum number of tokens per paragraph, passed
                on to get_paragraphs_from_doc.

        Side effects (all three are reset and rebuilt on self.data_object):
            paragraph_corpus: all paragraphs of all documents, in order.
            doc_2_paragraph_index[document_id]: list of paragraph indices,
                e.g. [23, 24, 25].
            paragraph_2_doc_index[paragraph_id]: document index,
                e.g. [0, 0, 0, 0, 1, 1, 1].
        '''
        print('... creating paragraph_corpus and doc_2_par indices.')
        # 1. initialize the corpus and both indices
        paragraph_corpus = self.data_object.paragraph_corpus = []
        doc_2_paragraph_index = self.data_object.doc_2_paragraph_index = []
        paragraph_2_doc_index = self.data_object.paragraph_2_doc_index = []
        # 2. for every document build the paragraphs
        for document_index, interactive_document in enumerate(interactive_document_corpus):
            paragraphs = self.get_paragraphs_from_doc(interactive_document, paragraph_length)
            # The paragraph indices of this document start at the current corpus
            # length. Deriving them from the corpus (instead of from the last
            # index of the previous document) also works when a document
            # contributes no paragraphs at all.
            first_paragraph_index = len(paragraph_corpus)
            paragraph_corpus.extend(paragraphs)
            doc_2_paragraph_index.append(list(range(first_paragraph_index, len(paragraph_corpus))))
            # every paragraph of this document points back to the document
            paragraph_2_doc_index.extend([document_index] * len(paragraphs))
        print('DONE: paragraph corpus and doc_2_par indices successfully created.')
        
    def create_topic_model(self, top_n_words=10, calculate_probabilities=True, n_gram_range_upper_bound=1, number_of_best_matching_docs=10, \
                           extractive_summarization_method='lex_rank', abstractive_summarization_method=None): # bart_summarizer
        '''Fit a BERTopic model on the paragraph corpus and build the topic corpus.

        Args:
            top_n_words: number of words per topic, passed to BERTopic.
            calculate_probabilities: passed to BERTopic.
            n_gram_range_upper_bound: upper bound of the n-gram range (1, upper_bound).
            number_of_best_matching_docs: maximum number of best matching
                documents stored per topic.
            extractive_summarization_method: 'text_rank', 'lex_rank' or None
                (no summary is created).
            abstractive_summarization_method: if not None, the extractive
                summary is rewritten with the huggingface summarization pipeline.

        Side effects:
            self.topic_model: the fitted BERTopic model (read by
                get_topic_info and create_topic_vectors).
            self.topic_corpus: one dict per topic with the keys 'topic_index',
                'topic_name', 'topic_size', 'topic_words', 'topic_summary' and
                'best_matching_documents_indices'.
        '''
        print('... creating topic model.')
        topic_model = BERTopic(embedding_model=self.sentence_model, top_n_words=top_n_words,
                               calculate_probabilities=calculate_probabilities,
                               n_gram_range=(1, n_gram_range_upper_bound))
        # train topic model
        paragraph_to_topic_map, probabilities = topic_model.fit_transform(self.data_object.paragraph_corpus)
        # keep the fitted model on the instance: get_topic_info() and create_topic_vectors()
        # read self.topic_model, which was never assigned before
        self.topic_model = topic_model

        # collect one row per paragraph that was assigned to a real topic (topic -1 is skipped)
        topic_rows = []
        for paragraph_index, topic in enumerate(tqdm(paragraph_to_topic_map)):
            if(topic != -1):
                topic_rows.append((
                    self.data_object.paragraph_2_doc_index[paragraph_index],
                    paragraph_index,
                    topic,
                    # NOTE(review): [paragraph][topic] assumes one probability per topic and
                    # paragraph, i.e. calculate_probabilities=True - confirm for False
                    probabilities[paragraph_index][topic],
                ))
        topic_2_source_df = pd.DataFrame(topic_rows,
                       columns =['document_index', 'paragraph_index', 'topic_index', 'topic_probability'])
        # sort descending by topic_index and topic_probability, so that within
        # every topic the best matching paragraphs come first
        topic_2_source_df = topic_2_source_df.sort_values(by=['topic_index','topic_probability'], ascending=False)

        all_topics_df = topic_model.get_topic_info()
        topic_extend = all_topics_df['Topic'].tolist()
        if(min(topic_extend) != -1):
            print('ERROR: no -1 topic detected.')
        topic_max = max(topic_extend)

        topic_corpus = []
        # the summarization pipeline is expensive to build: create it at most once, on first use
        summarizer = None
        for topic_index in range(0, topic_max+1): # we need to include the topic_max index -> therefore +1
            current_topic_dataframe = topic_2_source_df[topic_2_source_df["topic_index"] == topic_index]
            current_topic_documents = current_topic_dataframe['document_index'].tolist()
            # remove duplicate documents but KEEP the probability order from the sort above
            # (a set would lose that order), then take the top n
            current_topic_best_matching_documents_indices = list(dict.fromkeys(current_topic_documents))[:number_of_best_matching_docs]
            # topic words: first element of every entry returned by get_topic
            single_topic_word_list = [word[0] for word in topic_model.get_topic(topic_index)]
            current_topic_info = all_topics_df[all_topics_df["Topic"] == topic_index].iloc[0]
            current_topic_name = current_topic_info['Name']
            current_topic_size = current_topic_info['Count']
            # create topic summary from the best matching paragraphs -> first an extractive
            # summary, optionally polished by the abstractive summarization pipeline
            current_best_matching_paragraph_indices = current_topic_dataframe['paragraph_index'].tolist()
            topic_summary = ''
            if(extractive_summarization_method is not None):
                best_matching_paragraphs_concatenated = ''
                for paragraph_index in current_best_matching_paragraph_indices[:5]: # 5 works well!
                    paragraph_text = self.data_object.paragraph_corpus[paragraph_index]
                    best_matching_paragraphs_concatenated = best_matching_paragraphs_concatenated + '. ' + paragraph_text
                # make sure we have a minimum length we can summarize
                long_enough_to_summarize = len(best_matching_paragraphs_concatenated) > 30
                if(extractive_summarization_method == 'text_rank' and long_enough_to_summarize):
                    # extractive summary using sumy: the 5 best ranked sentences
                    parser = PlaintextParser.from_string(best_matching_paragraphs_concatenated,Tokenizer("english"))
                    text_rank_summarizer = TextRankSummarizer()
                    text_rank_summary = text_rank_summarizer(parser.document,5)
                    rank_summary = ' '.join([str(sentence) for sentence in text_rank_summary])
                elif(extractive_summarization_method == 'lex_rank' and long_enough_to_summarize):
                    sentences = nltk.sent_tokenize(best_matching_paragraphs_concatenated)
                    # eliminate too short sentences
                    sentences = [sentence for sentence in sentences if len(sentence) > 2]
                    embeddings = self.sentence_model.encode(sentences, convert_to_tensor=True)
                    embeddings = embeddings.cpu()
                    cos_scores = util.cos_sim(embeddings, embeddings).numpy()
                    centrality_scores = degree_centrality_scores(cos_scores, threshold=None)
                    # the 5 most central sentences, most central first
                    most_central_sentence_indices = np.argsort(-centrality_scores)
                    lex_rank_summary_sentences_indices = most_central_sentence_indices[0:5]
                    rank_summary = ' '.join([str(sentences[index]) for index in lex_rank_summary_sentences_indices])
                else:
                    print('ERROR: invalid text summarization method given OR concatenated paragraphs to short.')
                    rank_summary = 'empty'
                if(abstractive_summarization_method is not None):
                    # abstractive summary using the huggingface summarizer pipeline
                    if(summarizer is None):
                        summarizer = pipeline("summarization") # https://huggingface.co/transformers/main_classes/pipelines.html#transformers.SummarizationPipeline
                    topic_summary = summarizer(rank_summary, min_length=40, max_length=100, do_sample=False)[0]['summary_text']
                else:
                    topic_summary = rank_summary
            # topic object in the format used by the frontend
            current_topic_object = {
            'topic_index': str(topic_index), 
            'topic_name':  str(current_topic_name),
            'topic_size': str(current_topic_size),
            'topic_words': single_topic_word_list, 
            'topic_summary': topic_summary,
            'best_matching_documents_indices': current_topic_best_matching_documents_indices
            }
            topic_corpus.append(current_topic_object)
        self.topic_corpus = topic_corpus
        print('DONE: topic model successfully created.')
        
    def get_topic_info(self, max_topic_number:int):
        '''Return the first max_topic_number rows of the topic model's topic info.'''
        return self.topic_model.get_topic_info().head(max_topic_number)
    
    def get_topic_corpus(self):
        '''Return the topic corpus stored on this repository object.'''
        return self.topic_corpus
        
    def create_topic_vectors(self):
        '''Store the topic embeddings and their labels on the data object.

        The embeddings are read from self.topic_model.topic_embeddings and
        converted into a pandas dataframe; the labels are the strings
        '-1', '0', '1', ... with one label per embedding.
        '''
        print('... creating topic vectors for corpus.')
        topic_embeddings = self.topic_model.topic_embeddings
        # labelling starts at -1
        # NOTE(review): this presumes the first embedding belongs to the -1 topic - confirm
        labels = [str(topic_id) for topic_id in range(-1, len(topic_embeddings) - 1)]
        # convert topic vectors from numpy to pandas dataframe
        self.data_object.topic_vectors = pd.DataFrame(topic_embeddings)
        self.data_object.topic_labels = labels
        print('DONE: topic vectors successfully created.')
        
    def create_document_vectors(self):
        '''Embed the summary of every document and store vectors and labels.

        Every document is represented by its summary (an empty string if the
        document has none); its label is its index in the interactive
        document corpus, as a string. Results are written to
        self.data_object.document_vectors (dataframe) and
        self.data_object.document_labels.
        '''
        print('...creating document vectors for corpus.')
        interactive_documents = self.data_object.interactive_document_corpus
        # collect the summaries, falling back to an empty summary where none exists
        summaries = []
        for interactive_document in tqdm(interactive_documents):
            has_summary = interactive_document.summary is not None
            summaries.append(interactive_document.summary if has_summary else '')
        # the label of a document is its index in the corpus
        labels = [str(document_index) for document_index in range(len(interactive_documents))]
        # embed the summaries and turn the embedding vectors into a pandas dataframe
        summary_embeddings = self.get_document_embeddings_for_corpus(summaries)
        summary_vectors_df = pd.DataFrame([embedding.numpy() for embedding in summary_embeddings])
        self.data_object.document_vectors = summary_vectors_df
        self.data_object.document_labels = labels
        print('DONE: document_vectors created successfully.')
        
    def create_paragraph_vectors(self):
        '''Embed every paragraph of the paragraph corpus and store vectors and labels.

        Every paragraph is embedded into a single vector. The label of a
        paragraph is the index of the document it belongs to, taken from
        paragraph_2_doc_index. Results are written to
        self.data_object.paragraph_vectors (dataframe) and
        self.data_object.paragraph_labels.
        '''
        print('...creating paragraph vectors for corpus.')
        paragraph_corpus = self.data_object.paragraph_corpus
        paragraph_2_doc_index = self.data_object.paragraph_2_doc_index
        # label every paragraph with the index of its source document
        paragraph_labels = [paragraph_2_doc_index[paragraph_index] for paragraph_index in range(len(paragraph_corpus))]
        # embed the paragraphs and turn the embedding vectors into a pandas dataframe
        paragraph_embeddings = self.get_document_embeddings_for_corpus(paragraph_corpus)
        paragraph_vectors_df = pd.DataFrame([embedding.numpy() for embedding in paragraph_embeddings])
        self.data_object.paragraph_vectors = paragraph_vectors_df
        self.data_object.paragraph_labels = paragraph_labels
        # the message used to say 'document_vectors' (copied from create_document_vectors)
        print('DONE: paragraph_vectors created successfully.')
        
    def create_contextualized_word_vectors(self, use_summaries_only=False):
        '''Create contextualized word vectors for the keywords of every document.

        Args:
            use_summaries_only: if true, only the document summary (cut to
                its first 300 whitespace separated words) is used as the
                single paragraph of a document; otherwise all paragraphs of
                the document from the paragraph corpus are used.

        Side effects:
            Sets contextualized_word_vectors (dataframe over all paragraphs),
            contextualized_word_labels and contextualized_word_2_doc_index on
            self.data_object, and replaces the keywords of every document by
            their lemmatized, cleaned and de-duplicated version.
        '''
        print('... creating contextualized word vectors.')
        # aggregation over all paragraphs of all documents
        embedding_frames = []
        word_labels = []
        word_2_doc_mappings = []
        for document_index, interactive_document in enumerate(tqdm(self.data_object.interactive_document_corpus)):
            document_keywords = interactive_document.keywords
            # NOTE(review): looked up even when only the summary is used below
            paragraph_indices = self.doc_id_2_paragraph_ids(document_index)
            if(use_summaries_only):
                if(interactive_document.summary != None):
                    # restrict the summary to a length of 300 words
                    summary = ' '.join(interactive_document.summary.split()[:300])
                else:
                    summary = ''
                document_paragraphs = [summary]
            else:
                document_paragraphs = [self.data_object.paragraph_corpus[paragraph_index] for paragraph_index in paragraph_indices]
            last_position = len(document_paragraphs) - 1
            # for every paragraph get the contextualized word embeddings of the document keywords
            for position, paragraph in enumerate(document_paragraphs):
                embeddings, labels = self.get_paragraph_contextualized_word_embeddings(paragraph, document_keywords)
                paragraph_frame = pd.DataFrame([embedding.numpy() for embedding in embeddings])
                # lemmatize and clean the labels
                # TODO: lemmatize and clean the labels based on SPACY and after lemmatizing and cleaning the labels ALSO lemmatize and clean the keywords of the document!!!
                labels = self.lemmatize_and_clean_labels(labels)
                # IMPORTANT: the keywords of the document are cleaned only AFTER the labels
                # of its last paragraph have been collected with the original keywords
                if(position == last_position):
                    cleaned_keywords = self.lemmatize_and_clean_labels(document_keywords)
                    # remove duplicates
                    interactive_document.keywords = list(set(cleaned_keywords))
                embedding_frames.append(paragraph_frame)
                word_labels += labels
                # every contextualized word maps back to the document it comes from
                word_2_doc_mappings += [document_index] * len(labels)
        # concatenate all the pandas data frames of the single paragraphs
        all_embeddings_df = pd.concat(embedding_frames).reset_index(drop=True)
        # save the vectors, labels and mappings from all documents on the data object
        self.data_object.contextualized_word_vectors = all_embeddings_df
        self.data_object.contextualized_word_labels = word_labels
        self.data_object.contextualized_word_2_doc_index = word_2_doc_mappings
        print('DONE: contextualized word vectors successfully created.')
        
    def get_document_embedding(self, document):
        '''Encode a single document with the sentence transformer.

        Returns the embedding tensor after moving it to the CPU.
        '''
        encoded_document = self.sentence_model.encode(document, convert_to_tensor=True)
        return encoded_document.cpu()
    
    def get_document_embeddings_for_corpus(self, corpus):
        '''Encode a whole corpus (a list of document strings) at once.

        Example corpus: ['This framework generates embeddings for each input sentence',
        'Sentences are passed as a list of string.'].
        Returns the embeddings tensor after moving it to the CPU.
        '''
        encoded_corpus = self.sentence_model.encode(corpus, convert_to_tensor=True)
        return encoded_corpus.cpu()
        
    def get_paragraph_contextualized_word_embeddings(self, paragraph, document_keywords, label_lemmatization=False):
        """Collect the token embeddings of the document keywords that occur in a paragraph.

        Args:
            paragraph: Paragraph text. The empty string yields two empty lists.
            document_keywords: Container of keyword strings; paragraph tokens are
                tested for membership in it.
            label_lemmatization: Accepted for backward compatibility; not read here.

        Returns:
            Tuple ``(contextualized_embeddings, labels)``. Both lists have the same
            length; entry ``i`` of ``labels`` is the paragraph token whose embedding
            is stored at entry ``i`` of ``contextualized_embeddings``.
        """
        # an empty paragraph cannot contain any keyword
        if paragraph == '':
            return [], []
        # 1. turn the paragraph into a list of token strings
        paragraph_tokens = [token.text for token in self.text_processor(paragraph)]
        # positions (in paragraph_tokens) of every token that is a document keyword
        keyword_indices = self.get_keyword_indices_in_paragraph(paragraph_tokens, document_keywords)
        # tokenize the paragraph for the BERT encoder (truncated/padded to 512 ids)
        ids = self.bert_tokenizer.encode(paragraph, add_special_tokens = True, truncation = True, padding = "max_length", max_length=512)
        sub_tokens = self.bert_tokenizer.convert_ids_to_tokens(ids)
        # group the sub-token positions by whole word
        full_word_indices = self.get_full_word_indices_from_tokens(paragraph_tokens, sub_tokens)
        # NOTE(review): keyword_index counts tokens from self.text_processor, while
        # full_word_indices is built from sub_tokens that include the special tokens
        # added above -- confirm that both index spaces line up.
        # NOTE(review): the embeddings come from self.sentence_model, the sub tokens
        # from self.bert_tokenizer -- presumably the same vocabulary; verify.
        all_contextualized_word_embeddings = self.get_contextualized_word_embeddings_for_paragraph(paragraph).cpu()
        contextualized_embeddings = []
        labels = []
        for keyword_index in keyword_indices:
            try:
                keyword_subtoken_list = full_word_indices[keyword_index]
            except IndexError:
                # only a lookup past the end is expected here; anything else should surface
                print('WARNING: cound not find keyword in abstract. this happens due to the fact that the embedding length of an abstract is restricted to max_length=512 tokens')
                continue
            # An empty list can happen e.g. in the ABSTRACTS/SUMMARIES only version: more
            # keywords are detected in the document than can be found in the abstract.
            if not keyword_subtoken_list:
                continue
            # the last sub token represents the whole word
            representative_token = keyword_subtoken_list[-1]
            try:
                contextualized_embedding = all_contextualized_word_embeddings[representative_token]
            except IndexError:
                print('error:')
                print(keyword_index)
                print(keyword_subtoken_list)
                print(full_word_indices)
                continue
            contextualized_embeddings.append(contextualized_embedding)
            labels.append(paragraph_tokens[keyword_index])

        return contextualized_embeddings, labels
    
    def get_keyword_indices_in_paragraph(self, paragraph_tokens, document_keywords):
        """Return the positions of all paragraph tokens that are document keywords.

        Args:
            paragraph_tokens: Sequence of token strings.
            document_keywords: Container tested with ``in`` for each token.

        Returns:
            List of indices into ``paragraph_tokens``, in ascending order.
        """
        return [
            position
            for position, token in enumerate(paragraph_tokens)
            if token in document_keywords
        ]
     
    def get_full_word_indices_from_tokens(self, full_tokens, sub_tokens):
        '''
        Group the positions of BERT sub tokens by the whole word they belong to.

        full_tokens: single word tokens produced by self.text_processor (not read by this method)
        sub_tokens: sub word tokens produced by BERTTokenizer
        returns: list of 512 lists; entry k holds the positions (in sub_tokens) of the
                 sub tokens that form the k-th word, unused entries stay empty
        '''
        # subtoken to token mapping found here: https://github.com/tensorflow/text/issues/275
        slot_count = 512  # fixed number of word slots (the BERT encoder max length)
        word_slots = [[] for _ in range(slot_count)]
        # starts at -1 because the position is advanced before the first slot is written
        word_position = -1
        for position, piece in enumerate(sub_tokens):
            is_continuation = piece[:2] == '##'
            if not is_continuation:
                # a piece without the '##' prefix opens a new word
                word_position += 1
            word_slots[word_position].append(position)
        return word_slots
    
    def get_contextualized_word_embeddings_for_paragraph(self, paragraph):
        """Return the token-level embeddings of a paragraph.

        Delegates to ``self.sentence_model.encode`` with
        ``convert_to_tensor=True`` and ``output_value='token_embeddings'``.
        """
        return self.sentence_model.encode(
            paragraph,
            convert_to_tensor=True,
            output_value='token_embeddings',
        )
    
    def lemmatize_and_clean_labels(self, labels):
        """Strip the '##' sub-token markers from labels and lemmatize them.

        Args:
            labels: Iterable of label strings.

        Returns:
            List with one cleaned label per input label, in the same order.
        """
        wordnet = WordNetLemmatizer()
        # remove the hashtags first, then lemmatize what is left
        # (alternative kept for reference: spaCy lemmatization, token.lemma.lower() per token)
        return [wordnet.lemmatize(raw_label.replace('##', '')) for raw_label in labels]
    
    
    def create_topic_map(self):
        """Placeholder: only announces the step, no topic map is built yet."""
        # TODO!
        announcement = '... creating topic map.'
        print(announcement)
    
    def create_document_map(self):
        """Reduce the document vectors with UMAP and store the result as the document map.

        Reads ``document_vectors`` and ``document_labels`` from ``self.data_object``
        and writes ``self.data_object.document_map``. Every row gets the marker
        ``-999`` in the ``word_2_doc_index`` column.

        Raises:
            AssertionError: if the document vectors frame is empty.
        """
        print('... creating document map')
        data = self.data_object
        assert not data.document_vectors.empty, "ERROR: no document_vectors created, yet."
        document_map_df = self.umap_dimensionality_reduction(data.document_vectors)
        # -999 is the placeholder that marks documents; it is highly unlikely to
        # collide with a topic level
        document_map_df['word_2_doc_index'] = [-999] * len(data.document_labels)
        data.document_map = document_map_df
        print('DONE: document map successfully created.')
        
    def create_paragraph_map(self):
        """Reduce the paragraph vectors with UMAP and store the result as the paragraph map.

        Reads ``paragraph_vectors``, ``paragraph_labels`` and ``paragraph_corpus``
        from ``self.data_object`` and writes ``self.data_object.paragraph_map``. The
        map gets a ``paragraph_2_doc_index`` column (the paragraph labels) and a
        ``paragraph_keywords`` column (the result of ``get_keywords(2)`` per paragraph).

        Raises:
            AssertionError: if the paragraph vectors frame is empty.
        """
        print('... creating paragraph map.')
        # the message now names the frame that is actually checked
        assert not self.data_object.paragraph_vectors.empty, "ERROR: no paragraph_vectors created, yet."
        paragraph_vectors_df = self.data_object.paragraph_vectors
        umap_reduced_paragraph_map_df = self.umap_dimensionality_reduction(paragraph_vectors_df)
        umap_reduced_paragraph_map_df['paragraph_2_doc_index'] = self.data_object.paragraph_labels
        # for each paragraph get the 2 most important keywords
        number_of_keywords_per_paragraph = 2
        tr4w = TextRank4Keyword()
        all_paragraph_keywords = []
        for paragraph in self.data_object.paragraph_corpus:
            tr4w.analyze(paragraph, candidate_pos = ['NOUN', 'PROPN', 'VERB'], window_size=5, lower=True)
            all_paragraph_keywords.append(tr4w.get_keywords(number_of_keywords_per_paragraph))
        umap_reduced_paragraph_map_df['paragraph_keywords'] = all_paragraph_keywords
        self.data_object.paragraph_map = umap_reduced_paragraph_map_df
        print('DONE: paragraph map successfully created.')
        
    def create_contextualized_word_map(self):
        """Reduce the contextualized word vectors with UMAP and store the word map.

        The reduced frame gets a ``label`` column (the word labels) and a
        ``word_2_doc_index`` column (the word-to-document mapping) and is written
        to ``self.data_object.contextualized_word_map``.

        Raises:
            AssertionError: if the contextualized word vectors frame is empty.
        """
        print('...creating contextualized_word_map.')
        data = self.data_object
        assert not data.contextualized_word_vectors.empty, "ERROR: no contextualized_word_vectors created, yet."
        word_map_df = self.umap_dimensionality_reduction(data.contextualized_word_vectors)
        word_map_df['label'] = data.contextualized_word_labels
        word_map_df['word_2_doc_index'] = data.contextualized_word_2_doc_index
        data.contextualized_word_map = word_map_df
        print('DONE: contextualized word map successfully created.')
        
    def create_semantic_map(self):
        """Build one UMAP map out of topic, document and contextualized word vectors.

        The three vector frames are stacked (topics, then documents, then words)
        and reduced in a single step. The result gets a ``label`` column and a
        ``word_2_doc_index`` column and is stored in ``self.data_object.semantic_map``.

        ``word_2_doc_index`` holds ``-1`` for topic rows, ``-999`` for document rows
        and the word-to-document mapping for word rows.

        Raises:
            AssertionError: if any of the three vector frames is empty.
        """
        data = self.data_object
        # check if we have all the relevant object variables
        assert not data.topic_vectors.empty, "ERROR: no topic_vectors created, yet."
        assert not data.contextualized_word_vectors.empty, "ERROR: no contextualized_word_vectors created, yet."
        assert not data.document_vectors.empty, "ERROR: no document_vectors created, yet."
        print('... creating semantic map.')
        # stack all vectors on top of each other so they are reduced together in one map
        stacked_vectors_df = pd.concat(
            [data.topic_vectors, data.document_vectors, data.contextualized_word_vectors]
        ).reset_index(drop=True)
        semantic_map_df = self.umap_dimensionality_reduction(stacked_vectors_df)
        # labels follow the same stacking order as the vectors
        semantic_map_df['label'] = data.topic_labels + data.document_labels + data.contextualized_word_labels
        # marker column used later for QUANTIZATION:
        # topics do not belong to a document -> -1
        topic_markers = [-1] * len(data.topic_labels)
        # documents are marked with -999, a value highly unlikely to collide with a topic level
        document_markers = [-999] * len(data.document_labels)
        semantic_map_df['word_2_doc_index'] = topic_markers + document_markers + data.contextualized_word_2_doc_index
        # store the created semantic map in the data_object variables
        data.semantic_map = semantic_map_df
        print('DONE: semantic map successfully created.')
        
        
    def umap_dimensionality_reduction(self, dataframe, output_metric='haversine'):
        """Reduce ``dataframe`` to two dimensions with UMAP.

        Args:
            dataframe: Vectors to reduce; passed unchanged to ``UMAP.fit``.
            output_metric: ``'haversine'`` (default) embeds onto a sphere and returns
                the columns ``['lat', 'lon']`` in degrees; ``'euclidean'`` embeds into
                the plane and returns the columns ``['x', 'y']``.

        Returns:
            A new pandas DataFrame with one row per input row. For any other
            ``output_metric`` an error is printed and an empty frame with the columns
            ``['0', '1']`` is returned.
        """
        if(output_metric == 'haversine'):
            print('UMAP: dim reduction using HAVERSINE output metric.')
            umap_model = umap.UMAP(n_neighbors=15, min_dist=0.0, metric='cosine', output_metric='haversine', random_state=42)
            umap_model.fit(dataframe)
            # spherical projection from here: https://umap-learn.readthedocs.io/en/latest/embedding_space.html#spherical-embeddings
            # formulas from here: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates#:~:text=In%20order%20to%20match%20the,a%20longitude%20of%200%C2%B0.
            theta = umap_model.embedding_[:, 0]
            phi = umap_model.embedding_[:, 1]
            x = np.sin(theta) * np.cos(phi)
            y = np.sin(theta) * np.sin(phi)
            z = np.cos(theta)
            # IMPORTANT: arctan2(x, y) and not arctan2(y, x), otherwise the points are
            # mirrored and the projection in the frontend will not work. Range: [-pi, pi].
            lon = np.arctan2(x, y)
            # arccos yields [0, pi]; shifting by pi/2 gives the latitude range [-pi/2, pi/2]
            lat = np.arccos(z) - math.pi/2
            # Convert rad to degree with full-precision pi. The former constant 3.14159
            # mapped pi to 180.000152 and pi/2 to 90.000076, i.e. outside the valid range.
            lon = np.degrees(lon)
            lat = np.degrees(lat)
            ## truncating numpy floats so that later rounding in pandas behaves, found here: https://stackoverflow.com/questions/42021972/truncating-decimal-digits-numpy-array-of-floats
            lon = self.trunc(lon, decs=6)
            lat = self.trunc(lat, decs=6)
            # store in dataframe
            umap_embedding_df = pd.DataFrame({'lat': lat, 'lon': lon})
        elif(output_metric == 'euclidean'):
            print('UMAP: dim reduction using EUCLIDEAN output metric.')
            umap_model = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, metric='cosine', random_state=42)
            umap_model.fit(dataframe)
            umap_embedding_df = pd.DataFrame({'x': umap_model.embedding_.T[0], 'y': umap_model.embedding_.T[1]})
        else:
            umap_embedding_df = pd.DataFrame(columns=['0', '1'])
            print('ERROR: no valid metric provided for umap dimensionality reduction. CHOOSE: haversine or euclidean')
        return umap_embedding_df
    
    def plot_dataframe(self, dataframe):
        """Plot three UMAP views of ``dataframe`` with matplotlib.

        Figure 0 shows the haversine embedding on a 3D sphere, figure 1 the same
        points as lon/lat in degrees, figure 2 a separate planar 2D UMAP embedding.

        Args:
            dataframe: Vectors to embed; passed unchanged to ``UMAP.fit``.
        """
        print('... plotting dataframe.')
        umap_model = umap.UMAP(n_neighbors=15, min_dist=0.0, metric='cosine', output_metric='haversine', random_state=42)
        umap_model.fit(dataframe)
        # spherical projection from here: https://umap-learn.readthedocs.io/en/latest/embedding_space.html#spherical-embeddings
        # v/phi/latitude, u/theta/longitute, phi/theta/rho are just another name for latitude, longitude, and altitude.
        ## formulas from here: https://vvvv.org/blog/polar-spherical-and-geographic-coordinates#:~:text=In%20order%20to%20match%20the,a%20longitude%20of%200%C2%B0.
        v = umap_model.embedding_[:, 0]
        u = umap_model.embedding_[:, 1]
        x = np.sin(v) * np.cos(u) # sin(v) * cos(u)
        y = np.sin(v) * np.sin(u) # sin(v) * sin(u)
        z = np.cos(v) # cos(v)
        # define plot sizes
        plt.rcParams["figure.figsize"] = (15,15)
        # plot on 3D sphere, like found here: https://umap-learn.readthedocs.io/en/latest/embedding_space.html
        fig = plt.figure(0)
        ax = fig.add_subplot(111, projection='3d')
        # x and y are swapped on purpose to match the coordinate system of earth.
        # No cmap is passed: without a `c` argument a colormap has no effect.
        ax.scatter(y, x, z)
        ax.set_xlabel('Y axis')
        ax.set_ylabel('X axis')
        ax.set_zlabel('Z axis')
        # plot 2D mapping of 3D sphere
        # IMPORTANT: arctan2(x, y) and not arctan2(y, x), otherwise the points are
        # mirrored and the projection in the frontend will not work. Range: [-pi, pi].
        lon = np.arctan2(x, y)
        # arccos yields [0, pi]; shifting by pi/2 gives the latitude range [-pi/2, pi/2]
        lat = np.arccos(z) - math.pi/2
        # Convert rad to degree with full-precision pi (the former constant 3.14159
        # pushed the extremes slightly past +-180 / +-90).
        lon = np.degrees(lon)
        lat = np.degrees(lat)
        ## truncating numpy floats, found here: https://stackoverflow.com/questions/42021972/truncating-decimal-digits-numpy-array-of-floats
        lon = self.trunc(lon, decs=6)
        lat = self.trunc(lat, decs=6)
        plt.figure(1)
        plt.scatter(lon, lat)
        plt.xlabel('lon-axis', fontsize=18)
        plt.ylabel('lat-axis', fontsize=16)
        # plot 2d plane
        umap_plane_model = umap.UMAP(n_neighbors=15, min_dist=0.0, n_components=2, metric='cosine', random_state=42).fit(dataframe)
        plt.figure(2)
        plt.scatter(umap_plane_model.embedding_.T[0], umap_plane_model.embedding_.T[1])
     
    def get_quantized_corpus(self, number_of_decimals:int):
        '''
        Quantize the contextualized word map per document.

        number_of_decimals: lat and lon of every point are rounded to this number of
                            decimals; points of the same document that share the rounded
                            position are merged into one row
        returns: data frame with the columns ['word_2_doc_index', 'lat', 'lon', 'labels'],
                 where 'labels' is the list of labels merged into the row
        '''
        print('.. quantizing corpus map points to {0} number of decimals.'.format(number_of_decimals))
        precision = {'lat': number_of_decimals, 'lon': number_of_decimals}
        # QUANTIZE WORD VECTORS: round lat and lon to the given number of decimals
        rounded_word_map_df = self.get_contextualized_word_map().round(precision)
        # aggregate the labels per (document, lat, lon); the grouping also sorts the rows
        labels_per_position = rounded_word_map_df.groupby(['word_2_doc_index', 'lat', 'lon'])['label'].apply(list)
        # reset index to get the grouped columns back
        quantized_corpus_df = labels_per_position.reset_index()
        quantized_corpus_df.columns = ['word_2_doc_index', 'lat', 'lon', 'labels']
        print('DONE: corpus map points successfuyll quantized.')
        return quantized_corpus_df
    
    def get_quantized_semantic_map(self, number_of_decimals:int):
        '''
        Aggregate the word points of the semantic map to a base map.

        number_of_decimals: lat and lon of every word point are rounded to this number of
                            decimals; points that share the rounded position are merged
        returns: the frame returned by self.assign_clusters_to_points_of_df for the merged
                 points (columns ['lat', 'lon', 'labels'] before cluster assignment)
        '''
        print('.. aggregating the semantic map to a base map')
        # TODO: QUANTIZE ADAPTIVELY!!! compute the point density in every bin/degree bin and decide then to quantize lower or higher to get optimal map for visualization!!!!
        semantic_map_df = self.get_semantic_map()
        # word vectors are the rows with a non-negative word_2_doc_index
        is_word_row = semantic_map_df['word_2_doc_index'] >= 0
        word_points_df = semantic_map_df.loc[is_word_row]
        # QUANTIZE WORD VECTORS: round lat and lon to the given number of decimals
        word_points_df = word_points_df.round({'lat': number_of_decimals, 'lon': number_of_decimals})
        # only lat, lon and label are needed from here on
        word_points_df = word_points_df[['lat', 'lon', 'label']]
        # aggregate the labels over the lat lon values and bring the grouped columns back
        base_map_df = word_points_df.groupby(['lat', 'lon'])['label'].apply(list).reset_index()
        base_map_df.columns = ['lat', 'lon', 'labels']
        # assign every point in the quantized base map to a cluster
        clustered_base_map_df = self.assign_clusters_to_points_of_df(base_map_df)
        print('DONE: semantic map successfuyll aggregated to a base map.')
        return clustered_base_map_df
    
    def get_quantized_base_map(self, number_of_decimals:int):
        '''
        Aggregate the contextualized word map to a base map.

        number_of_decimals: lat and lon of every point are rounded to this number of
                            decimals; points that share the rounded position are merged
        returns: the frame returned by self.assign_clusters_to_points_of_df for the merged
                 points (columns ['lat', 'lon', 'labels'] before cluster assignment)
        '''
        print('.. aggregating the contextualized word map to a base map')
        # TODO: QUANTIZE ADAPTIVELY!!! compute the point density in every bin/degree bin and decide then to quantize lower or higher to get optimal map for visualization!!!!
        precision = {'lat': number_of_decimals, 'lon': number_of_decimals}
        # QUANTIZE WORD VECTORS: round lat and lon to the given number of decimals
        rounded_word_map_df = self.get_contextualized_word_map().round(precision)
        # only lat, lon and label are needed; aggregate the labels over the lat lon values
        labels_per_position = rounded_word_map_df[['lat', 'lon', 'label']].groupby(['lat', 'lon'])['label'].apply(list)
        # reset index to get the grouped columns back
        base_map_df = labels_per_position.reset_index()
        base_map_df.columns = ['lat', 'lon', 'labels']
        # assign every point in the quantized base map to a cluster
        clustered_base_map_df = self.assign_clusters_to_points_of_df(base_map_df)
        print('DONE: contextualized word map successfuyll aggregated to a base map.')
        return clustered_base_map_df
    
    def assign_clusters_to_points_of_df(self, dataframe):
        '''
        Cluster the points of a data frame with HDBSCAN (haversine metric).

        dataframe: a pandas data frame with the columns 'lat' and 'lon' in degrees
        returns: the same data frame object, extended in place by the columns
                 'topic_label' (cluster label) and 'topic_label_probability'
        '''
        print('--- assigning clusters to points of dataframe.')
        # Build a NEW frame with the coordinates in radians. The multiplication creates
        # a fresh object, so nothing is written into a slice of `dataframe` (this used to
        # trigger pandas' SettingWithCopyWarning), and math.pi replaces the truncated
        # constant 3.14159.
        coordinates_df = dataframe[['lat', 'lon']] * (math.pi / 180.0)
        # apply hdbscan
        hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=10, metric='haversine', cluster_selection_method='eom', prediction_data=True)
        hdbscan_model.fit(coordinates_df)
        # append the created labels and label probabilities to the original dataframe
        dataframe['topic_label'] = hdbscan_model.labels_
        dataframe['topic_label_probability'] = hdbscan_model.probabilities_
        print('DONE: clusters successfuyll assigned to points of dataframe.')
        return dataframe
        
    def get_topic_map(self, topic_level:int):
        """Extract the topic rows of the semantic map and enrich them with topic info.

        Args:
            topic_level: Currently not used (see the TODO below).

        Returns:
            A new data frame holding the semantic-map rows whose ``word_2_doc_index``
            is ``-1``, extended by the columns ``topic_frequency``, ``topic_words``
            and ``similar_document_uris``. The latter holds, per topic, the row
            positions of up to five document vectors with the highest cosine
            similarity, ordered from lowest to highest similarity.
        """
        print('... extracting topic map out of whole semantic map.')
        semantic_map_df = self.get_semantic_map()
        # Take the topic vectors out of the semantic map. The explicit copy makes the
        # column assignments below operate on an independent frame instead of a slice
        # of the semantic map (avoids pandas' SettingWithCopyWarning).
        reduced_topic_vectors_df = semantic_map_df.loc[semantic_map_df['word_2_doc_index'] == -1].copy()
        # TODO: select the topic vectors with the correct topic level (=ranges from -1 to -n, with different number of topics per level)
        # topic ids in ascending order
        topic_list = sorted(self.topic_model.topics.keys())
        # size/frequency of every topic
        topic_frequencies = [self.topic_model.get_topic_freq(topic) for topic in topic_list]
        # words of every topic: the first element of each entry returned by get_topic
        topic_words = [
            [word[0] for word in self.topic_model.get_topic(topic)]
            for topic in topic_list
        ]
        # cosine similarity of each topic vector with each of the document vectors
        topic_vectors = self.data_object.topic_vectors.values
        document_vectors = self.data_object.document_vectors.values
        cosine_scores = util.pytorch_cos_sim(topic_vectors, document_vectors)
        # for every topic find the indices of the most similar documents, found here:
        # https://stackoverflow.com/questions/16878715/how-to-find-the-index-of-n-largest-elements-in-a-list-or-np-array-python
        all_topic_n_most_similar_doc_uris = []
        for single_top2doc_similarity_scores in cosine_scores:
            similarity_scores = np.array(single_top2doc_similarity_scores)
            # top 5; converted to a plain python list so it can be JSON serialized easily
            all_topic_n_most_similar_doc_uris.append(similarity_scores.argsort()[-5:].tolist())
        reduced_topic_vectors_df['topic_frequency'] = topic_frequencies
        reduced_topic_vectors_df['topic_words'] = topic_words
        reduced_topic_vectors_df['similar_document_uris'] = all_topic_n_most_similar_doc_uris
        print('DONE: topic map successfully extracted.')
        return reduced_topic_vectors_df
    
    def get_document_map(self):
        """Return the document map stored on ``self.data_object``.

        The stored frame is handed back as is; no quantization is applied here.
        """
        return self.data_object.document_map
    
    def get_paragraph_map(self):
        """Return the paragraph map stored on ``self.data_object`` (with progress output)."""
        print('...getting paragraph map from map creator object.')
        stored_paragraph_map = self.data_object.paragraph_map
        # QUANTIZATION: TODO: discuss if we need a quantization of the paragraph map
        print('DONE: paragraph map successfully extracted.')
        return stored_paragraph_map
    
    def get_contextualized_word_map(self):
        """Return the contextualized word map stored on ``self.data_object``."""
        return self.data_object.contextualized_word_map
    
    def assign_document_vectors_to_interactive_document_corpus(self):
        """Copy the document map coordinates onto the interactive documents.

        Row ``i`` of the document map is written to
        ``self.data_object.interactive_document_corpus[i].document_vector`` as the
        list ``[lon, lat]``.
        """
        print('...assigning document_vectors to documents in interactive_document_corpus')
        document_records = self.get_document_map().to_dict('records')
        corpus = self.data_object.interactive_document_corpus
        for position, row in enumerate(document_records):
            # coordinate format of a document: [lon, lat]
            corpus[position].document_vector = [row['lon'], row['lat']]
        print('DONE: document_vectors successfully assigned to interactive document corpus.')
        
    def assign_paragraph_vectors_to_interactive_document_corpus(self):
        """Attach every paragraph of the paragraph map to its interactive document.

        For row ``i`` of the paragraph map a dict with the keys ``paragraph_vector``
        (``[lon, lat]``) and ``paragraph_keywords`` is appended to the ``paragraphs``
        list of the document at ``self.data_object.paragraph_2_doc_index[i]``.
        """
        print('...assigning paragraph_vectors to documents in interactive_document_corpus')
        paragraph_records = self.get_paragraph_map().to_dict('records')
        for position, row in enumerate(paragraph_records):
            paragraph_entry = {
                'paragraph_vector': [row['lon'], row['lat']],
                'paragraph_keywords': row['paragraph_keywords'],
            }
            owner_index = self.data_object.paragraph_2_doc_index[position]
            self.data_object.interactive_document_corpus[owner_index].paragraphs.append(paragraph_entry)
        
    def write_dataframe_to_file(self, dataframe, file_path, js_compatible=False):
        """Write the rows of a data frame to ``file_path`` as a JSON list of records.

        Args:
            dataframe: pandas DataFrame; serialized via ``to_dict('records')``.
            file_path: Target file; it is overwritten.
            js_compatible: When True the JSON is prefixed with
                ``export const <name> = `` where ``<name>`` is the file name without
                its extension.
        """
        print('... writing dataframe to filepath: {0}'.format(file_path))
        all_df_records = dataframe.to_dict('records')
        with open(file_path, 'w') as output_file:
            if(js_compatible):
                # Strip the real extension, whatever its length. The former fixed slice
                # [:-4] was only right for 4-character extensions such as '.txt'.
                variable_name = os.path.splitext(os.path.basename(file_path))[0]
                output_file.write('export const ' + str(variable_name) + ' = ')
            json.dump(all_df_records, output_file)
        print('DONE: writing dataframe to file completed.')
        
    def write_list_to_file(self, list_to_write, file_path, js_compatible=False):
        """Write a JSON-serializable list to ``file_path``.

        Args:
            list_to_write: Object handed to ``json.dump``.
            file_path: Target file; it is overwritten.
            js_compatible: When True the JSON is prefixed with
                ``export const <name> = `` where ``<name>`` is the file name without
                its extension.
        """
        print('... writing list to filepath: {0}'.format(file_path))
        with open(file_path, 'w') as output_file:
            if(js_compatible):
                # Strip the real extension, whatever its length. The former fixed slice
                # [:-4] was only right for 4-character extensions such as '.txt'.
                variable_name = os.path.splitext(os.path.basename(file_path))[0]
                output_file.write('export const ' + str(variable_name) + ' = ')
            json.dump(list_to_write, output_file)
        print('DONE: writing list to file completed.')
        
    def write_interactive_document_corpus_to_file(self, interactive_document_corpus, file_path, remove_content=True, js_compatible=False):
        """Serialize the interactive documents to ``file_path`` as a JSON list.

        Args:
            interactive_document_corpus: Iterable of objects offering ``.dict()``.
            file_path: Target file; it is overwritten.
            remove_content: When True the ``content`` entry of every serialized
                document is replaced by ``{}`` (e.g. to reduce overhead in the
                frontend). Only the serialized copy is changed; the documents that
                were passed in keep their content.
            js_compatible: When True the JSON is prefixed with
                ``export const <name> = `` where ``<name>`` is the file name without
                its extension.
        """
        print('...writing interactive document corpus to file.')
        print('remove_content: {0}'.format(remove_content))
        serialized_interactive_document_corpus = []
        for interactive_document in interactive_document_corpus:
            serialized_interactive_document = interactive_document.dict()
            if(remove_content):
                # Blank the content in the serialized dict only. Assigning to
                # interactive_document.content (as before) destroyed the in-memory
                # corpus as a side effect of writing it to disk.
                serialized_interactive_document['content'] = {}
            serialized_interactive_document_corpus.append(serialized_interactive_document)
        with open(file_path, 'w') as output_file:
            if(js_compatible):
                # Strip the real extension, whatever its length. The former fixed slice
                # [:-4] was only right for 4-character extensions such as '.txt'.
                variable_name = os.path.splitext(os.path.basename(file_path))[0]
                output_file.write('export const ' + str(variable_name) + ' = ')
            json.dump(serialized_interactive_document_corpus, output_file)
        print('DONE: writing corpus objects list to file completed.')
    
    def trunc(self, values, decs=0):
        """Truncate (not round) ``values`` to ``decs`` decimal places.

        Args:
            values: Number or numpy array.
            decs: Number of decimal places to keep.

        Returns:
            ``values`` with everything after the ``decs``-th decimal cut off.
        """
        scale = 10**decs
        return np.trunc(values*scale)/scale
    
    
# TEST
class TestMapCreator(unittest.TestCase):
    """Smoke test: a repository implementation can be constructed around a data
    object and that data object can be serialized with ``.dict()``."""

    @classmethod
    def setUpClass(cls):
        # set all things up for the test series here
        pass

    @classmethod
    def tearDownClass(cls):
        # tear everything down after testing here
        pass

    def test_class_setup_and_serialization(self):
        # given
        interactive_document_data_object = InteractiveDocument()
        interactive_document_repository_impl = InteractiveDocumentRepositoryImpl(interactive_document_data_object)
        # when
        interactive_document_repository_impl.test()
        serialized_data_object = interactive_document_repository_impl.data_object.dict()
        print(serialized_data_object)
        # then
        # Assert on the actual serialization result instead of comparing a
        # constant with itself (which could never fail).
        # NOTE(review): assumes the data object is a pydantic model whose
        # .dict() returns a plain dict -- confirm against InteractiveDocument.
        self.assertIsInstance(serialized_data_object, dict)

# Execution

In [12]:
# Sanity check: show the installed torch version and whether CUDA is available
import torch
print(torch.__version__, torch.cuda.is_available(), sep='\n')

# Workaround for a unicode error (disabled), see https://support.prodi.gy/t/unicodeencodeerror-during-training/955/3 and https://github.com/explosion/spaCy/issues/2570
# set the language to en_US and the encoding to UTF-8
#import locale
#print(locale.getlocale())
#locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
#print(locale.getlocale())

1.7.1
True


### Interactive Document Corpus Creation

In [13]:
# Cell purpose: build the interactive document corpus from the parsed PDFs, preprocess the raw text,
# extract keywords and write the corpus (content included) to file as input for the map creation cell.

# get basic instances
preprocessing_manager_data_object = PreprocessingManager()
preprocessing_manager_repository_impl = PreprocessingManagerRepositoryImpl(preprocessing_manager_data_object)

# INTERACTIVE DOCUMENT CORPUS CREATION -------------------------------------------------
# OPTION 1 (disabled): first parse the PDFs from the folder into a .json file using the allenai science-parse library.
# This may take a while; when it is done the assert(False) below stops the cell -> then disable both lines again and go on.
# create interactive document corpus from raw pdfs
#interactive_document_corpus = preprocessing_manager_repository_impl.parse_pdfs_to_interactive_documents(pdf_files_folder)

#assert(False)

# OPTION 2 (active): load the already parsed pdfs from json
corpus_objects_list = preprocessing_manager_repository_impl.load_parsed_pdf_documents_from_json(allen_ai_parsed_output_json_file_path)
#print(len(corpus_objects_list))
#print(corpus_objects_list[50])
#assert(False)

# OPTION 2 (continued)
# parse the data from the json objects into interactive document objects
interactive_document_corpus = preprocessing_manager_repository_impl.parse_data_from_json_to_interactive_document_object(corpus_objects_list) #, corpus_doi_list)
#print(interactive_document_corpus[10].dict())
#assert(False)

# OPTION 3 (disabled): parse data from csv into interactive document objects
#interactive_document_corpus = preprocessing_manager_repository_impl.parse_data_from_csv_to_interactive_document_object(recovery_news_data_csv_file_path)
#print(interactive_document_corpus[10].dict())

#-------------------------------------------------------

# PREPROCESSING -------------------------------------------------------
# OPTIONAL: restrict the corpus size if needed
#interactive_document_corpus = interactive_document_corpus [:1000]

# preprocess the raw_text of every document
interactive_document_corpus = preprocessing_manager_repository_impl.preprocess_raw_text_in_corpus(interactive_document_corpus)
#print(interactive_document_corpus[10].content['preprocessed_text'])

# extract the keywords of every document
must_have_keyword = None # None: no keyword is requested; set e.g. 'ball' to select a specific keyword that MUST be in the list of keywords, e.g. if you have a specific interest in a certain word and its contexts
interactive_document_corpus = preprocessing_manager_repository_impl.create_keywords_in_corpus(interactive_document_corpus, must_have_keyword=must_have_keyword)
#print(interactive_document_corpus[10].keywords)

# write the interactive document corpus to file, keeping the content (remove_content=False);
# the map creation cell loads the corpus from this same file path
preprocessing_manager_repository_impl.write_interactive_document_corpus_to_file(interactive_document_corpus, interactive_document_corpus_full_file_path, remove_content=False)

preprocessing manager constructed.
PreprocessingManagerRepositoryImpl initialized.
DONE: loading parsed documents from json completed.
...parsing data from json to interactive document objects.
INFO: source 570 of json data has no raw text SECTIONS.
DONE: parse_data_from_json_to_interactive_document_object completed.
... preprocessing raw text of whole corpus.


100%|█████████████████████████████████████████| 603/603 [04:20<00:00,  2.31it/s]


DONE: preprocessing raw text of whole corpus completed.
...creating_keywords in whole corpus.


100%|█████████████████████████████████████████| 603/603 [05:18<00:00,  1.89it/s]

DONE: keywords in corpus successfully created.
...writing interactive document corpus to file.
remove_content: False
DONE: writing corpus objects list to file completed.





### Map Creation

In [14]:
# Cell purpose: load the corpus written by the corpus creation cell, build the paragraph corpus, the topic model,
# the document / paragraph / contextualized word vectors and their maps, assign the vectors to the corpus and
# write the topic corpus, the quantized corpus points, the quantized base map and the corpus (without content) to file.

# get map creator instance and data object
map_creator_data_object = MapCreator()
map_creator_repository_impl = MapCreatorRepositoryImpl(map_creator_data_object)

# load the interactive_document_corpus from the file written by the corpus creation cell
interactive_document_corpus = map_creator_repository_impl.load_interactive_document_corpus(interactive_document_corpus_full_file_path, "load_from_file")

# map interactive_document_corpus to paragraph_corpus and build the doc2par and par2doc indices
map_creator_repository_impl.create_paragraph_corpus_and_doc_2_par_indices(interactive_document_corpus, paragraph_max_length)
#print(map_creator_repository_impl.data_object.paragraph_corpus[0])
#print(map_creator_repository_impl.data_object.doc_2_paragraph_index[0])
#print(map_creator_repository_impl.data_object.paragraph_2_doc_index[0])

#assert(False)

# create topic model from paragraph corpus
# top_n_words: the number of top n words to give back for each topic
top_n_words = 20 
map_creator_repository_impl.create_topic_model(top_n_words)
#max_topic_number = 5
#info_df = map_creator_repository_impl.get_topic_info(max_topic_number)
#print(info_df)

# create and get topic embedding vectors (disabled)
#map_creator_repository_impl.create_topic_vectors()
#topic_vectors_df = map_creator_repository_impl.get_topic_vectors()
#print(topic_vectors[0])
#print(len(map_creator_repository_impl.data_object.topic_labels)) 

# create document_vectors
map_creator_repository_impl.create_document_vectors()
#document_vectors = map_creator_repository_impl.get_document_vectors()
#print(document_vectors.shape)
#print(document_vectors[0])
#print(len(map_creator_repository_impl.data_object.document_vectors))

# create paragraph_vectors
map_creator_repository_impl.create_paragraph_vectors()
#paragraph_vectors = map_creator_repository_impl.get_paragraph_vectors()
#print(paragraph_vectors.shape)

# create and get contextualized word embedding vectors
map_creator_repository_impl.create_contextualized_word_vectors(use_summaries_only=True)
#contextualized_word_vectors = map_creator_repository_impl.get_contextualized_word_vectors()
#print(contextualized_word_vectors.shape)
#print(contextualized_word_vectors[0])
#print(len(map_creator_repository_impl.data_object.contextualized_word_labels))
#print(len(map_creator_repository_impl.data_object.contextualized_word_2_doc_index))

# SANITY CHECK (disabled): plot points on the 3D sphere or the 2D projection of the sphere to check that everything worked out
# plot the semantic map on the 3d sphere, the 2d sphere projection and the 2d plane for VERIFICATION
#topic_vectors_df = map_creator_repository_impl.get_topic_vectors()
#print(topic_vectors_df.shape)
#contextualized_word_vectors_df = map_creator_repository_impl.get_contextualized_word_vectors()
#print(contextualized_word_vectors_df.shape)
#document_vectors_df = map_creator_repository_impl.get_document_vectors()
#print(document_vectors_df.shape)
#paragraph_vectors_df = map_creator_repository_impl.get_paragraph_vectors()
#print(paragraph_vectors_df.shape)
#semantic_map_df = map_creator_repository_impl.get_semantic_map()
#print(map_creator_repository_impl.data_object.semantic_map.shape)
#map_creator_repository_impl.plot_dataframe(paragraph_vectors_df)

# create document map
map_creator_repository_impl.create_document_map()
#document_map = map_creator_repository_impl.get_document_map()
#print(document_map.head())

# create paragraph map
map_creator_repository_impl.create_paragraph_map()
#paragraph_map = map_creator_repository_impl.get_paragraph_map()
#print(paragraph_map.head())
#map_creator_repository_impl.write_dataframe_to_file(paragraph_map, interactive_document_corpus_paragraph_map_points_file_path)

# create contextualized word map
map_creator_repository_impl.create_contextualized_word_map()
#contextualized_word_map = map_creator_repository_impl.get_contextualized_word_map()
#print(contextualized_word_map.head())

#assert(False) # break point

# get topic map and write topic map to file (disabled)
#topic_level = -1
#topic_map = map_creator_repository_impl.get_topic_map(topic_level)
#map_creator_repository_impl.write_dataframe_to_file(topic_map, interactive_document_corpus_topic_map_points_file_path)

# assign the document vectors and the paragraph vectors to the interactive document corpus
#document_map = map_creator_repository_impl.get_document_map()
#print(document_map.head())
map_creator_repository_impl.assign_document_vectors_to_interactive_document_corpus()
map_creator_repository_impl.assign_paragraph_vectors_to_interactive_document_corpus()
#interactive_document_corpus = map_creator_repository_impl.data_object.interactive_document_corpus
#print(interactive_document_corpus[0])

# get topic corpus and write topic corpus to file
topic_corpus = map_creator_repository_impl.get_topic_corpus()
map_creator_repository_impl.write_list_to_file(topic_corpus, interactive_document_corpus_topic_corpus_file_path, js_compatible=True)

# get quantized corpus and write corpus to file 
number_of_decimals = 0 # 0, 1 or 2 work well -> depends on how many documents you want to map.
corpus_points = map_creator_repository_impl.get_quantized_corpus(number_of_decimals)
map_creator_repository_impl.write_dataframe_to_file(corpus_points, interactive_document_corpus_corpus_points_file_path, js_compatible=True)

# get quantized base map (topic clusters already assigned in quantization process) and write to file
number_of_decimals = 0 # 0, 1 or 2 work well -> depends on how many documents you want to map.
base_map = map_creator_repository_impl.get_quantized_base_map(number_of_decimals)
#print(base_map.head(25))
#print(base_map.tail(25))
map_creator_repository_impl.write_dataframe_to_file(base_map, interactive_document_corpus_base_map_points_file_path, js_compatible=True)

# write interactive_document_corpus to file for final usage in the frontend;
# the corpus is taken from the map creator data object and written without content (remove_content=True)
interactive_document_corpus = map_creator_repository_impl.data_object.interactive_document_corpus
map_creator_repository_impl.write_interactive_document_corpus_to_file(interactive_document_corpus, interactive_document_corpus_file_path, remove_content=True, js_compatible=True) 



Textprocessor: SpaCy initialized.
Sentence Transformer Max Sequence Length:  512
Sentence Transformer Tokenizer:  PreTrainedTokenizerFast(name_or_path='/mnt/local/2022_EMNLP_KeywordScape_Visual_Document_Exploration_using_Contextualized_Word_Embeddings/data/allenai_specter/', vocab_size=31116, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
Sentence Transformer: allenai-specter initialized.
MapCreatorRepositoryImpl initialized.
... loading interactive_document_corpus from: /mnt/local/2022_EMNLP_KeywordScape_Visual_Document_Exploration_using_Contextualized_Word_Embeddings/datasets/emnlp_2021_full_papers_dataset/emnlp_corpus_full.txt
loading interactive_document_corpus completed.
... creating paragraph_corpus and doc_2_par indices.
DONE: paragraph corpus and doc_2_par indices successfully created.
... creating topic model.
hugg

100%|██████████████████████████████████| 9392/9392 [00:00<00:00, 2275336.64it/s]


DONE: topic model successfully created.
...creating document vectors for corpus.


100%|████████████████████████████████████| 603/603 [00:00<00:00, 1016137.13it/s]


DONE: document_vectors created successfully.
...creating paragraph vectors for corpus.
DONE: document_vectors created successfully.
... creating contextualized word vectors.


100%|█████████████████████████████████████████| 603/603 [00:27<00:00, 22.20it/s]


DONE: contextualized word vectors successfully created.
... creating document map
UMAP: dim reduction using HAVERSINE output metric.
DONE: document map successfully created.
... creating paragraph map.
UMAP: dim reduction using HAVERSINE output metric.
DONE: paragraph map successfully created.
...creating contextualized_word_map.
UMAP: dim reduction using HAVERSINE output metric.
DONE: contextualized word map successfully created.
...assigning document_vectors to documents in interactive_document_corpus
DONE: document_vectors successfully assigned to interactive document corpus.
...assigning paragraph_vectors to documents in interactive_document_corpus
...getting paragraph map from map creator object.
DONE: paragraph map successfully extracted.
... writing list to filepath: /mnt/local/2022_EMNLP_KeywordScape_Visual_Document_Exploration_using_Contextualized_Word_Embeddings/datasets/emnlp_2021_full_papers_dataset/emnlp_corpus_topic_corpus.txt
DONE: writing list to file completed.
.. quan