# Publication pre-processing Helpers
This notebook provides helpers for pre-processing publications that should be added to sci-graph. Use the provided functions and classes in whatever way fits your data and use case.

In [None]:
import json
import re
import sys

import requests

## Utilities
These functions provide general utility functionality, such as showing a progress bar.

In [None]:
def show_progress(current_item, total_items, prefix="", size=60, out=sys.stdout):
    """Print a single-line text progress bar.

    The line ends with a carriage return instead of a newline, so the next
    call overwrites the previous bar.

    Args:
        current_item: Number of items processed so far.
        total_items: Total number of items. ``0`` is rendered as a full bar,
            since there is nothing left to do.
        prefix: Text printed in front of the bar.
        size: Width of the bar in characters.
        out: Stream the bar is written to.
    """
    if total_items == 0:
        # Nothing to process: avoid the division by zero and show "done".
        filled = size
    else:
        filled = int(size * current_item / total_items)
    # Keep the bar exactly `size` characters wide even if the caller passes a
    # counter outside [0, total_items].
    filled = max(0, min(size, filled))
    bar = '█' * filled + '.' * (size - filled)
    print(f"{prefix}[{bar}] {current_item}/{total_items}", end='\r', file=out, flush=True)

## File Handling
These helpers load and save publications as JSON files.

In [None]:
def load_file_to_json(filepath: str) -> dict:
    """Read the JSON document stored at ``filepath`` and return the parsed value.

    The file is decoded as UTF-8 rather than with the platform default
    encoding, so non-ASCII characters (e.g. German umlauts in keywords) are
    read the same way on every system.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value. The
        annotation says ``dict``, but a top-level JSON array is returned as a
        ``list``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filepath, encoding='utf-8') as input_file:
        return json.load(input_file)

def save_object_to_json_file(filepath: str, object_to_save: object) -> None:
    """Serialise ``object_to_save`` as JSON and write it to ``filepath``.

    An existing file at ``filepath`` is overwritten.

    Args:
        filepath: Path of the file to write.
        object_to_save: Any value ``json.dumps`` can serialise.

    Raises:
        TypeError: If ``object_to_save`` is not JSON serialisable.
        OSError: If the file cannot be opened for writing.
    """
    # Serialise before opening the file so a serialisation error does not
    # truncate an existing file.
    json_object = json.dumps(object_to_save)
    with open(filepath, 'w', encoding='utf-8') as output_file:
        output_file.write(json_object)

## Clean up
These functions help in cleaning up data associated with publications.

In [None]:
# Characters allowed in a keyword: ASCII letters and digits, hyphen,
# underscore, whitespace, German umlauts / eszett and the apostrophe.
_VALID_KEYWORD_PATTERN = re.compile(r"^[a-zA-Z0-9\-_\säöüÄÖÜß']+$")
# Keywords containing more digits than this are discarded.
_MAX_DIGITS_PER_KEYWORD = 3


def clean_up_keywords(publication):
    """Split and filter the keywords of a publication in place.

    Keywords containing ``'; '`` are split into their parts. Afterwards every
    keyword that contains a character outside the allowed set, or more than
    three digits, is dropped and reported on stdout.

    The list stored under ``publication['keywords']`` is updated in place.
    Keywords that did not need splitting keep their relative order and come
    first; the parts of split keywords follow in the order encountered.

    Args:
        publication: Dictionary with a ``'keywords'`` entry holding a list of
            strings.

    Raises:
        KeyError: If ``publication`` has no ``'keywords'`` entry.
    """
    keywords = publication['keywords']

    # First pass: separate plain keywords from "a; b; c" style entries. New
    # lists are built so the list being iterated is never mutated.
    plain_keywords = []
    split_keywords = []
    for keyword in keywords:
        if '; ' in keyword:
            split_keywords.extend(keyword.split('; '))
        else:
            plain_keywords.append(keyword)

    # Second pass: keep only keywords that pass both validity checks.
    cleaned_keywords = []
    for keyword in plain_keywords + split_keywords:
        digit_count = sum(c.isdigit() for c in keyword)
        if digit_count > _MAX_DIGITS_PER_KEYWORD or not _VALID_KEYWORD_PATTERN.match(keyword):
            print('Not alphanumeric: ' + keyword)
            continue
        cleaned_keywords.append(keyword)

    # Slice assignment keeps the identity of the caller's list.
    keywords[:] = cleaned_keywords

## Translation
These functions can be used to translate keywords using the [DeepL API](https://www.deepl.com/docs-api). To reduce the number of API requests, a pair of dictionaries is used to cache results that have already been fetched.

In [None]:
# Install dependency
!pip install deepl

In [None]:
import deepl

class KeywordTransformer:
    """Normalise publication keywords and translate them between English and German.

    Translations are looked up in caller-supplied dictionaries first; the
    translator is only called when a keyword is not found there. Two counters
    record how often each path was taken.
    """

    def __init__(self):
        # Number of lookups answered from the supplied dictionaries.
        self.cached_translations = 0
        # Number of calls made to ``translator.translate_text``.
        self.conducted_translations = 0

    def transform_and_translate_keywords(self, publication, en_de_dict, de_en_dict, translator):
        """Lower-case, de-duplicate and translate the keywords of a publication.

        Args:
            publication: A dictionary of the form::

                {
                    ...
                    'language': 'en',
                    'keywords': ['keyword1', 'keyword2', ...],
                    ...
                }

                Any language other than ``'en'`` is treated as German.
            en_de_dict: Dictionary that maps English keywords to German
                translations. Updated in place with new translations.
            de_en_dict: Dictionary that maps German keywords to English
                translations. Updated in place with new translations.
            translator: A DeepL translator object, instantiated by calling
                ``deepl.Translator(authorization_key)``.

        Returns:
            A tuple ``(transformed_keywords, en_de_dict, de_en_dict)`` where
            ``transformed_keywords`` is a list of dictionaries of the form
            ``{'values': [{'value': <english>, 'language': 'en'},
            {'value': <german>, 'language': 'de'}]}``.
        """
        transformed_keywords = []
        # Every English and German value emitted so far, for O(1) duplicate checks.
        seen_values = set()
        for keyword in publication['keywords']:
            keyword = keyword.lower()
            if len(keyword) <= 1:
                continue
            # Skip duplicates before translating to save API requests.
            if keyword in seen_values:
                continue
            if publication['language'] == 'en':
                translate = self.translate_english_text
            else:
                translate = self.translate_german_text
            keyword_en, keyword_de, en_de_dict, de_en_dict = translate(
                keyword, en_de_dict, de_en_dict, translator
            )
            seen_values.update((keyword_en, keyword_de))
            transformed_keywords.append({
                'values': [{
                    'value': keyword_en,
                    'language': 'en'
                },
                {
                    'value': keyword_de,
                    'language': 'de'
                }]
            })
        return transformed_keywords, en_de_dict, de_en_dict

    def translate_english_text(self, text, en_de_dict, de_en_dict, translator, *, _allow_fallback=True):
        """Translate ``text`` from English to German, using the cache if possible.

        If the translator reports that ``text`` is actually German, the call
        is handed over to ``translate_german_text`` once.

        Args:
            text: The (presumably English) text to translate.
            en_de_dict: English-to-German cache; updated in place.
            de_en_dict: German-to-English cache; updated in place.
            translator: Object offering ``translate_text(text, target_lang=...)``
                whose result has ``text`` and ``detected_source_lang``.
            _allow_fallback: Internal flag. ``False`` when this call is itself
                a fallback, so the two methods cannot hand the text back and
                forth forever.

        Returns:
            A tuple ``(english, german, en_de_dict, de_en_dict)``.
        """
        try:
            translation = en_de_dict[text]
        except KeyError:
            pass
        else:
            self.cached_translations += 1
            return text, translation, en_de_dict, de_en_dict

        self.conducted_translations += 1
        translation_result = translator.translate_text(text, target_lang='DE')
        if _allow_fallback and translation_result.detected_source_lang == 'DE':
            return self.translate_german_text(
                text, en_de_dict, de_en_dict, translator, _allow_fallback=False
            )
        translation = translation_result.text.lower()
        try:
            en_de_dict[text] = translation
            de_en_dict[translation] = text
        except Exception:
            # Caching is best effort: the translation is still returned.
            print(f'Failed to insert {text}:{translation} into the dictionary.')
        return text, translation, en_de_dict, de_en_dict

    def translate_german_text(self, text, en_de_dict, de_en_dict, translator, *, _allow_fallback=True):
        """Translate ``text`` from German to English, using the cache if possible.

        If the translator reports that ``text`` is actually English, the call
        is handed over to ``translate_english_text`` once.

        Args:
            text: The (presumably German) text to translate.
            en_de_dict: English-to-German cache; updated in place.
            de_en_dict: German-to-English cache; updated in place.
            translator: Object offering ``translate_text(text, target_lang=...)``
                whose result has ``text`` and ``detected_source_lang``.
            _allow_fallback: Internal flag. ``False`` when this call is itself
                a fallback, so the two methods cannot hand the text back and
                forth forever.

        Returns:
            A tuple ``(english, german, en_de_dict, de_en_dict)``.
        """
        try:
            translation = de_en_dict[text]
        except KeyError:
            pass
        else:
            self.cached_translations += 1
            return translation, text, en_de_dict, de_en_dict

        self.conducted_translations += 1
        translation_result = translator.translate_text(text, target_lang='EN-US')
        if _allow_fallback and translation_result.detected_source_lang == 'EN':
            return self.translate_english_text(
                text, en_de_dict, de_en_dict, translator, _allow_fallback=False
            )
        translation = translation_result.text.lower()
        try:
            de_en_dict[text] = translation
            en_de_dict[translation] = text
        except Exception:
            # Caching is best effort: the translation is still returned.
            print(f'Failed to insert {translation}:{text} into the dictionary.')
        return translation, text, en_de_dict, de_en_dict