# Imports
Please note that some imports are somewhat hacky to get to work and require some tinkering.

In [51]:
import os # Lets the DeepL settings be overridden through environment variables
from typing import Optional

import requests # Handles translation using the DeepL API
from transformers import pipeline # Handles summarization

# Global variables and constants
The DeepL API key is used for the translation API. If the API key becomes invalid for some reason, you can generate your own API key following the instructions at https://www.deepl.com/docs-api.

In [52]:
# At the time of writing (20.05.2023), the bundled API key still had about 450k unused characters.
# Please keep this in mind and use the provided key responsibly.
# Both values can be overridden through environment variables of the same name, so a personal key
# (or a different DeepL endpoint) can be used without editing this cell or committing a secret.
DEEPL_API_KEY = os.environ.get('DEEPL_API_KEY', '898523e2-0911-71ea-8d45-3e60991d2130:fx')
DEEPL_BASE_URL = os.environ.get('DEEPL_BASE_URL', 'https://api-free.deepl.com')

# A short description of the summarization logic
Originally, the plan was to simply summarize the provided text natively in the language it was provided. There are plenty of examples of this available, such as the open-source Reddit bot "autotldr", which has a similar function.

Problems arose when it was determined that many slideshows contain only bullet points, which isn't compatible with the approach other similar projects use. Those projects follow an extractive pattern: they pick the most important sentences out of the provided text without editing them. That approach falls apart with our input, because bullet points are rarely complete sentences.

To bypass this problem, the `SummarizationPipeline` from the Hugging Face `transformers` library is used. By translating the source text to English and then summarizing it, we can bypass many of the issues that arise from the traditional summarization methods. This also makes it trivial to add additional languages; in fact, by default all DeepL-supported languages should theoretically be summarized properly. Keep in mind that this is untested functionality and no guarantees are provided.

In [53]:
def translate_text(text: str, target_lang: str ='EN-GB', source_lang: Optional[str] = None) -> tuple[str, str]:
    """Translate `text` with the DeepL API.

    Args:
        text: The text to translate.
        target_lang: DeepL language code to translate into.
        source_lang: DeepL language code of `text`. When omitted, the field is not sent and
            the language reported back by DeepL is returned instead.

    Returns:
        A tuple of (source_lang, translated_text), where source_lang is the
        `detected_source_language` field of the DeepL response.

    Raises:
        requests.HTTPError: If DeepL answers with an error status (invalid key, quota used up, ...).
    """
    # Build the URL for the translation service
    url = f"{DEEPL_BASE_URL}/v2/translate"
    # Build the payload
    payload = { 'text': [text], 'target_lang': target_lang }
    # In case a manual source language is set, we should pass it along. Otherwise, DeepL will handle it for us.
    # The key must be the literal field name 'source_lang'; using the variable as the key would send
    # a bogus field such as {'EN': 'EN'} and the source language would never reach DeepL.
    if source_lang is not None:
        payload['source_lang'] = source_lang
    # Headers
    headers = { 'Authorization': f"DeepL-Auth-Key {DEEPL_API_KEY}" }
    # Send the request. The timeout keeps the call from hanging forever if the service stalls.
    response = requests.post(url, json=payload, headers=headers, timeout=30)
    # Fail with a clear HTTP error instead of an opaque KeyError/JSON error further down
    response.raise_for_status()
    # See the DeepL docs for the exact JSON format
    translation = response.json()['translations'][0]
    return translation['detected_source_language'], translation['text']

In [54]:
# Lazily created summarization pipeline, shared between calls (see `_get_summarizer`)
_summarizer = None


def _get_summarizer():
    """Return the shared summarization pipeline, building it on first use."""
    global _summarizer
    if _summarizer is None:
        # Loading the model is expensive, so it is done once instead of on every summary
        _summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    return _summarizer


def summarize_text(text: str, language: Optional[str] = None) -> str:
    """Return a summary of `text`, written in the language of the original text.

    The text is translated to English, summarized, and the summary is translated back to the
    source language reported by the first translation.

    Args:
        text: The text to summarize.
        language: Language code of `text`, if known. It is forwarded to `translate_text` as
            `source_lang` for a more accurate translation.

    Returns:
        The summary. Blank input is returned unchanged without calling any service.
    """
    # Nothing to translate or summarize; avoid spending API quota on blank input
    if not text.strip():
        return text
    # Get the translated text with its corresponding language
    source_lang, translated_text = translate_text(text, source_lang=language)
    # Approximate the token count by the word count. The max length will be 50% of that.
    # This is a simple solution because we don't need to be 100% accurate.
    # `split()` is used so that newlines (bullet lists) and repeated spaces are counted correctly.
    min_length = 10
    word_count = len(translated_text.split())
    # Never let max_length drop below min_length, which would be an unsatisfiable constraint for short texts
    max_length = max(min_length, word_count // 2)
    # Summarize the text. Truncation keeps inputs longer than the model's limit from failing.
    summarizer = _get_summarizer()
    summarized_text_en = summarizer(
        translated_text, min_length=min_length, max_length=max_length, truncation=True
    )[0]["summary_text"]
    # Get back the original language
    _, returnable_text = translate_text(summarized_text_en, target_lang=source_lang, source_lang='EN')
    return returnable_text
