In [3]:
from transformers import MarianMTModel, MarianTokenizer, pipeline, MBartForConditionalGeneration, MBart50TokenizerFast
import gradio as gr
from PIL import Image
import pytesseract
import math
from google.colab import drive
from google.colab import files

# Install necessary libraries if not already installed
!pip install -q transformers gradio pytesseract
!sudo apt update -q
!sudo apt install -q tesseract-ocr

# --- Model Loading ---
# Each load is wrapped separately so one failure does not stop the others from
# being attempted. On failure the name is bound to None (rather than left
# undefined) so later code sees a defined value instead of raising NameError.

# Load translation model and tokenizer
translator_model_name = "facebook/mbart-large-50-many-to-many-mmt"
try:
    translator_tokenizer = MBart50TokenizerFast.from_pretrained(translator_model_name)
except Exception as e:
    print(f"Error loading MBart tokenizer: {e}")
    translator_tokenizer = None  # Handle case where tokenizer fails to load

try:
    translator_model = MBartForConditionalGeneration.from_pretrained(translator_model_name)
except Exception as e:
    print(f"Error loading MBart model: {e}")
    translator_model = None  # Handle case where model fails to load

# Load summarization pipeline
summarizer_model_name = "t5-base"
try:
    summarizer = pipeline("summarization", model=summarizer_model_name, tokenizer=summarizer_model_name)
except Exception as e:
    print(f"Error loading summarization model {summarizer_model_name}: {e}")
    summarizer = None  # Handle case where summarizer fails to load


# --- OCR Function ---
def perform_ocr_on_image(image):
    """Run Tesseract OCR on an image and return the recognized text.

    Args:
        image: The image to read (the UI passes a PIL image), or None.

    Returns:
        The recognized text with surrounding whitespace stripped, or an
        empty string when ``image`` is None or OCR raises.
    """
    ocr_output = ""
    if image is not None:
        try:
            ocr_output = pytesseract.image_to_string(image).strip()
        except Exception as ocr_error:
            # Best-effort: report the problem and fall back to "no text".
            print(f"Error during OCR: {ocr_error}")
    return ocr_output


# --- Translation Function ---
def translate_text(text, source_language_code, target_language_code, translator_tokenizer, translator_model):
    """Translate ``text`` between two languages with an MBart tokenizer/model pair.

    Args:
        text: Text to translate; a falsy value yields "".
        source_language_code: Language code assigned to the tokenizer's ``src_lang``.
        target_language_code: Key into the tokenizer's ``lang_code_to_id`` used
            as the forced first generated token.
        translator_tokenizer: Tokenizer used to encode the input and decode the output.
        translator_model: Model whose ``generate`` produces the output tokens.

    Returns:
        The first decoded translation, or ``"Translation failed: <error>"``
        when any step raises.
    """
    if not text:
        return ""
    try:
        translator_tokenizer.src_lang = source_language_code
        model_inputs = translator_tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024,
        )

        generate = translator_model.generate
        generation_options = {
            # Forces decoding to start with the target-language token.
            "forced_bos_token_id": translator_tokenizer.lang_code_to_id[target_language_code],
            "max_length": 512,
            "num_beams": 4,
            "early_stopping": True,
        }
        output_ids = generate(**model_inputs, **generation_options)

        decoded_batch = translator_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        return decoded_batch[0]
    except Exception as e:
        print(f"Error during translation: {e}")
        return f"Translation failed: {e}"


# --- Summarization Function ---
def summarize_text(text, summarizer):
    """Summarize ``text`` using the supplied summarization callable.

    The characters ``! , . ; : ( ) [ ] * ' "`` are removed and surrounding
    whitespace is trimmed; only the first 1024 characters of the result are
    handed to ``summarizer``.

    Args:
        text: Text to summarize.
        summarizer: Callable returning a list of dicts with a
            ``'summary_text'`` key, or None if it could not be loaded.

    Returns:
        The summary text, or an explanatory message when there is nothing to
        summarize, no summarizer, or the summarizer call raises.
    """
    if summarizer is None or not text:
        return "Summarization model not loaded or no text to summarize."

    char_limit = 1024
    try:
        # Drop unwanted characters that might appear after translation,
        # all in a single pass over the text.
        strip_table = str.maketrans("", "", "!,.;:()[]*'\"")
        stripped = text.translate(strip_table).strip()

        if stripped:
            # max_length / min_length can be tuned to change summary size.
            results = summarizer(
                stripped[:char_limit],
                max_length=200,
                min_length=30,
                do_sample=False,
                truncation=True,
            )
            return results[0]['summary_text']

        return "Translated text is empty or only contains punctuation after cleaning."
    except Exception as e:
        print(f"Error during summarization: {e}")
        return f"Summarization failed: {e}"

# --- Combined Pipeline: optional OCR, then translation, then summarization ---
def translate_and_summarize(article_text, article_image, source_language, target_language, translator_tokenizer, translator_model, summarizer):
    """Translate an article and then summarize the translation.

    The article is taken from ``article_text`` when it contains non-blank
    text; otherwise text is extracted from ``article_image`` via OCR.

    Args:
        article_text: Article text typed by the user (may be empty or None).
        article_image: Image of the article to OCR (may be None).
        source_language: Display name of the source language, e.g. "English".
        target_language: Display name of the target language.
        translator_tokenizer: Tokenizer forwarded to ``translate_text``.
        translator_model: Model forwarded to ``translate_text``.
        summarizer: Summarization callable forwarded to ``summarize_text``.

    Returns:
        A ``(translated_text, summary_text)`` tuple of strings. When a step
        cannot be completed, the strings hold user-facing messages instead.
    """
    # Whitespace-only text counts as "no text", so a stray space in the
    # textbox neither hides an uploaded image nor gets sent to the model.
    if article_text and article_text.strip():
        processed_text = article_text
    elif article_image is not None:
        processed_text = perform_ocr_on_image(article_image)
        if not processed_text:
            print("OCR failed to extract text from image.")
            return "Could not extract text from the image.", "Summarization requires extracted text."
    else:
        return "Please provide article text or an image.", "Summarization requires input."

    # Map dropdown language names to MBart language codes
    lang_codes = {
        "English": "en_XX",
        "French": "fr_XX",
        "Spanish": "es_XX",
        "German": "de_DE",
        "Italian": "it_IT",
        "Russian": "ru_RU",
        "Dutch": "nl_XX",
    }

    source_lang_code = lang_codes.get(source_language)
    target_lang_code = lang_codes.get(target_language)

    if not source_lang_code or not target_lang_code:
        print(f"Invalid source ({source_language}) or target ({target_language}) language selected.")
        return "Invalid source or target language selected.", "Cannot summarize if translation failed."

    print(f"Processing text from {source_language} ({source_lang_code}) to {target_language} ({target_lang_code})")
    print(f"Original Text (first 100 chars): {processed_text[:100]}...")

    translated_article = translate_text(processed_text, source_lang_code, target_lang_code, translator_tokenizer, translator_model)

    print("Translated Text for Summarization (first 100 chars):")
    print(translated_article[:100])
    # translate_text signals failure with a "Translation failed: ..." message.
    # Match it as a prefix so an article that merely mentions the phrase is
    # not mistaken for an error.
    if translated_article.startswith("Translation failed:"):
        return translated_article, "Cannot summarize due to translation error."

    summarized_article = summarize_text(translated_article, summarizer)
    print("Summarized Text (first 100 chars):")
    print(summarized_article[:100])

    # Summarization problems are reported as message text in the second slot,
    # so success and failure return the same tuple shape.
    return translated_article, summarized_article

# NOTE: the functions above take the tokenizer, model and summarizer as explicit arguments; the UI below supplies the module-level instances.

# --- Gradio User Interface ---

# Input widgets: free-form article text and an image delivered as a PIL image.
article_input = gr.Textbox(label="Enter Article Text")
article_image_input = gr.Image(label="Upload Article Image (OCR Enabled)", type="pil")

# Languages offered by both dropdowns.
language_choices = ["English", "French", "Spanish", "German", "Italian", "Russian", "Dutch"]
source_lang_input = gr.Dropdown(
    choices=language_choices,
    value="English",
    label="Source Language (Ensure model supports)",
)
target_lang_input = gr.Dropdown(
    choices=language_choices,
    value="English",
    label="Target Language (Ensure model supports)",
)

# Output widgets: one textbox per element of the pipeline's result tuple.
translated_output = gr.Textbox(label="Translated Article")
summarized_output = gr.Textbox(label="Summarized Article")

# Wire the widgets to the pipeline. The lambda adds the loaded tokenizer,
# model and summarizer to the four values coming from the UI.
iface = gr.Interface(
    title="News Translation and Summarization",
    description="Enter article text or upload an image for translation and summarization. Supports multiple languages.",
    inputs=[article_input, article_image_input, source_lang_input, target_lang_input],
    outputs=[translated_output, summarized_output],
    fn=lambda text, img, src_lang, tgt_lang: translate_and_summarize(
        text, img, src_lang, tgt_lang, translator_tokenizer, translator_model, summarizer
    ),
    analytics_enabled=False,
)

# --- Main Execution Block ---
# Runs only when this code is executed as the main module (e.g. a notebook
# cell), not when it is imported.
if __name__ == "__main__":
    # Mount Google Drive at /content/drive; force_remount=True is passed so an
    # existing mount is presumably refreshed rather than reused.
    drive.mount('/content/drive', force_remount=True) # Mount drive
    !pwd # Print current directory
    # Start the Gradio app; share=True requests a public share URL.
    iface.launch(share=True)


Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists...
Building dependency tree...
Reading state information...
35 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelea

Device set to use cpu


Mounted at /content/drive
/content
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d164cda647f4b196ac.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
