# Week 3 Exercise: Multi-Language Text Analyzer
## By Mougang Thomas Gasmyr from the Wakanda Team

A multi-language text analysis tool that combines **HuggingFace pipelines**, a **locally quantized LLM**,
**streaming generation**, and **tokenizer exploration** into a unified Gradio application.

### Features
- **Language Detection** -- automatically identifies the input language using a HuggingFace pipeline
- **Translation** -- translates text to a target language using a locally quantized Llama 3.2 3B model
- **Summarization** -- generates a concise English summary, streamed incrementally via TextIteratorStreamer
- **Sentiment Analysis** -- extracts sentiment using a multilingual BERT pipeline
- **Tokenizer Explorer** -- compare how different tokenizers handle multilingual text

## Setup

This notebook is designed to run on **Google Colab with a T4 GPU** (free tier).

### Required Colab Secrets
- `HF_TOKEN` -- your HuggingFace access token (needed to download the gated Llama model)

### GPU Check
Make sure you have selected **Runtime > Change runtime type > T4 GPU** before running.

In [None]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate gradio

In [None]:
# imports
import torch
import threading
import gradio as gr
from huggingface_hub import login
from google.colab import userdata
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextIteratorStreamer,
    BitsAndBytesConfig,
    pipeline,
)

In [None]:
# Model identifiers and UI configuration shared by the cells below

# Hub ids of the generation, language-detection and sentiment models.
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"
LANG_DETECT_MODEL = "papluca/xlm-roberta-base-language-detection"
SENTIMENT_MODEL = "nlptown/bert-base-multilingual-uncased-sentiment"

# Choices offered in the "Translate to" dropdown.
TARGET_LANGUAGES = ["English", "French", "Spanish", "German"]

# Display name -> Hub id of each tokenizer compared in the explorer tab.
TOKENIZER_MODELS = {
    "Llama 3.2": LLAMA,
    "BERT Multilingual": "bert-base-multilingual-cased",
    "GPT-2": "gpt2",
}

# One pangram per language, used as clickable examples in the explorer tab.
SAMPLE_TEXTS = {
    "English": "The quick brown fox jumps over the lazy dog.",
    "French": "Le renard brun rapide saute par-dessus le chien paresseux.",
    "Spanish": "El r\u00e1pido zorro marr\u00f3n salta sobre el perro perezoso.",
    "German": "Der schnelle braune Fuchs springt \u00fcber den faulen Hund.",
}

In [None]:
# Authenticate with the HuggingFace Hub using the token stored in Colab Secrets

hf_token = userdata.get("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Report which GPU (if any) this runtime exposes before any model is loaded

if not torch.cuda.is_available():
    print("WARNING: No GPU detected. Model loading and inference will fail.")
    print("On Colab: Runtime > Change runtime type > T4 GPU")
else:
    gpu_name = torch.cuda.get_device_name(0)
    # Divide by 1024**3 so the size is printed in gigabytes.
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    print(f"GPU available: {gpu_name} ({gpu_memory:.1f} GB)")

In [None]:
# Build the two small classification pipelines (language id + sentiment) on device 0

print("Loading language detection pipeline...")
lang_detector = pipeline(
    task="text-classification",
    model=LANG_DETECT_MODEL,
    top_k=3,  # the UI lists the top three language guesses
    device=0,
)

print("Loading sentiment analysis pipeline...")
sentiment_analyzer = pipeline(
    task="sentiment-analysis",
    model=SENTIMENT_MODEL,
    device=0,
)

print("Pipelines loaded successfully!")

In [None]:
# Load Llama 3.2 3B Instruct as a 4-bit (NF4) bitsandbytes-quantized model

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

print("Loading quantized Llama 3.2 3B model...")
model = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quant_config,
    device_map="auto",
)

print("Llama model loaded successfully!")

In [None]:
# Core analysis functions

# Lower-cased target-language name -> code compared against the detector's label.
LANG_NAME_TO_CODE = {
    "english": "en",
    "french": "fr",
    "spanish": "es",
    "german": "de",
}


def detect_language(text):
    """Detect the language of ``text`` with the language-detection pipeline.

    Only the first 512 characters are classified. The tokenizer is also asked
    to truncate, because 512 characters can still encode to more tokens than
    the model accepts (character count is not token count).

    Args:
        text: The text to classify.

    Returns:
        A ``(lang_code, confidence, predictions_text)`` tuple: the label and
        score of the first prediction, and a Markdown bullet list with one
        line per prediction returned by the pipeline.
    """
    results = lang_detector(text[:512], truncation=True)
    # The pipeline may wrap the predictions for a single input in an outer
    # list; unwrap it so ``results`` is always a flat list of dicts.
    if isinstance(results[0], list):
        results = results[0]

    best = results[0]
    lang_code = best["label"]
    confidence = best["score"]

    predictions_text = "\n".join(
        f"  - **{r['label']}**: {r['score']:.1%}" for r in results
    )
    return lang_code, confidence, predictions_text


def analyze_sentiment(text):
    """Rate the sentiment of ``text`` with the multilingual sentiment pipeline.

    Only the first 512 characters are scored. The tokenizer is also asked to
    truncate, because 512 characters can still encode to more tokens than the
    model accepts (character count is not token count).

    Args:
        text: The text to score.

    Returns:
        A ``(sentiment_label, star_count, score)`` tuple: a human-readable
        label, the integer parsed from the start of the pipeline's label,
        and the pipeline's score for that label.
    """
    result = sentiment_analyzer(text[:512], truncation=True)[0]
    label = result["label"]
    score = result["score"]

    # The label is expected to begin with the star count (the code parses the
    # first whitespace-separated field as an int).
    star_count = int(label.split()[0])
    sentiment_map = {
        1: "Very Negative",
        2: "Negative",
        3: "Neutral",
        4: "Positive",
        5: "Very Positive",
    }
    # Fall back to the raw label for any star count outside 1-5.
    sentiment_label = sentiment_map.get(star_count, label)
    return sentiment_label, star_count, score


def build_llama_messages(system_content, user_content):
    """Build a system + user chat prompt and tokenize it for generation.

    Args:
        system_content: Text of the system message.
        user_content: Text of the user message.

    Returns:
        An ``(input_ids, attention_mask)`` tuple of tensors placed on the
        model's device. The mask is all ones because a single, unpadded
        prompt is encoded.
    """
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]
    encoded = tokenizer.apply_chat_template(
        messages, return_tensors="pt", add_generation_prompt=True
    )
    # apply_chat_template may return either a bare tensor or a mapping that
    # carries the ids under "input_ids"; accept both.
    input_ids = encoded["input_ids"] if hasattr(encoded, "input_ids") else encoded
    # Follow the device the model was actually loaded on instead of assuming "cuda".
    input_ids = input_ids.to(model.device)
    attention_mask = torch.ones_like(input_ids)
    return input_ids, attention_mask


def translate_text(text, source_lang, target_lang):
    """Translate ``text`` with the local Llama model in one blocking call.

    Args:
        text: The text to translate; sent as the user message.
        source_lang: Source language as it should appear in the prompt.
        target_lang: Target language as it should appear in the prompt.

    Returns:
        The generated continuation, decoded without special tokens and
        stripped of surrounding whitespace.
    """
    prompt_ids, prompt_mask = build_llama_messages(
        f"You are a professional translator. Translate the following text from "
        f"{source_lang} to {target_lang}. Output ONLY the translation, nothing else.",
        text,
    )
    generated = model.generate(
        input_ids=prompt_ids,
        attention_mask=prompt_mask,
        max_new_tokens=1024,
        pad_token_id=tokenizer.eos_token_id,
    )
    # generate() returns prompt + continuation; keep only the new tokens.
    prompt_len = prompt_ids.shape[-1]
    continuation = generated[0][prompt_len:]
    decoded = tokenizer.decode(continuation, skip_special_tokens=True)
    return decoded.strip()


def summarize_text_streaming(text, source_lang):
    """Stream an English summary of ``text`` from the local Llama model.

    Generation runs in a background thread that feeds a
    ``TextIteratorStreamer``; this generator yields each decoded chunk as it
    arrives.

    Args:
        text: The text to summarize.
        source_lang: Language of ``text`` as it should appear in the prompt.

    Yields:
        Successive pieces of the summary text.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once the stream has been drained.
    """
    system_message = (
        "You are a skilled summarizer. Provide a concise summary of the following text. "
        "The summary should capture the key points in 2-4 sentences. "
        "Write the summary in English."
    )
    user_content = (
        f"The following text is in {source_lang}. Please summarize it in English:\n\n{text}"
    )
    input_ids, attention_mask = build_llama_messages(system_message, user_content)

    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "max_new_tokens": 512,
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer,
    }
    errors = []

    def _run_generation():
        """Run generate() and make sure a failure still terminates the stream."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # thread boundary: hand the error to the consumer
            errors.append(exc)
            # generate() did not finish, so it never signalled end-of-stream;
            # without this the consuming loop below would block forever.
            streamer.end()

    thread = threading.Thread(target=_run_generation)
    thread.start()

    for new_text in streamer:
        yield new_text

    thread.join()
    if errors:
        raise errors[0]

In [None]:
# Main analysis orchestrator -- connects to Gradio UI

def analyze_text(text, target_language):
    """Run detection, translation, summary and sentiment, yielding as it goes.

    Each yield is a 5-tuple ``(language, translation, summary, sentiment,
    status)`` of Markdown strings, so the UI can fill in results step by
    step. Blank input yields five empty strings once and stops.
    """
    if not (text and text.strip()):
        yield ("",) * 5
        return

    # Step 1: language detection (HF pipeline)
    lang_code, confidence, predictions_text = detect_language(text)
    lang_display = "\n\n".join(
        [
            f"**Detected:** {lang_code} ({confidence:.1%})",
            f"**Top predictions:**\n{predictions_text}",
        ]
    )
    yield lang_display, "Translating...", "", "", ""

    # Step 2: translation (Llama) -- skipped when the text is already in the
    # target language; unknown target names are treated as English.
    target_code = LANG_NAME_TO_CODE.get(target_language.lower(), "en")
    if lang_code.lower() == target_code:
        translated = text
    else:
        translated = translate_text(text, lang_code, target_language)
    yield lang_display, translated, "Generating summary...", "", ""

    # Step 3: streamed summary (Llama + TextIteratorStreamer); re-yield the
    # text accumulated so far after every chunk.
    pieces = []
    for chunk in summarize_text_streaming(text, lang_code):
        pieces.append(chunk)
        yield lang_display, translated, "".join(pieces), "", ""
    summary = "".join(pieces)

    # Step 4: sentiment (HF pipeline), rendered as filled/empty stars out of 5
    sentiment_label, star_count, sentiment_score = analyze_sentiment(text)
    stars = "\u2605" * star_count + "\u2606" * (5 - star_count)
    sentiment_display = "\n\n".join(
        [
            f"**Sentiment:** {sentiment_label}",
            f"**Rating:** {stars} ({star_count}/5)",
            f"**Confidence:** {sentiment_score:.1%}",
        ]
    )
    yield lang_display, translated, summary, sentiment_display, "Analysis complete!"

In [None]:
# Tokenizer Explorer -- compare how different tokenizers handle multilingual text

def explore_tokenizers(text):
    """Compare how each tokenizer in ``TOKENIZER_MODELS`` splits ``text``.

    Every tokenizer is loaded and run once; its statistics feed both the
    per-tokenizer section and the summary table. A tokenizer that fails to
    load or encode gets an error entry and is left out of the table.

    Args:
        text: The text to tokenize.

    Returns:
        A Markdown report, or a short prompt when ``text`` is empty or blank.
    """
    if not text or not text.strip():
        return "Please enter some text to analyze."

    results = [f"## Tokenizer Comparison\n> {text}\n"]
    table_rows = []

    for name, model_id in TOKENIZER_MODELS.items():
        try:
            tok = AutoTokenizer.from_pretrained(model_id)
            token_ids = tok.encode(text)
            tokens = tok.convert_ids_to_tokens(token_ids)
            decoded_tokens = [tok.decode([tid]) for tid in token_ids]
            vocab_size = tok.vocab_size
        except Exception as e:
            # Best effort: report the failure and keep comparing the others.
            results.append(f"### {name}\n- Error loading tokenizer: {e}\n")
            continue

        results.extend(
            [
                f"### {name} (`{model_id}`)",
                f"- **Vocabulary size:** {vocab_size:,}",
                f"- **Token count:** {len(token_ids)}",
                f"- **Tokens:** `{tokens}`",
                f"- **Decoded:** {' | '.join(decoded_tokens)}",
                "",
            ]
        )
        table_rows.append(f"| {name} | {len(token_ids)} | {vocab_size:,} |")

    # Efficiency comparison table, built from the stats gathered above
    results.append("### Efficiency Comparison")
    results.append("| Tokenizer | Token Count | Vocab Size |")
    results.append("|-----------|------------|------------|")
    results.extend(table_rows)

    return "\n".join(results)

In [None]:
# Gradio Blocks UI: a two-tab app (text analyzer + tokenizer explorer).
# Components are laid out in the order and nesting in which they are created.

with gr.Blocks(title="Multi-Language Text Analyzer") as analyzer_app:
    gr.Markdown(
        "# Multi-Language Text Analyzer\n"
        "Analyze text in any language: detect the language, translate it, "
        "summarize it with streaming, and extract sentiment.\n\n"
        "*Powered by HuggingFace pipelines + locally quantized Llama 3.2 3B*"
    )

    with gr.Tabs():
        # Tab 1: Text Analyzer -- inputs on the left, five result panes on the right
        with gr.TabItem("Text Analyzer"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="Input Text (any language)",
                        placeholder="Paste or type text in any language...",
                        lines=8,
                    )
                    target_lang = gr.Dropdown(
                        choices=TARGET_LANGUAGES,
                        value="English",
                        label="Translate to",
                    )
                    analyze_btn = gr.Button("Analyze", variant="primary")

                with gr.Column(scale=3):
                    # One Markdown pane per element of the tuples yielded by analyze_text
                    lang_output = gr.Markdown(label="Language Detection")
                    translation_output = gr.Markdown(label="Translation")
                    summary_output = gr.Markdown(label="Summary (Streaming)")
                    sentiment_output = gr.Markdown(label="Sentiment Analysis")
                    status_output = gr.Markdown(label="Status")

            # Clickable examples: each row is [input text, target language]
            gr.Examples(
                examples=[
                    ["Bonjour le monde! Le Cameroon c'est un pays africain avec une population de 24 millions d'habitants.", "English"],
                    ["Die K\u00fcnstliche Intelligenz ver\u00e4ndert unsere Welt in einer Geschwindigkeit, die wir uns kaum vorstellen k\u00f6nnen. Sie beeinflusst Medizin, Bildung und Wirtschaft.", "English"],
                    ["La inteligencia artificial est\u00e1 transformando todos los aspectos de nuestra vida cotidiana, desde la salud hasta la educaci\u00f3n.", "French"],
                ],
                inputs=[text_input, target_lang],
            )

            # analyze_text is a generator; the outputs list matches the order
            # of the 5-tuples it yields.
            analyze_btn.click(
                fn=analyze_text,
                inputs=[text_input, target_lang],
                outputs=[lang_output, translation_output, summary_output, sentiment_output, status_output],
            )

        # Tab 2: Tokenizer Explorer -- text box on the left, Markdown report on the right
        with gr.TabItem("Tokenizer Explorer"):
            gr.Markdown(
                "## Compare Tokenizers Across Languages\n\n"
                "Notice how token counts vary across languages!"
            )
            with gr.Row():
                with gr.Column():
                    tok_input = gr.Textbox(
                        label="Text to tokenize",
                        placeholder="Enter text in any language...",
                        lines=3,
                    )
                    tok_btn = gr.Button("Compare Tokenizers", variant="primary")

                    # One example row per sample sentence in SAMPLE_TEXTS
                    gr.Examples(
                        examples=[[v] for v in SAMPLE_TEXTS.values()],
                        inputs=[tok_input],
                    )

                with gr.Column():
                    tok_output = gr.Markdown(label="Tokenizer Comparison")

            # explore_tokenizers returns a single Markdown string
            tok_btn.click(
                fn=explore_tokenizers,
                inputs=[tok_input],
                outputs=[tok_output],
            )

print("UI built. Ready to launch.")

In [None]:
# Start the app. share=True asks Gradio for a public share link;
# debug=True keeps the cell attached so errors show up in its output.
analyzer_app.launch(share=True, debug=True)