# Compare the same Hugging Face pipeline task with different models
## list of tasks
- sentiment-analysis
- ner
- question-answering
- summarization
- audio-classification
- automatic-speech-recognition
- image-classification
- object-detection

In [None]:
# Install these dependencies (one time only)
#%pip install --upgrade transformers accelerate torch torchvision torchaudio librosa pillow

# To pre-download the models from https://hf-mirror.com (one time only):
# - 0. install wget and jq for your git-bash
# - 1. go to the preparation folder, e.g.: C:\data\github\aiapp\preparation
# - 2. change download_to_dir in hfd_models.sh if the default value (~/hfd) is not suitable for you
# - 3. start git-bash from this preparation folder
# - 4. run this: export HF_ENDPOINT="https://hf-mirror.com" && sh hfd_models_list.sh

from transformers import pipeline
import time
import os
from PIL import Image
import requests
from io import BytesIO

# Hugging Face configuration: report the endpoint currently in effect, and
# optionally uncomment the lines below to point at the hf-mirror and a custom
# cache location.
# NOTE(review): these overrides sit after the `transformers` import; the
# library may already have read the environment by then — confirm they still
# take effect when set here.
print(f"env for: HF_ENDPOINT={os.getenv('HF_ENDPOINT', 'not set')}")
#os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
#os.environ["HF_HOME"] = "/hfd"
#os.environ['HF_HUB_CACHE'] = "/hfd/hub"

# Forget any value left over from a previous run of this cell, so commenting
# out the assignment below takes effect without restarting the kernel.
globals().pop('local_model_path_dir', None)

# Uncomment and set the correct path to use manually pre-downloaded models.
#local_model_path_dir=r"C:\Users\bill\hfd"

# Either start JupyterLab with the project root as --notebook-dir
# (e.g. --notebook-dir=C:/data/github/aiapp) or set PROJECT_HOME to the
# project root below.
#os.environ["PROJECT_HOME"] = "C:/data/github/aiapp"
print(f"env for: PROJECT_HOME={os.getenv('PROJECT_HOME', 'not set')}")

# Sample audio/image inputs live under the project root; fall back to the
# current working directory when PROJECT_HOME is not set.
data_dir = os.path.join(
    os.getenv("PROJECT_HOME", "."),
    "homeworks/week1/transformers/data",
)
print(f"data_dir={data_dir}")

# --------------------------
# 1. Define Tasks & Models
# --------------------------
# Maps each pipeline task name to the models to compare ("models") and the
# inputs fed to every one of those models ("inputs").
tasks = {
    # Text classification: plain sentences.
    "sentiment-analysis": {
        "models": [
            "tabularisai/multilingual-sentiment-analysis",
            "ProsusAI/finbert",
            "finiteautomata/bertweet-base-sentiment-analysis",
            "boltuix/bert-emotion",
            "cardiffnlp/twitter-roberta-base-sentiment-latest",
            "distilbert/distilbert-base-uncased-finetuned-sst-2-english",
        ],
        "inputs": [
            "I love this product!",
            "This is the worst experience ever.",
            "The weather is okay, I guess.",
        ],
    },
    # Named-entity recognition: plain sentences.
    "ner": {
        "models": [
            "dbmdz/bert-large-cased-finetuned-conll03-english",
            "Jean-Baptiste/roberta-large-ner-english",
        ],
        "inputs": [
            "Barack Obama was born in Hawaii and worked in Washington D.C.",
            "Apple is headquartered in Cupertino, California.",
        ],
    },
    # Extractive QA: each input is a question/context pair.
    "question-answering": {
        "models": [
            "deepset/tinyroberta-squad2",
            "distilbert/distilbert-base-cased-distilled-squad",
        ],
        "inputs": [
            {
                "question": "Where was Barack Obama born?",
                "context": "Barack Obama was born in Hawaii.",
            },
            {
                "question": "What is the capital of France?",
                "context": "France's capital is Paris.",
            },
        ],
    },
    # Summarization: short paragraphs.
    "summarization": {
        "models": [
            "Falconsai/text_summarization",
            "google-t5/t5-base",
        ],
        "inputs": [
            "The Hubble Space Telescope is a large telescope in space. It was launched in 1990 and has taken stunning images of distant galaxies. Scientists use it to study the universe.",
            "Renewable energy sources like solar and wind are becoming more popular. They reduce reliance on fossil fuels and lower carbon emissions.",
        ],
    },
    # Audio tasks share one sample file under data_dir.
    "audio-classification": {
        "models": [
            "MIT/ast-finetuned-audioset-10-10-0.4593",
            "superb/hubert-base-superb-er",
        ],
        "inputs": [f"{data_dir}/audio/mlk.flac"],
    },
    "automatic-speech-recognition": {
        "models": [
            "openai/whisper-tiny",
            "openai/whisper-small",
        ],
        "inputs": [f"{data_dir}/audio/mlk.flac"],
    },
    # Image tasks share one sample file under data_dir.
    "image-classification": {
        "models": [
            "Falconsai/nsfw_image_detection",
            "google/vit-base-patch16-224",
        ],
        "inputs": [f"{data_dir}/image/cat_dog.jpg"],
    },
    "object-detection": {
        "models": [
            "facebook/detr-resnet-50",
            "microsoft/conditional-detr-resnet-50",
        ],
        "inputs": [f"{data_dir}/image/cat_dog.jpg"],
    },
}

# --------------------------
# 2. Helper Functions
# --------------------------
def load_pipeline(task, model_name):
    """Build the pipeline for ``task`` backed by ``model_name``.

    Loading is best-effort: any failure is reported on stdout and ``None``
    is returned instead of raising, so one bad model does not abort the run.
    """
    loaded = None
    try:
        loaded = pipeline(task, model=model_name)
    except Exception as exc:
        print(f"⚠️ Error loading {task} model '{model_name}': {exc}")
    return loaded

def run_task(pipeline_fn, inputs, task_name):
    """Run inference on every input and measure the average latency.

    Args:
        pipeline_fn: Callable invoked once per input, or ``None`` when the
            model could not be loaded.
        inputs: Inputs passed one at a time to ``pipeline_fn``.
        task_name: Name of the task. Not used by this function; kept so the
            existing call signature stays unchanged.

    Returns:
        A ``(results, avg_latency)`` tuple. ``results`` holds one entry per
        input: the pipeline output, or ``{"error": <message>}`` when that
        input failed or the model was not loaded. ``avg_latency`` is the mean
        latency in seconds over all inputs (failed inputs count as 0), or 0
        when there is nothing to average.
    """
    if pipeline_fn is None:
        # Return the same 2-tuple shape as the normal path: the caller
        # unpacks `results, avg_latency`, so a bare list would break it.
        return [{"error": "Model not loaded"} for _ in inputs], 0

    results = []
    latencies = []
    for input_data in inputs:
        try:
            # perf_counter is monotonic, so the measured interval cannot be
            # distorted by system clock adjustments.
            start_time = time.perf_counter()
            output = pipeline_fn(input_data)
            latency = time.perf_counter() - start_time
        except Exception as e:
            # Best-effort: record the failure and continue with the next input.
            latencies.append(0)
            results.append({"error": str(e)})
        else:
            latencies.append(latency)
            results.append(output)
    avg_latency = sum(latencies) / len(latencies) if latencies else 0
    return results, avg_latency

def check_local_model_path():
    """Report whether the module-level ``local_model_path_dir`` is usable.

    Returns:
        ``True`` when ``local_model_path_dir`` is defined, non-empty and
        names an existing directory; ``False`` otherwise. An undefined
        variable is treated as "not configured" and produces no output;
        every other failure prints a warning.
    """
    # The variable is optional and may be absent altogether. Only globals()
    # is consulted: inside this function locals() can never contain it.
    if 'local_model_path_dir' not in globals():
        return False

    model_dir = globals()['local_model_path_dir']
    try:
        if not model_dir:
            print("\n⚠️ local_model_path_dir is empty or None")
            return False

        if not os.path.exists(model_dir):
            print(f"\n⚠️ Local model path does not exist: {model_dir}")
            return False

        # Model folders are joined onto this path, so a regular file is
        # not a usable value even though it exists.
        if not os.path.isdir(model_dir):
            print(f"\n⚠️ Local model path is not a directory: {model_dir}")
            return False
    except (TypeError, ValueError, OSError) as e:
        # E.g. a value of a type the os.path functions cannot interpret.
        print(f"\n⚠️ Error while checking local_model_path_dir: {str(e)}")
        return False

    return True

# --------------------------
# 3. Run Comparisons
# --------------------------
# Decide once whether pre-downloaded models are used. Re-checking for every
# model would repeat the filesystem probe and any warning it prints.
use_local_models = check_local_model_path()
if use_local_models:
    print(f"using pre-downloaded models, local_model_path_dir={local_model_path_dir}")
else:
    print("not using pre-downloaded models.")

# all_results[task_name][model_name] -> {"outputs": [...], "avg_latency_sec": ...}
all_results = {}
for task_name, task_data in tasks.items():
    print(f"🔍 Task: {task_name.upper()}")
    all_results[task_name] = {}
    inputs = task_data["inputs"]

    for model_name in task_data["models"]:
        print(f"   → Model: {model_name}")
        # Offline mode loads from <local_model_path_dir>/<model_name>;
        # otherwise the model name itself is handed to the loader.
        if use_local_models:
            model_path = os.path.join(local_model_path_dir, model_name)
        else:
            model_path = model_name
        pipeline_fn = load_pipeline(task_name, model_path)

        results, avg_latency = run_task(pipeline_fn, inputs, task_name)
        # Results are always keyed by the model name, not the local path.
        all_results[task_name][model_name] = {
            "outputs": results,
            "avg_latency_sec": avg_latency
        }

        # Print sample output (first input only)
        if not results:
            # A task with no inputs has nothing to show; avoid indexing [0].
            print("      Sample Output: N/A")
        elif isinstance(results[0], dict) and "error" in results[0]:
            print(f"      Error: {results[0].get('error', 'Unknown error')}")
        else:
            print(f"      Sample Output: {results[0]}")

# --------------------------
# 4. Display Summary (Optional)
# --------------------------
# One table per task: model name, average latency, and a preview of the
# first output truncated to 100 characters.
for task_name, task_results in all_results.items():
    print(f"\n📊 Summary for {task_name}:")
    print("-" * 80)

    # The model column is as wide as the longest model name, but never
    # narrower than 15 characters.
    name_width = max([15, *map(len, task_results)])

    # Table header, underlined with a rule of the same length.
    header = " | ".join(
        [f"{'Model':<{name_width}}", f"{'Avg Latency':^12}", "Sample Output"]
    )
    print(header)
    print("-" * len(header))

    for model_name, data in task_results.items():
        outputs = data["outputs"]
        preview = str(outputs[0] if outputs else "N/A")
        if len(preview) > 100:
            preview = preview[:100] + "..."

        latency_cell = f"{data['avg_latency_sec']:.4f}s".center(12)
        print(f"{model_name:<{name_width}} | {latency_cell} | {preview}")



env for: HF_ENDPOINT=not set
env for: PROJECT_HOME=not set
data_dir=.\homeworks/week1/transformers/data
using pre-downloaded models, local_model_path_dir=C:\Users\bill\hfd
🔍 Task: SENTIMENT-ANALYSIS
   → Model: tabularisai/multilingual-sentiment-analysis


Device set to use cpu


      Sample Output: [{'label': 'Very Positive', 'score': 0.49209871888160706}]
   → Model: ProsusAI/finbert


Device set to use cpu


      Sample Output: [{'label': 'neutral', 'score': 0.8351427316665649}]
   → Model: finiteautomata/bertweet-base-sentiment-analysis


emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
Device set to use cpu
Device set to use cpu


      Sample Output: [{'label': 'POS', 'score': 0.9927709698677063}]
   → Model: boltuix/bert-emotion
      Sample Output: [{'label': 'love', 'score': 0.9541232585906982}]
   → Model: cardiffnlp/twitter-roberta-base-sentiment-latest


Some weights of the model checkpoint at C:\Users\bill\hfd\cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu
Device set to use cpu


      Sample Output: [{'label': 'positive', 'score': 0.9848045110702515}]
   → Model: distilbert/distilbert-base-uncased-finetuned-sst-2-english
      Sample Output: [{'label': 'POSITIVE', 'score': 0.9998855590820312}]
🔍 Task: NER
   → Model: dbmdz/bert-large-cased-finetuned-conll03-english


Some weights of the model checkpoint at C:\Users\bill\hfd\dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


      Sample Output: [{'entity': 'I-PER', 'score': np.float32(0.99915826), 'index': 1, 'word': 'Barack', 'start': 0, 'end': 6}, {'entity': 'I-PER', 'score': np.float32(0.999514), 'index': 2, 'word': 'Obama', 'start': 7, 'end': 12}, {'entity': 'I-LOC', 'score': np.float32(0.9992415), 'index': 6, 'word': 'Hawaii', 'start': 25, 'end': 31}, {'entity': 'I-LOC', 'score': np.float32(0.99937963), 'index': 10, 'word': 'Washington', 'start': 46, 'end': 56}, {'entity': 'I-LOC', 'score': np.float32(0.9992785), 'index': 11, 'word': 'D', 'start': 57, 'end': 58}, {'entity': 'I-LOC', 'score': np.float32(0.9994648), 'index': 13, 'word': 'C', 'start': 59, 'end': 60}]
   → Model: Jean-Baptiste/roberta-large-ner-english


Device set to use cpu


      Sample Output: [{'entity': 'PER', 'score': np.float32(0.9976821), 'index': 1, 'word': 'ĠBarack', 'start': 0, 'end': 6}, {'entity': 'PER', 'score': np.float32(0.99900657), 'index': 2, 'word': 'ĠObama', 'start': 7, 'end': 12}, {'entity': 'LOC', 'score': np.float32(0.9998404), 'index': 6, 'word': 'ĠHawaii', 'start': 25, 'end': 31}, {'entity': 'LOC', 'score': np.float32(0.9993886), 'index': 10, 'word': 'ĠWashington', 'start': 46, 'end': 56}, {'entity': 'LOC', 'score': np.float32(0.9989309), 'index': 11, 'word': 'ĠD', 'start': 57, 'end': 58}, {'entity': 'LOC', 'score': np.float32(0.9946662), 'index': 12, 'word': '.', 'start': 58, 'end': 59}, {'entity': 'LOC', 'score': np.float32(0.99861884), 'index': 13, 'word': 'C', 'start': 59, 'end': 60}, {'entity': 'LOC', 'score': np.float32(0.9946662), 'index': 14, 'word': '.', 'start': 60, 'end': 61}]
🔍 Task: QUESTION-ANSWERING
   → Model: deepset/tinyroberta-squad2


Device set to use cpu
Device set to use cpu


      Sample Output: {'score': 0.9824754595756531, 'start': 25, 'end': 31, 'answer': 'Hawaii'}
   → Model: distilbert/distilbert-base-cased-distilled-squad
      Sample Output: {'score': 0.9777411222457886, 'start': 25, 'end': 31, 'answer': 'Hawaii'}
🔍 Task: SUMMARIZATION
   → Model: Falconsai/text_summarization


Device set to use cpu
Your max_length is set to 200, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max_length is set to 200, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)


      Sample Output: [{'summary_text': 'Hubble Space Telescope is a large telescope in space . It was launched in 1990 and has taken stunning images of distant galaxies .'}]
   → Model: google-t5/t5-base


Device set to use cpu
Your max_length is set to 200, but your input_length is only 43. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=21)
Your max_length is set to 200, but your input_length is only 28. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)


      Sample Output: [{'summary_text': 'the Hubble Space Telescope is a large telescope in space . it was launched in 1990 and has taken stunning images of distant galaxies .'}]
🔍 Task: AUDIO-CLASSIFICATION
   → Model: MIT/ast-finetuned-audioset-10-10-0.4593


Device set to use cpu


      Sample Output: [{'score': 0.42077377438545227, 'label': 'Speech'}, {'score': 0.1793096959590912, 'label': 'Rain on surface'}, {'score': 0.130074605345726, 'label': 'Rain'}, {'score': 0.09597001224756241, 'label': 'Raindrop'}, {'score': 0.05782567337155342, 'label': 'Music'}, {'score': 0.03544803336262703, 'label': 'Male speech, man speaking'}, {'score': 0.02037866786122322, 'label': 'Narration, monologue'}, {'score': 0.0032378523610532284, 'label': 'Outside, urban or manmade'}, {'score': 0.0028150458820164204, 'label': 'Rustle'}, {'score': 0.0027943658642470837, 'label': 'Thunder'}, {'score': 0.002371752168983221, 'label': 'Inside, large room or hall'}, {'score': 0.002058965852484107, 'label': 'Run'}, {'score': 0.001928255776874721, 'label': 'Thunderstorm'}, {'score': 0.0015538015868514776, 'label': 'Boiling'}, {'score': 0.0012381412088871002, 'label': 'Television'}, {'score': 0.0010997636709362268, 'label': 'Speech synthesizer'}, {'score': 0.0010972226737067103, 'label': 'Outsid

Device set to use cpu


      Sample Output: [{'score': 0.4532127380371094, 'label': 'hap'}, {'score': 0.3622136116027832, 'label': 'sad'}, {'score': 0.09430024027824402, 'label': 'neu'}, {'score': 0.09027334302663803, 'label': 'ang'}]
🔍 Task: AUTOMATIC-SPEECH-RECOGNITION
   → Model: openai/whisper-tiny


Device set to use cpu


      Sample Output: {'text': ' I have a dream, but one day, this nation will rise up, live out the true meaning of its dream.'}
   → Model: openai/whisper-small


Device set to use cpu
Device set to use cpu


      Sample Output: {'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
🔍 Task: IMAGE-CLASSIFICATION
   → Model: Falconsai/nsfw_image_detection
      Sample Output: [{'label': 'normal', 'score': 0.9998505115509033}, {'label': 'nsfw', 'score': 0.00014946982264518738}]
   → Model: google/vit-base-patch16-224


Fast image processor class <class 'transformers.models.vit.image_processing_vit_fast.ViTImageProcessorFast'> is available for this model. Using slow image processor class. To use the fast image processor class set `use_fast=True`.
Device set to use cpu


      Sample Output: [{'label': 'collie', 'score': 0.2729230523109436}, {'label': 'papillon', 'score': 0.1813911646604538}, {'label': 'Border collie', 'score': 0.13854974508285522}, {'label': 'Japanese spaniel', 'score': 0.0916733592748642}, {'label': 'Shetland sheepdog, Shetland sheep dog, Shetland', 'score': 0.07215303927659988}]
🔍 Task: OBJECT-DETECTION
   → Model: facebook/detr-resnet-50


Some weights of the model checkpoint at C:\Users\bill\hfd\facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


      Sample Output: [{'score': 0.9984685778617859, 'label': 'cat', 'box': {'xmin': 78, 'ymin': 57, 'xmax': 309, 'ymax': 371}}, {'score': 0.9890327453613281, 'label': 'dog', 'box': {'xmin': 279, 'ymin': 20, 'xmax': 482, 'ymax': 416}}]
   → Model: microsoft/conditional-detr-resnet-50


Device set to use cpu


      Sample Output: [{'score': 0.7572169899940491, 'label': 'cat', 'box': {'xmin': 76, 'ymin': 55, 'xmax': 312, 'ymax': 372}}, {'score': 0.6198723316192627, 'label': 'dog', 'box': {'xmin': 274, 'ymin': 19, 'xmax': 484, 'ymax': 415}}]

📊 Summary for sentiment-analysis:
--------------------------------------------------------------------------------
Model                                                      | Avg Latency  | Sample Output
-----------------------------------------------------------------------------------------
tabularisai/multilingual-sentiment-analysis                |   0.1058s    | [{'label': 'Very Positive', 'score': 0.49209871888160706}]
ProsusAI/finbert                                           |   0.0530s    | [{'label': 'neutral', 'score': 0.8351427316665649}]
finiteautomata/bertweet-base-sentiment-analysis            |   0.0798s    | [{'label': 'POS', 'score': 0.9927709698677063}]
boltuix/bert-emotion                                       |   0.0244s    | [{'lab