In [1]:
!pip install accelerate bitsandbytes
!pip install --upgrade transformers accelerate bitsandbytes



In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
from typing import List, Tuple
import torch
import bitsandbytes as bnb
import gc

2025-07-08 19:29:40.540320: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752002980.563806     140 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752002980.570761     140 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
model_name = "sileod/deberta-v3-base-tasksource-nli"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
results = {}

# Full Precision

In [6]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats(0)

pipe = pipeline(
    "zero-shot-classification",
    model=model_name,
    tokenizer=tokenizer,
    device_map="auto"
)

allocated_mb = torch.cuda.max_memory_allocated(0) / (1024**2)
reserved_mb = torch.cuda.max_memory_reserved(0) / (1024**2)
results["full"] = {
    "allocated": allocated_mb,
    "reserved": reserved_mb
}
print("Full Precision")
print(f"- Peak VRAM Allocated: {allocated_mb:.2f} MB")
print(f"- Peak VRAM Reserved: {reserved_mb:.2f} MB")

del pipe

Device set to use cuda:0


Full Precision
- Peak VRAM Allocated: 703.53 MB
- Peak VRAM Reserved: 724.00 MB


# Half Precision

In [7]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats(0)

pipe = pipeline(
    "zero-shot-classification",
    model=model_name,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype=torch.float16
)

allocated_mb = torch.cuda.max_memory_allocated(0) / (1024**2)
reserved_mb = torch.cuda.max_memory_reserved(0) / (1024**2)
results["half"] = {
    "allocated": allocated_mb,
    "reserved": reserved_mb
}
print("Half Precision")
print(f"- Peak VRAM Allocated: {allocated_mb:.2f} MB")
print(f"- Peak VRAM Reserved: {reserved_mb:.2f} MB")

del pipe

Device set to use cuda:0


Half Precision
- Peak VRAM Allocated: 563.65 MB
- Peak VRAM Reserved: 728.00 MB


# BNB Quantized

In [8]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats(0)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
    torch_dtype="auto"
)

pipe = pipeline(
    "zero-shot-classification",
    model=model,
    tokenizer=tokenizer,
    device_map="auto"
)

allocated_mb = torch.cuda.max_memory_allocated(0) / (1024**2)
reserved_mb = torch.cuda.max_memory_reserved(0) / (1024**2)
print("Quantized (int4)")
print(f"- Peak VRAM Allocated: {allocated_mb:.2f} MB")
print(f"- Peak VRAM Reserved: {reserved_mb:.2f} MB")
results["quant"] = {
    "allocated": allocated_mb,
    "reserved": reserved_mb
}
quantized_layers = [m for m in model.modules() if isinstance(m, bnb.nn.Linear8bitLt)]
print(f"Quantized layers found: {len(quantized_layers)}")

del pipe
del model

Device set to use cuda:0


Quantized (int4)
- Peak VRAM Allocated: 440.08 MB
- Peak VRAM Reserved: 748.00 MB
Quantized layers found: 0


In [9]:
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats(0)

full_allocated, full_reserved = results["full"]["allocated"], results["full"]["reserved"]
half_allocated, half_reserved = results["half"]["allocated"], results["half"]["reserved"]
quant_allocated, quant_reserved = results["quant"]["allocated"], results["quant"]["reserved"]

print(f"Full (fp32): {100*full_allocated / full_allocated:.2f}% (Allocated {full_allocated:.2f} MB, Reserved: {full_reserved:.2f} MB)")
print(f"Half (fp16): {100*half_allocated / full_allocated:.2f}% (Allocated {half_allocated:.2f} MB, Reserved: {half_reserved:.2f} MB)")
print(f"Quant (int4): {100*quant_allocated / full_allocated:.2f}% (Allocated {quant_allocated:.2f} MB, Reserved: {quant_reserved:.2f} MB)")

Full (fp32): 100.00% (Allocated 703.53 MB, Reserved: 724.00 MB)
Half (fp16): 80.12% (Allocated 563.65 MB, Reserved: 728.00 MB)
Quant (int4): 62.55% (Allocated 440.08 MB, Reserved: 748.00 MB)


# Inference results comparison

In [16]:
from typing import List, Optional
import json
import time
import random
import unicodedata
import re
import emoji


class ZeroShotTagger:
    def __init__(
        self,
        threshold: float = 0.8,
        torch_dtype: Optional[torch.dtype] = None,
        bnb_config: Optional[BitsAndBytesConfig] = None
    ):
        '''
        Initializing the tag classifier

        Args:
            threshold: Confidence threshold for tags
            torch_dtype: Datatype the model will use (ignored if bnb_config is passed)
            bnb_config: BitsAndBytes quantization config
        '''
        self.threshold = threshold

        self._model_name = "sileod/deberta-v3-base-tasksource-nli"

        self._tokenizer = AutoTokenizer.from_pretrained(
            self._model_name,
            model_max_length=512
        )

        self._model = AutoModelForSequenceClassification.from_pretrained(
            self._model_name,
            quantization_config=bnb_config,
            torch_dtype="auto" if bnb_config is not None else torch_dtype,
            device_map="auto"
        )

        self.classifier = pipeline(
            "zero-shot-classification",
            model=self._model,
            tokenizer=self._tokenizer,
            device_map="auto"
        )

        # Should be obtained from the database
        self.tags = [
            # --- Primary Event & Opportunity Types ---
            "Workshop",
            "Lecture",
            "Seminar",
            "Talk",
            "Conference",
            "Forum",
            "Hackathon",
            "Olympiad",
            "Contest",
            "Festival",
            "Job Fair",
            "Master Class",
            "Club Meeting",
            "Ball",
            "Concert",
            "Party",
            "Quiz",
            "Game",
            "Internship",
            "Volunteering",
            # --- Common Topics ---
            "Programming",
            "Artificial Intelligence",
            "Computer Science",
            "Machine Learning",
            "Data Science",
            "Cybersecurity",
            "Robotics",
            "Science",
            "Mathematics",
            "Physics",
            "Business",
            "Startups",
            "Design",
            "Art",
            "Music",
            "Dance",
            "Sports",
            "Language Learning"
        ]

    def _preprocess_description(self, event_description: str) -> str:
        '''
        Performs text normalization and cleaning.
        Removes emojis, non-text elements, URLs, and non-standard characters.
        Supports Unicode normalization, HTML tag removal, and whitespace reduction.

        Args:
            text: Raw input string to be processed

        Returns:
            str: Normalized and cleaned text
        '''
        event_description = emoji.replace_emoji(event_description, replace='')
        event_description = unicodedata.normalize('NFKC', event_description)
        event_description = re.sub(r'<[^>]+>', '', event_description)
        event_description = re.sub(r'https?://\S+|www\.\S+', '', event_description)
        event_description = re.sub(r'[^\w\s.,!?;:()"\'-]', '', event_description)
        event_description = re.sub(r'\s+', ' ', event_description).strip()
        return event_description

    def predict(self, event_description: str, tag_list: List = []) -> List[str]:
        '''
        Identifies relevant tags for an event description using zero-shot classification.
        Returns tags above the specified threshold.

        Args:
            event_description (str): Text description of the event to classify

        Returns:
            list: List of tag filtered by threshold,
                sorted by descending confidence (order from classifier output)
        '''
        if len(tag_list) == 0:
            tag_list = self.tags
        # Preprocessing a description
        event_description = self._preprocess_description(event_description)

        result = self.classifier(
            event_description,
            candidate_labels=tag_list,
            truncation=True,
            padding="max_length",
            multi_label=True
        )

        # Filtering tags by threshold
        relevant_tags = [
            tag
            for tag, score in zip(result['labels'], result['scores'])
            if score >= self.threshold
        ]

        return relevant_tags

In [24]:
model_full = ZeroShotTagger(threshold=0.75, torch_dtype=torch.float32)
model_half = ZeroShotTagger(threshold=0.75, torch_dtype=torch.float16)
model_quant = ZeroShotTagger(threshold=0.75, bnb_config=BitsAndBytesConfig(load_in_4bit=True))

Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0


In [12]:
with open("../data/o4u_preprocessed_messages_Jun_07_2025.json", "r", encoding="utf-8") as f:
    posts = json.loads(f.read())

In [25]:
_ = model_full.predict(posts[0])
_ = model_half.predict(posts[0])
_ = model_quant.predict(posts[0])

def benchmark(post):
    start = time.time()
    full_tags = model_full.predict(post)
    full_time = time.time() - start

    start = time.time()
    half_tags = model_half.predict(post)
    half_time = time.time() - start
    
    start = time.time()
    quant_tags = model_quant.predict(post)
    quant_time = time.time() - start

    print(post)
    print(f"Full: {full_tags} ({1000*full_time:.2f} ms)\nHalf: {half_tags} ({1000*half_time:.2f} ms)\nQuant: {quant_tags} ({1000*quant_time:.2f} ms)\n{30*'-'}")

    return full_time, half_time, quant_time

random.seed(42)
post_samples = random.sample(posts, 15)
timings = [benchmark(post) for post in post_samples]
avg_full = sum(t[0] for t in timings) / len(timings)
avg_half = sum(t[1] for t in timings) / len(timings)
avg_quant = sum(t[2] for t in timings) / len(timings)

print(f"\nAverage Times:\nFull: {avg_full:.3f}s | Half: {avg_half:.3f}s | Quant: {avg_quant:.3f}s")

International Fest: International Quest "Inheritance" by InnoQuest Your Russian uncle left you a huge inheritance, but... He set one condition. You will have to travel all over the world to get untold riches Assemble a team of 5-7 people and set off in pursuit of the inheritance The number of teams is limited Fyr fyr fyr March 10, 18:00 University Register your command here
Full: ['Game', 'Festival'] (769.28 ms)
Half: ['Game', 'Festival'] (834.48 ms)
Quant: ['Game'] (1535.33 ms)
------------------------------
INNOPOLIS UNIVERSITY vs INNOPOLIS LYCEUM For the first time students' team will face the guys from lyceum! Experience or creativity - what's more important to win the game? Today we'll find it out in a 11x11 match! Come and support our team, it's really important for every player! And don't forget to wear warm clothes! Today, 17:00, Football pitch
Full: ['Sports'] (795.44 ms)
Half: ['Sports'] (820.51 ms)
Quant: ['Sports'] (1437.82 ms)
------------------------------
3 new cases hav