In [6]:
%%time
%%capture
!pip install peft
!pip install evaluate
!pip install datasets
!pip install transformers==4.51.3
!pip install sentencepiece
!pip install autoawq

CPU times: user 0 ns, sys: 74.9 ms, total: 74.9 ms
Wall time: 9.58 s


In [7]:
# Print the installed autoawq and transformers package metadata (including versions).
!pip show autoawq transformers

Name: autoawq
Version: 0.2.9
Summary: AutoAWQ implements the AWQ algorithm for 4-bit quantization with a 2x speedup during inference.
Home-page: https://github.com/casper-hansen/AutoAWQ
Author: Casper Hansen
Author-email: 
License: MIT
Location: /opt/conda/lib/python3.12/site-packages
Requires: accelerate, datasets, huggingface_hub, tokenizers, torch, transformers, triton, typing_extensions, zstandard
Required-by: 
---
Name: transformers
Version: 4.51.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /opt/conda/lib/python3.12/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: autoawq, peft


In [20]:
%%time
import os
import re
import shutil
from pathlib import Path

import pandas as pd
import torch
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)

CPU times: user 24 μs, sys: 16 μs, total: 40 μs
Wall time: 42.4 μs


In [9]:
# Report whether CUDA is usable and how many devices are visible, one value per line.
print(torch.cuda.is_available(), torch.cuda.device_count(), sep="\n")

True
1


In [10]:
%%time
# Load the tokenizer and the AWQ-quantized checkpoint named below.
model_name = "Qwen/Qwen2.5-14B-Instruct-AWQ"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The install cell pins transformers==4.51.3, whose `from_pretrained` takes the
# dtype through `torch_dtype`. With that release an unknown `dtype=` keyword is
# forwarded to the model constructor and fails on a fresh kernel.
# device_map="auto" lets the library place the weights on the available device(s).
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16, device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

CPU times: user 1.58 s, sys: 557 ms, total: 2.14 s
Wall time: 3.33 s


In [11]:
# Smoke test: render a two-turn chat (system + user) through the model's chat
# template and display the resulting prompt string.
prompt = "Give me a short introduction to large language model."
messages = [
    dict(
        role="system",
        content="You are Qwen, created by Alibaba Cloud. You are a helpful assistant.",
    ),
    dict(role="user", content=prompt),
]
# tokenize=False returns the templated text; add_generation_prompt=True appends
# the assistant header so the model continues with its reply.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
text

'<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nGive me a short introduction to large language model.<|im_end|>\n<|im_start|>assistant\n'

In [12]:
%%time
# Smoke test: generate a reply for the rendered prompt and display it.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
generated_ids = model.generate(**model_inputs, max_new_tokens=512)
# generate() returns prompt + continuation; drop the prompt tokens of each row
# so only the newly generated tokens are decoded.
generated_ids = [
    full_ids[len(prompt_ids) :]
    for prompt_ids, full_ids in zip(model_inputs.input_ids, generated_ids)
]
decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
response = decoded[0]
response

'A large language model is an artificial intelligence system designed to understand and generate human-like text based on the input it receives. These models are typically trained on vast amounts of textual data from the internet and other sources, allowing them to learn patterns in language use, including syntax, semantics, and even contextually appropriate responses. They can perform a wide range of tasks such as answering questions, translating languages, writing stories, summarizing information, and much more. The "large" in their name refers to the size of the neural network, which can have billions of parameters, enabling these models to capture complex relationships within language and produce sophisticated outputs.'

In [15]:
# Input folder holding the per-language dev CSVs, and the folder predictions are written to.
dev_folder = "subtask1/dev/"
output_folder = "subtask_1"
# Create the output folder (and any missing parents); a no-op if it already exists.
Path(output_folder).mkdir(parents=True, exist_ok=True)

In [14]:
# System message shared by every classification request: defines polarization
# and states the task.
system_prompt = """You are an expert in social media content analysis, specializing in detecting polarization in online discourse.

Polarization is defined as sharp division and hostility between social, political, or identity groups. Polarized content typically:
- Creates "us vs. them" divisions
- Shows extreme hostility toward certain groups
- Uses inflammatory or divisive language
- Promotes rigid, uncompromising viewpoints
- Attacks or demonizes opposing groups
- Reinforces group boundaries and antagonism

Your task is to classify whether the given text contains polarized content or not."""


def create_prompt(text):
    """Build the chat messages asking for a Yes/No polarization verdict.

    Args:
        text: The text to classify; it is interpolated verbatim into the
            user message between double quotes.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts: the shared
        system prompt followed by the user instruction containing ``text``.
    """
    instructions = f"""Analyze the following social media text and determine if it contains polarized content.

Text: "{text}"

Consider the overall meaning and context, not just individual words. Only classify as polarized if the text clearly reflects attitude polarization with division and hostility.

Respond with ONLY one word: "Yes" if the text is polarized, or "No" if it is not polarized."""

    system_turn = {"role": "system", "content": system_prompt}
    user_turn = {"role": "user", "content": instructions}
    return [system_turn, user_turn]

In [16]:
def _parse_label(response):
    """Turn the model's raw reply into a binary label.

    The first whole-word "yes" or "no" (case-insensitive) decides the label.
    Matching whole words avoids false positives from words that merely
    contain "yes" (e.g. "yesterday"), and taking the first verdict avoids
    flipping a leading "No" because "yes" appears later in the reply.

    Returns:
        1 if the first verdict word is "yes"; 0 if it is "no" or if neither
        word is present.
    """
    verdict = re.search(r"\b(yes|no)\b", response.lower())
    return 1 if verdict is not None and verdict.group(1) == "yes" else 0


def classify_text(text):
    """Classify one text as polarized (1) or not polarized (0).

    Builds the chat prompt with ``create_prompt``, generates a short reply
    with the module-level ``model``/``tokenizer``, and parses the Yes/No
    verdict out of it.

    Args:
        text: The text to classify.

    Returns:
        1 when the model answers "Yes", otherwise 0.
    """
    messages = create_prompt(text)
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    with torch.no_grad():
        # Greedy decoding: the label must be reproducible across runs, so do
        # not rely on whatever sampling defaults the checkpoint ships with.
        generated_ids = model.generate(
            **model_inputs, max_new_tokens=10, do_sample=False
        )

    # generate() returns prompt + continuation; keep only the new tokens.
    completion_ids = [
        output_ids[len(input_ids) :]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

    return _parse_label(response)

In [21]:
%%time
# Classify every CSV in the dev folder and write one prediction file per input,
# named after the input file's stem (used here as the language code).
csv_files = [*Path(dev_folder).glob("*.csv")]

for csv_file in csv_files:
    lang_code = csv_file.stem
    df = pd.read_csv(csv_file)

    # One model call per row of the "text" column; tqdm shows per-file progress.
    predictions = [
        classify_text(sample)
        for sample in tqdm(df["text"].tolist(), desc=f"Processing {lang_code}")
    ]

    # Output columns: the original "id" plus the predicted "polarization" label.
    output_df = pd.DataFrame({"id": df["id"], "polarization": predictions})
    output_df.to_csv(f"{output_folder}/pred_{lang_code}.csv", index=False)

Processing tel:   0%|          | 0/118 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Processing tel: 100%|██████████| 118/118 [00:28<00:00,  4.13it/s]
Processing hau: 100%|██████████| 182/182 [00:34<00:00,  5.24it/s]
Processing ita: 100%|██████████| 166/166 [00:32<00:00,  5.04it/s]
Processing swa: 100%|██████████| 349/349 [01:04<00:00,  5.38it/s]
Processing urd: 100%|██████████| 177/177 [00:35<00:00,  4.92it/s]
Processing tur: 100%|██████████| 115/115 [00:22<00:00,  5.00it/s]
Processing khm: 100%|██████████| 332/332 [01:30<00:00,  3.67it/s]
Processing hin: 100%|██████████| 137/137 [00:31<00:00,  4.39it/s]
Processing deu: 100%|██████████| 159/159 [00:29<00:00,  5.35it/s]
Processing mya: 100%|██████████| 144/144 [00:37<00:00,  3.83it/s]
Processing pan: 100%|██████████| 100/100 [00:19<00:00,  5.25it/s]
Processing ori: 100%|██████████| 118/118 [00:34<00:00,  3.38it/s]
P

CPU times: user 11min 5s, sys: 2min 5s, total: 13min 10s
Wall time: 13min 13s





In [22]:
shutil.make_archive("subtask_1", "zip", ".", "subtask_1")

'/home/jovyan/work/subtask_1.zip'