In [1]:
!pip install transformers torch protobuf tiktoken sentencepiece bitsandbytes==0.43.3 accelerate

Collecting transformers
  Downloading transformers-4.48.2-py3-none-any.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting torch
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl (766.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting protobuf
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.7/319.7 KB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinu

In [6]:
!pip install bitsandbytes -U

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl (69.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.7/69.7 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: bitsandbytes
  Attempting uninstall: bitsandbytes
    Found existing installation: bitsandbytes 0.43.3
    Uninstalling bitsandbytes-0.43.3:
      Successfully uninstalled bitsandbytes-0.43.3
Successfully installed bitsandbytes-0.45.1


In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import math

In [2]:
# Hugging Face Hub checkpoint evaluated by this notebook.
model_name = "andreidima/Llama-2-7b-Romanian-qlora"

# A tokenizer has no device placement, so `device_map` is not passed here
# (it would only be stored as a stray init kwarg).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Place the whole model on the first CUDA device.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='cuda:0')

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/4.17G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/183 [00:00<?, ?B/s]

In [3]:
# Device used for input tensors: the GPU when CUDA is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Switch the model to evaluation mode. Being the cell's last expression, the
# returned module is also displayed as the cell output.
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096

In [4]:
def compute_metrics(model, tokenizer, text, max_length=512):
    """Score ``text`` with a causal language model.

    The text is tokenized (truncated to ``max_length`` tokens) and passed
    through ``model`` with the input ids doubling as labels. The ``loss``
    returned by the model is reported as the negative log-likelihood (for
    Hugging Face causal LMs this is the mean next-token cross-entropy) and
    its exponential as the perplexity.

    Args:
        model: Callable as ``model(input_ids, labels=input_ids)`` returning an
            object with a scalar ``loss`` tensor. Inputs are moved to
            ``model.device`` when that attribute exists, otherwise to the
            device of the model's first parameter.
        tokenizer: Callable as ``tokenizer(text, return_tensors="pt",
            truncation=True, max_length=...)``; its result must support
            ``.to(device)`` and ``["input_ids"]``.
        text: The string to score.
        max_length: Maximum number of tokens kept after truncation.

    Returns:
        ``[perplexity, neg_log_likelihood]`` as Python floats. Both are
        ``nan`` when the text tokenizes to fewer than two tokens, since there
        is then no next-token prediction to score. ``perplexity`` is ``inf``
        if the exponential overflows a float.
    """
    # Follow the model's own placement instead of depending on a notebook-level
    # `device` variable defined in some other cell.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = next(model.parameters()).device

    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=max_length
    ).to(target_device)
    input_ids = inputs["input_ids"]

    # With fewer than two tokens there is nothing to predict; return NaN
    # explicitly instead of running a forward pass for an undefined loss.
    if input_ids.shape[-1] < 2:
        return [float("nan"), float("nan")]

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)

    # Read the scalar once: every .item() on a GPU tensor forces a sync.
    neg_log_likelihood = outputs.loss.item()
    try:
        perplexity = math.exp(neg_log_likelihood)
    except OverflowError:
        # A pathologically large loss should not abort a long evaluation run.
        perplexity = float("inf")

    return [perplexity, neg_log_likelihood]

In [5]:
import json
from tqdm import tqdm

In [6]:
# Names of the dataset splits to evaluate, in processing order. Each one is
# read from dataset/<name>.json by the evaluation loop.
regions = [
    'Ardeal', 'Banat', 'Bucovina',
    'Canada_EN', 'Canada_Quebec',
    'Crisana', 'Dobrogea',
    'Germania', 'Italia',
    'Maramures', 'Moldova', 'Muntenia',
    'Oltenia', 'Serbia', 'Spania',
    'Ucraina', 'UK',
]

In [10]:
import json
from tqdm import tqdm
import numpy as np

In [11]:
def _mean_perplexity(neg_log_likelihoods):
    """Return exp(mean NLL) over the finite scores, or nan if there are none."""
    # A single NaN/inf score (e.g. from an empty title) would otherwise turn
    # the whole region's average into NaN.
    finite = [value for value in neg_log_likelihoods if math.isfinite(value)]
    if not finite:
        return float("nan")
    return math.exp(np.array(finite).mean())


# Maps region name -> one-element list holding the per-row scores and the
# region-level mean perplexities for article bodies and titles.
results_regions = {}
for region in regions:
    print(region)
    results_content = {"perplexity": [], "neg_log_likelihood": []}
    results_titles = {"perplexity": [], "neg_log_likelihood": []}

    # Read as UTF-8 explicitly so loading does not depend on the platform's
    # default text encoding.
    with open(f"dataset/{region}.json", encoding="utf-8") as f:
        region_json = json.load(f)

    for row in tqdm(region_json):
        # The body is stored under 'content' when present, otherwise 'text'.
        body = row['content'] if 'content' in row else row['text']
        content_perplexity, content_nll = compute_metrics(model, tokenizer, body)
        results_content['perplexity'].append(content_perplexity)
        results_content['neg_log_likelihood'].append(content_nll)

        title_perplexity, title_nll = compute_metrics(model, tokenizer, row['title'])
        results_titles['perplexity'].append(title_perplexity)
        results_titles['neg_log_likelihood'].append(title_nll)

    # Region-level perplexity is exp(mean of per-row NLLs): every row counts
    # equally, regardless of how many tokens it has.
    perp_content_mean = _mean_perplexity(results_content['neg_log_likelihood'])
    perp_titles_mean = _mean_perplexity(results_titles['neg_log_likelihood'])

    results_regions[region] = [{
        'content': results_content,
        'titles': results_titles,
        'perp_mean_content': perp_content_mean,
        'perp_mean_titles': perp_titles_mean,
    }]

Ardeal


100%|██████████| 1542/1542 [05:07<00:00,  5.02it/s]


Banat


100%|██████████| 1124/1124 [03:41<00:00,  5.07it/s]


Bucovina


100%|██████████| 428/428 [01:25<00:00,  4.99it/s]


Canada_EN


100%|██████████| 641/641 [02:07<00:00,  5.04it/s]


Canada_Quebec


100%|██████████| 47/47 [00:07<00:00,  5.94it/s]


Crisana


100%|██████████| 579/579 [01:49<00:00,  5.28it/s]


Dobrogea


100%|██████████| 965/965 [03:09<00:00,  5.09it/s]


Germania


100%|██████████| 500/500 [01:42<00:00,  4.89it/s]


Italia


100%|██████████| 12/12 [00:02<00:00,  4.92it/s]


Maramures


100%|██████████| 656/656 [01:58<00:00,  5.52it/s]


Moldova


100%|██████████| 6395/6395 [22:50<00:00,  4.67it/s]


Muntenia


100%|██████████| 2533/2533 [18:53<00:00,  2.24it/s]


Oltenia


100%|██████████| 4472/4472 [27:17<00:00,  2.73it/s]


Serbia


100%|██████████| 1134/1134 [03:46<00:00,  5.01it/s]


Spania


100%|██████████| 723/723 [02:23<00:00,  5.05it/s]


Ucraina


100%|██████████| 3010/3010 [10:00<00:00,  5.02it/s]


UK


100%|██████████| 499/499 [01:40<00:00,  4.95it/s]


In [12]:
# Persist the per-region results as JSON, pretty-printed with a 4-space indent.
with open("results_regions_qlora_ro.json", "w") as out_file:
    out_file.write(json.dumps(results_regions, indent=4))