In [1]:
!pip install transformers torch protobuf tiktoken sentencepiece bitsandbytes==0.43.3 accelerate

Collecting transformers
  Downloading transformers-4.48.2-py3-none-any.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting torch
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl (766.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting protobuf
  Downloading protobuf-5.29.3-cp38-abi3-manylinux2014_x86_64.whl (319 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.7/319.7 KB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylin

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import math

In [4]:
# Checkpoint identifier handed to both `from_pretrained` calls below.
model_name = "OpenLLM-Ro/RoLlama2-7b-Base"

# A tokenizer holds no weights, so it takes no device placement argument;
# its encoded output is moved to the right device when the model is called.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Place every model weight on the first CUDA device (a GPU is required here).
model = AutoModelForCausalLM.from_pretrained(model_name, device_map='cuda:0')

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Take the device from the loaded model so tokenized inputs always land where
# the weights actually are, instead of re-deriving it from CUDA availability
# (the model is already pinned to a CUDA device at load time).
device = model.device
# Switch to evaluation mode; kept as the cell's last expression so the module
# summary is still displayed.
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((4096,), eps=1e-05)
    (rotary_e

In [6]:
def compute_metrics(model, tokenizer, text):
    """Score ``text`` with a causal language model.

    The text is tokenized (truncated to at most 512 tokens), fed to ``model``
    with the input ids doubling as labels, and the resulting loss is turned
    into a perplexity. For Hugging Face causal LMs that loss is the mean
    next-token cross-entropy, so the perplexity is per token.

    Args:
        model: Callable model exposing ``.device`` and returning an object
            with a scalar ``.loss`` tensor when called with
            ``(input_ids, labels=input_ids)``.
        tokenizer: Callable returning a batch that supports ``.to(device)``
            and item access by ``"input_ids"``.
        text: The string to score.

    Returns:
        list: ``[perplexity, neg_log_likelihood]`` as Python floats, where
        ``perplexity == math.exp(neg_log_likelihood)``.

    Raises:
        OverflowError: If the loss is too large for ``math.exp``.
    """
    # Send inputs to the model's own device rather than a notebook-level
    # global, so the function does not depend on which cells ran before it.
    encoded = tokenizer(
        text, return_tensors="pt", truncation=True, max_length=512
    ).to(model.device)
    input_ids = encoded["input_ids"]

    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)

    # Pull the scalar off the device once and reuse it for both results.
    neg_log_likelihood = outputs.loss.item()
    return [math.exp(neg_log_likelihood), neg_log_likelihood]

In [7]:
import json
from tqdm import tqdm

In [8]:
# Region identifiers to evaluate; each one is the stem of a
# `dataset_md/<region>.json` file read by the evaluation loop.
regions = [
    'Balti', 'Cahul', 'Calarasi', 'Causeni',
    'Comrat', 'Criuleni', 'Hincesti', 'Ialoveni',
    'Orhei', 'Sangerei', 'Soroca', 'Ungheni',
]
# Sanity check on how many regions will be processed.
print(len(regions))

12


In [9]:
import json
from tqdm import tqdm
import numpy as np

In [10]:
# Per-region evaluation: for every row of dataset_md/<region>.json, score the
# body text and the title separately, then summarise each region.
# results_regions maps region -> one-element list holding the per-row metrics
# and the two aggregate perplexities.
results_regions = {}
for region in regions:
    print(region)
    results_content = {"perplexity": [], "neg_log_likelihood": []}
    results_titles = {"perplexity": [], "neg_log_likelihood": []}

    # JSON text is UTF-8; be explicit so decoding does not depend on the
    # platform's locale default.
    with open(f"dataset_md/{region}.json", encoding="utf-8") as f:
        region_json = json.load(f)

    for row in tqdm(region_json):
        # Rows store their body under 'content' or, failing that, 'text'.
        body = row['content'] if 'content' in row else row['text']
        content_perplexity, content_nll = compute_metrics(model, tokenizer, body)
        results_content['perplexity'].append(content_perplexity)
        results_content['neg_log_likelihood'].append(content_nll)

        title_perplexity, title_nll = compute_metrics(model, tokenizer, row['title'])
        results_titles['perplexity'].append(title_perplexity)
        results_titles['neg_log_likelihood'].append(title_nll)

    # Aggregate perplexity = exp(mean of the per-row losses). Every row counts
    # equally regardless of its length (not weighted by token count).
    perp_content_mean = math.exp(np.mean(results_content['neg_log_likelihood']))
    perp_titles_mean = math.exp(np.mean(results_titles['neg_log_likelihood']))

    results_regions[region] = [{
        'content': results_content,
        'titles': results_titles,
        'perp_mean_content': perp_content_mean,
        'perp_mean_titles': perp_titles_mean,
    }]

Balti


100%|██████████| 948/948 [01:48<00:00,  8.73it/s]


Cahul


100%|██████████| 504/504 [03:00<00:00,  2.80it/s]


Calarasi


100%|██████████| 511/511 [04:10<00:00,  2.04it/s]


Causeni


100%|██████████| 321/321 [02:21<00:00,  2.27it/s]


Comrat


100%|██████████| 179/179 [00:12<00:00, 13.83it/s]


Criuleni


100%|██████████| 509/509 [03:55<00:00,  2.16it/s]


Hincesti


100%|██████████| 20/20 [00:02<00:00,  8.78it/s]


Ialoveni


100%|██████████| 504/504 [03:31<00:00,  2.39it/s]


Orhei


100%|██████████| 512/512 [03:45<00:00,  2.27it/s]


Sangerei


100%|██████████| 775/775 [05:29<00:00,  2.35it/s]


Soroca


100%|██████████| 504/504 [03:05<00:00,  2.72it/s]


Ungheni


100%|██████████| 1023/1023 [08:00<00:00,  2.13it/s]


In [11]:
# Serialise completely before touching the file: if any value cannot be
# encoded, the error surfaces without truncating an existing results file.
results_json = json.dumps(results_regions, indent=4)
with open("results_regions_md.json", "w", encoding="utf-8") as f:
    f.write(results_json)