In [5]:
! pip install -U torch datasets transformers[torch] wandb -q
! pip install scikit-learn



In [6]:
import torch
import pandas as pd
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, AutoModelForCausalLM, AutoConfig, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [7]:
df = pd.read_csv('assets/data/articles_cleaned.csv')[['content']]
df.head()

Unnamed: 0,content
0,الوالي الزاز كود العيون\n \nتلقى الرئيس الجزائ...
1,الوالي الزاز كود العيون\n \nبدا مستشار الرئيس ...
2,أنس العمري كود\nحذرت المملكة المتحدة مواطنيها ...
3,كود الرباط\nالسياسة فكولشي من الرياضة للثقافة...
4,سعيد الشاوي كود\nالعطلة الصيفية 2025 مزال مستم...


In [8]:
x_train, x_test = train_test_split(df, test_size=0.2, random_state=42)

x_train = x_train.reset_index(drop=True)
x_test = x_test.reset_index(drop=True)

dataset = DatasetDict({
    'train': Dataset.from_pandas(x_train),
    'test': Dataset.from_pandas(x_test)
})

In [9]:
model_id="HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [10]:
# check the vocabe size of our tokenizer
print(f"{len(tokenizer)}")
# model max length (means the max len of the input)
print(f"{tokenizer.model_max_length}")
# tokenizer special tokens
print(tokenizer.special_tokens_map)

49152
8192
{'bos_token': '<|im_start|>', 'eos_token': '<|im_end|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|im_end|>', 'additional_special_tokens': ['<|im_start|>', '<|im_end|>']}


In [11]:
example="Salam ana othmane"
ids=tokenizer.encode(example)
print(ids)
tokens=tokenizer.convert_ids_to_tokens(ids)
print(tokens)
decode_=tokenizer.decode(ids)
print(decode_)

[20052, 332, 16143, 263, 373, 1483, 85]
['Sal', 'am', 'Ġana', 'Ġo', 'th', 'man', 'e']
Salam ana othmane


In [12]:
def tokenize(examples, context_length=128):
  results=tokenizer(
      examples["content"],
      truncation=True,
      max_length=context_length,
      return_overflowing_tokens=True, # with this you will get also the input ids with length less than context_length
      return_length=True
  )
  input_batch=[]
  for l,in_ids in zip(results["length"],results["input_ids"]):
    if l==context_length:
      input_batch.append(in_ids)
  return {"input_ids":input_batch}

In [13]:
tokenized_dataset = dataset.map(tokenize,batched=True,remove_columns=dataset["train"].column_names)

Map: 100%|██████████| 8143/8143 [00:12<00:00, 633.54 examples/s]
Map: 100%|██████████| 2036/2036 [00:03<00:00, 619.88 examples/s]


In [14]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=False)
config = AutoConfig.from_pretrained(model_id)
model = AutoModelForCausalLM.from_config(config)
args = TrainingArguments(
    output_dir="test_dir",
    num_train_epochs=2,
    per_device_eval_batch_size=8,
    per_device_train_batch_size=8,
    learning_rate=5e-4,
    warmup_steps=100,
    lr_scheduler_type="linear",
    save_steps=100,
    save_total_limit=2,
    fp16=False,
    logging_steps=2,
    push_to_hub=False,
    report_to="wandb",
)
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

  trainer = Trainer(


In [None]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mo-azoubi-machacile[0m ([33mo-azoubi-machacile-mohammed-v-university-in-rabat[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
