In [1]:
!pip install transformers datasets



Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [8]:
from datasets import load_dataset

# Size of the training slice and the minimum text length kept; both are small
# on purpose so the demo trains quickly.
NUM_EXAMPLES = 100
MIN_TEXT_CHARS = 20

# Load the first NUM_EXAMPLES examples of the Arabic OSCAR dataset.
# NOTE(review): trust_remote_code=True lets the Hub-hosted loading script run
# without an interactive confirmation prompt, so the cell works unattended.
# Only enable it for repositories you trust; confirm it is needed for your
# installed datasets version.
raw_dataset = load_dataset(
    "oscar",
    "unshuffled_deduplicated_ar",
    split=f"train[:{NUM_EXAMPLES}]",
    trust_remote_code=True,
)

# Filter out very short samples (MIN_TEXT_CHARS characters or fewer). The raw
# slice stays available as raw_dataset; later cells use the filtered `dataset`.
dataset = raw_dataset.filter(lambda example: len(example["text"]) > MIN_TEXT_CHARS)


Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
from transformers import GPT2Tokenizer

# Reuse the end-of-text token as the padding token; the padding requested in
# tokenize_function below needs a pad token to be defined.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(example):
    """Tokenize the "text" field into fixed-length sequences of 128 tokens.

    Longer texts are truncated and shorter ones are padded to the full length.
    The function is applied through `dataset.map(..., batched=True)`, so
    `example["text"]` is expected to hold a batch of strings.
    """
    encoding_options = dict(truncation=True, padding="max_length", max_length=128)
    return tokenizer(example["text"], **encoding_options)


# Replace the raw "text" column with the tokenizer's output columns.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
from transformers import GPT2LMHeadModel

# Start from the pretrained "gpt2" checkpoint.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Size the embedding matrix to the tokenizer's vocabulary. The call's return
# value is left as the last expression so the notebook displays it.
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)


Embedding(50257, 768)

In [12]:
from transformers import TrainingArguments

# Settings for a short single-epoch demo run, grouped by purpose.
training_options = dict(
    # Where checkpoints and logs are written.
    output_dir="./results_arabic_fast",
    logging_dir="./logs_arabic",
    # Training schedule.
    num_train_epochs=1,
    per_device_train_batch_size=2,
    # Reporting and checkpointing cadence, in steps.
    logging_steps=10,
    save_steps=500,
    save_total_limit=1,
)
training_args = TrainingArguments(**training_options)


In [13]:
from transformers import Trainer, DataCollatorForLanguageModeling

# mlm=False: batches are collated for causal language modelling rather than
# masked language modelling.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire the model, arguments, tokenized data and collator into a Trainer.
trainer_options = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_options)

# Run the fine-tuning; the returned training summary is shown as the cell output.
trainer.train()


Step,Training Loss
10,2.5877
20,2.5377
30,2.4646
40,2.4463
50,2.4888


TrainOutput(global_step=50, training_loss=2.505022201538086, metrics={'train_runtime': 292.0117, 'train_samples_per_second': 0.342, 'train_steps_per_second': 0.171, 'total_flos': 6532300800000.0, 'train_loss': 2.505022201538086, 'epoch': 1.0})

In [14]:
# Persist the fine-tuned model and its tokenizer into the same directory so
# they can be reloaded together from that single path.
save_dir = "./gpt2_arabic_demo"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('./gpt2_arabic_demo/tokenizer_config.json',
 './gpt2_arabic_demo/special_tokens_map.json',
 './gpt2_arabic_demo/vocab.json',
 './gpt2_arabic_demo/merges.txt',
 './gpt2_arabic_demo/added_tokens.json')

In [16]:
from transformers import pipeline

# Build a text-generation pipeline from the model and tokenizer saved on disk.
generator = pipeline("text-generation", model="./gpt2_arabic_demo", tokenizer="./gpt2_arabic_demo")

# Example Arabic prompt ("Artificial intelligence is")
prompt = "الذكاء الاصطناعي هو"

# max_length caps prompt plus generated tokens together. truncation=True is
# passed explicitly because supplying max_length without it makes the tokenizer
# emit a "Truncation was not explicitly activated" warning on every call.
result = generator(prompt, max_length=50, num_return_sequences=1, truncation=True)

print(result[0]["generated_text"])


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


الذكاء الاصطناعي هواض المنت ٱاساتا؂ الخبين عحزيدم ك
