In [39]:
import os
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
import pyarabic.araby as araby
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:

# Load pretrained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("aubmindlab/aragpt2-base")
model = TFGPT2LMHeadModel.from_pretrained("aubmindlab/aragpt2-base",from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFGPT2LMHeadModel: ['h.5.attn.masked_bias', 'h.0.attn.masked_bias', 'h.6.attn.masked_bias', 'h.8.attn.masked_bias', 'h.1.attn.masked_bias', 'h.10.attn.masked_bias', 'h.7.attn.masked_bias', 'h.4.attn.masked_bias', 'h.3.attn.masked_bias', 'h.9.attn.masked_bias', 'h.11.attn.masked_bias', 'h.2.attn.masked_bias']
- This IS expected if you are initializing TFGPT2LMHeadModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFGPT2LMHeadModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was tra

In [4]:
# read the data 
data = pd.read_csv('Arabic_poetry_dataset.csv')
# shuffle the data 
data = data.sample(frac=1)
# select the first 500 rows
data = data.sample(500)
data

Unnamed: 0,id,category,poet_name,poem_title,poem_text
22523,22697,العصر العباسي,الأحنف العكبري,تنبه أيها,تنبّه أيّها\nوصمّم إن طول النوم عار\nتوقر عن م...
52032,52206,العصر المملوكي,صفي الدين الحلي,لي صديق لا يعرف الصدق في القو,لي صَديقٌ لا يَعرِفُ الصِدقَ في القَو\nلِ وَلي...
27467,27641,العصر الايوبي,القاضي الفاضل,سأل اللوى وسؤاله تعليل,سَأَلَ اللوى وَسُؤالُهُ تَعليلُ\nوَمِنَ المُحا...
41268,41442,العصر الاموي,ذو الرمة,بيضاء صفراء قد تنازعها,بَيضاءُ صَفراءُ قَد تَنازَعَها\nلَونانِ مِن فِ...
39589,39763,العصر الاموي,جرير,بحري قومي هيجي الأحزانا,بَحَرِيَّ قومي هَيِّجي الأَحزانا\nوَاِستَعجِلِ...
...,...,...,...,...,...
1777,1884,العراق,بهاء الدين الصيادي,إذا ما صغت في المحبوب نظما,إذا ما صِغْتُ في المَحْبوبِ نَظْماً\nيُساعِدُن...
15783,15957,العصر العباسي,ابن الرومي,أيسير مدحي في الأمير وكله,أيسيرُ مدحي في الأمير وكلُّهُ\nيا للرجال مُؤرّ...
51020,51194,العصر المملوكي,لسان الدين بن الخطيب,لم يبق لي جود الولاية حاجة,لَمْ يُبْقِ لِي جُودُ الْوِلايَةِ حَاجَةً\nفِي...
8044,8217,لبنان,أبو الفضل الوليد,ليالي النوى حتام ترخين برقعا,ليالي النّوى حتّامَ ترخينَ برقعا\nلينتابني ذكر...


In [18]:
# read the poems into a list
lines = data['poem_text'].values.tolist()

In [19]:
lines[1]

'لي صَديقٌ لا يَعرِفُ الصِدقَ في القَو\nلِ وَليسَ الصَديقُ إِلّا الصَدوقُ\nلَيسَ فيهِ تَصَوُّرٌ يُدرِكُ العِل\nمَ وَلا لي إِن قُلتُهُ تَصديقُ'

In [21]:
# join all poems into one string
lines = "\n".join(lines)
# split the string into a list of lines
lines = lines.split("\n")
lines[1]

'وصمّم إن طول النوم عار'

In [22]:
len(lines)

12148

In [25]:
# remove diacritics from the lines
lines = [araby.strip_diacritics(line) for line in lines]

In [26]:
# convert the lines into sequences of tokens
tokenized_lines = [tokenizer.encode(line, add_special_tokens=True) for line in lines]


In [33]:
lines[1]

'وصمم إن طول النوم عار'

In [32]:
tokenized_lines[1]

[1257, 1765, 588, 5951, 6059, 16046]

In [34]:
# generate input sequences and labels
input_sequences = [line[:-1] for line in tokenized_lines]
labels = [line[1:] for line in tokenized_lines]


In [36]:
input_sequences[1]

[1257, 1765, 588, 5951, 6059]

In [37]:
labels[1]

[1765, 588, 5951, 6059, 16046]

In [40]:
# Find the maximum sequence length
seq_max_length = max(len(seq) for seq in input_sequences)

In [41]:
# find the maximum label length
label_max_length = max(len(seq) for seq in labels)

In [45]:
# Pad the sequences to the maximum length
padded_sequences = pad_sequences(input_sequences, maxlen=seq_max_length, padding='post')


In [46]:
padded_labels = pad_sequences(labels, maxlen=label_max_length, padding='post')

In [47]:
padded_sequences

array([[ 3614,   949,     0, ...,     0,     0,     0],
       [ 1257,  1765,   588, ...,     0,     0,     0],
       [  273, 61067,   394, ...,     0,     0,     0],
       ...,
       [  273, 12026, 23179, ...,     0,     0,     0],
       [23475,   601, 42163, ...,     0,     0,     0],
       [ 2840,  5829, 43347, ...,     0,     0,     0]])

In [49]:
padded_labels

array([[  949,  7085,     0, ...,     0,     0,     0],
       [ 1765,   588,  5951, ...,     0,     0,     0],
       [61067,   394,   224, ...,     0,     0,     0],
       ...,
       [12026, 23179, 62076, ...,     0,     0,     0],
       [  601, 42163,   284, ...,     0,     0,     0],
       [ 5829, 43347,   537, ...,     0,     0,     0]])

In [50]:
# Convert input sequences and labels to TensorFlow datasets
input_dataset = tf.data.Dataset.from_tensor_slices(padded_sequences)
labels_dataset = tf.data.Dataset.from_tensor_slices(padded_labels)

# Combine input and label datasets
dataset = tf.data.Dataset.zip((input_dataset, labels_dataset))


In [20]:
# Shuffle and batch the dataset
batch_size = 16
dataset = dataset.shuffle(buffer_size=len(input_sequences))
dataset = dataset.batch(batch_size)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),metrics=['accuracy'])

# Fine-tuning
model.fit(dataset, epochs=3)


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1dfb3391988>

In [25]:
seed_test= "وحلفت ان"
input_ids = tokenizer.encode(seed_test, return_tensors='tf')
sample_outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=50,
    top_k=0,
    top_p=0.9,
    temperature=0.8,
    num_return_sequences=1
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [26]:
print("Output:", tokenizer.decode(sample_outputs[0], skip_special_tokens=True))


Output: وحلف انك لا تخون عدوا لك وإن كنتم لا تخون عدوا بل عدوا فعقابكم عليه ومن عاداهم فلا تخون بهم عدوهم وإن كنت عدوا فانقصوا العهد واجمعوا على قود الأعداء وأنوكوا على مقودهم


In [27]:
# save model
model.save_pretrained('gpt2_fine_tuned_model_arabic')
# save tokenizer
tokenizer.save_pretrained('gpt2_tokenizer_fine_tuned_model_arabic')

('gpt2_tokenizer_fine_tuned_model_arabic\\tokenizer_config.json',
 'gpt2_tokenizer_fine_tuned_model_arabic\\special_tokens_map.json',
 'gpt2_tokenizer_fine_tuned_model_arabic\\vocab.json',
 'gpt2_tokenizer_fine_tuned_model_arabic\\merges.txt',
 'gpt2_tokenizer_fine_tuned_model_arabic\\added_tokens.json')