In [None]:
!pip install -U datasets

In [2]:
!pip install --force-reinstall charset-normalizer==3.1.0

Collecting charset-normalizer==3.1.0
  Using cached charset_normalizer-3.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (195 kB)
Installing collected packages: charset-normalizer
  Attempting uninstall: charset-normalizer
    Found existing installation: charset-normalizer 3.1.0
    Uninstalling charset-normalizer-3.1.0:
      Successfully uninstalled charset-normalizer-3.1.0
Successfully installed charset-normalizer-3.1.0


In [2]:
from datasets import load_dataset

In [3]:
# Load the "arbml/ashaar" dataset; the cells below read its 'train' split.
dataset = load_dataset("arbml/ashaar")

Found cached dataset parquet (/home/g201080740/.cache/huggingface/datasets/arbml___parquet/MagedSaeed--ashaar-719bb58a76ea0092/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
# First 1000 rows of the train split.
# NOTE(review): in the cells visible here `subset` is only displayed; the
# map/filter steps further down run on the full `dataset`, not on this sample.
subset = dataset['train'].select(range(1000))

In [5]:
subset

Dataset({
    features: ['poem title', 'poem meter', 'poem verses', 'poem theme', 'poem url', 'poet name', 'poet description', 'poet url', 'poet era', 'poet location', 'poem description', 'poem language type'],
    num_rows: 1000
})

In [6]:
from pyarabic.araby import strip_tashkeel
import re 
from poetry_diacritizer.util.constants import *
from diacritization_evaluation.util import extract_haraqat

def check_percentage_tashkeel(sample, perc = 0.95):
    """Tell whether `sample` carries diacritics on more than `perc` of its letters.

    Punctuation listed in ``PUNCTUATIONS`` and all spaces are stripped, the
    remaining text is split with ``extract_haraqat``, and the number of
    non-empty diacritic entries is divided by the number of characters that
    are not one of ا, و, ي, ى.

    Args:
        sample: the text (one verse) to inspect.
        perc: ratio that must be strictly exceeded for the text to qualify.

    Returns:
        bool: True when the ratio is greater than ``perc``. False when the
        input is empty/None, when extraction raises, or when no countable
        character is left.
    """
    # Nothing to measure on an empty or missing verse.
    if not sample:
        return False

    # Escape every mark before building the character class: joined raw, a '-'
    # sitting between two marks is read as a range, and ']', '^' or '\\' would
    # corrupt the class instead of being matched literally.
    punctuation_chars = ''.join(re.escape(mark) for mark in PUNCTUATIONS)
    if punctuation_chars:
        sample = re.sub('[' + punctuation_chars + ']', ' ', sample)
    # All spaces are dropped (not collapsed) -- presumably so whitespace is not
    # counted among the characters that lack a diacritic; confirm if changed.
    sample = re.sub(' +', '', sample)
    try:
        _, plain, harakat = extract_haraqat(sample)
        plain   = [char for char in plain if char not in 'اويى']
        harakat = [harakah for harakah in harakat if len(harakah) > 0]
    except Exception:
        # Best-effort filter: text the extractor cannot handle is rejected,
        # but KeyboardInterrupt / SystemExit are no longer swallowed.
        return False

    if len(plain) == 0:
        return False
    return len(harakat)/len(plain) > perc

def filter_by_tashkeel(sample):
    out = []
    for bayt in sample['poem verses']:
        if check_percentage_tashkeel(bayt):
            out.append(bayt)
    return {'text': ' '.join(out)}

In [7]:
# Names of every feature in the train split, in iteration order.
columns = list(dataset['train'].features)

In [8]:
# Run filter_by_tashkeel on every record and drop the original columns, so
# only the new 'text' field produced by the function remains.
cleaned = dataset.map(filter_by_tashkeel, remove_columns = columns)

Loading cached processed dataset at /home/g201080740/.cache/huggingface/datasets/arbml___parquet/MagedSaeed--ashaar-719bb58a76ea0092/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-08090717de7913cb.arrow


In [9]:
# Keep only records whose joined text is longer than 10 characters; this also
# removes records where no verse passed the tashkeel check ('text' is empty).
filtered = cleaned.filter(lambda example: len(example['text']) > 10)

Loading cached processed dataset at /home/g201080740/.cache/huggingface/datasets/arbml___parquet/MagedSaeed--ashaar-719bb58a76ea0092/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-019cdc4d6a0b1138.arrow


In [10]:
filtered

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 26091
    })
})

In [11]:
filtered['train'][5000]['text']

'فَهَزَّ رُدَينِيّاً كَأَنَّ كُعوبَهُ'

In [12]:
from datasets import DatasetDict

def generate_splits(dataset, test_size=0.1, seed=41):
    """Split ``dataset['train']`` into 'train', 'test' and 'valid' subsets.

    A ``test_size`` share of the rows is held out first; that held-out part
    is then cut in half, one half becoming 'test' and the other 'valid'.

    Args:
        dataset: mapping with a 'train' entry that offers ``train_test_split``.
        test_size: share of rows held out for test + valid combined
            (default 0.1, the value previously hard-coded).
        seed: seed passed to both split calls (default 41, the value
            previously hard-coded).

    Returns:
        DatasetDict: with keys 'train', 'test' and 'valid', in that order.
    """
    held_out = dataset['train'].train_test_split(test_size=test_size, seed=seed)
    # Halve the held-out rows: the 'test' half stays test, the rest is validation.
    halves = held_out['test'].train_test_split(test_size=0.5, seed=seed)
    return DatasetDict({
        'train': held_out['train'],
        'test': halves['test'],
        'valid': halves['train'],
    })

In [13]:
filtered['train'].features

{'text': Value(dtype='string', id=None)}

In [14]:
def save_dataset_csv(dataset, path = 'data/CA_MSA'):
    """Write the train / valid / test splits as header-less UTF-8 CSV files.

    Files written under ``path``: ``train.csv`` (from 'train'), ``eval.csv``
    (from 'valid') and ``test.csv`` (from 'test').

    Args:
        dataset: mapping with 'train', 'valid' and 'test' entries, each
            offering a ``to_csv`` method.
        path: output directory; it is created (with parents) when missing.
    """
    import os  # local import so the function does not depend on another cell

    # Create the target directory up front so the export does not rely on a
    # separate `mkdir` having been run beforehand.
    if path:
        os.makedirs(path, exist_ok=True)

    # Note the 'valid' split is stored as eval.csv.
    split_files = (('train', 'train.csv'), ('valid', 'eval.csv'), ('test', 'test.csv'))
    for split_name, file_name in split_files:
        dataset[split_name].to_csv(f'{path}/{file_name}', encoding='utf-8', header= False, index = False)

In [15]:
# Build the train / test / valid splits from the filtered records.
splitted_dataset = generate_splits(filtered)

Loading cached split indices for dataset at /home/g201080740/.cache/huggingface/datasets/arbml___parquet/MagedSaeed--ashaar-719bb58a76ea0092/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-0cad275139025ba0.arrow and /home/g201080740/.cache/huggingface/datasets/arbml___parquet/MagedSaeed--ashaar-719bb58a76ea0092/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-a03014c1092d0636.arrow
Loading cached split indices for dataset at /home/g201080740/.cache/huggingface/datasets/arbml___parquet/MagedSaeed--ashaar-719bb58a76ea0092/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-0ce351534c515fbc.arrow and /home/g201080740/.cache/huggingface/datasets/arbml___parquet/MagedSaeed--ashaar-719bb58a76ea0092/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-769b7593e9a60401.arrow


In [16]:
splitted_dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 23481
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1305
    })
    valid: Dataset({
        features: ['text'],
        num_rows: 1305
    })
})

In [17]:
# Upload every split to the Hub dataset repository 'Zaid/Ashaar_diacritized'.
splitted_dataset.push_to_hub('Zaid/Ashaar_diacritized')

Pushing split train to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split test to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing split valid to the Hub.


Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [108]:
# Create the CSV output directory (and any missing parents); -p makes this a
# no-op when the directory already exists.
!mkdir -p data/ashaar_proc

In [109]:
# Export the splits as train.csv, eval.csv and test.csv under data/ashaar_proc.
save_dataset_csv(splitted_dataset, path = 'data/ashaar_proc')

Creating CSV from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

In [19]:
splitted_dataset['test'][0]

{'text': 'قَمَرِيَّةٌ قُمرِيَّةٌ'}