# Tokenization
    Tokenization is the process of breaking down text into smaller units. 
    
    These smaller units are called tokens.
    


> ![image.png](attachment:bae70f75-98e6-4249-a356-80b670872b1f.png) 

>### 1-Character tokenization.
        Here, the text is segmented into individual characters. 
        It is useful for languages that lack clear word boundaries, or for tasks that require granular analysis, such as spelling correction.
>
>### 2-Word tokenization. 
        This method breaks text down into individual words. 
        It works best for languages with clear word boundaries, such as English.
>  
>### 3-Subword tokenization. 
         This method breaks text into units that may be larger than a single character but smaller than a full word. 
         For instance, "Chatbots" --> "Chat" and "bots".
![image.png](attachment:aee65fd5-fed0-446a-a656-0f28c76769a9.png) 

In [1]:
text = "Tokenization is the process of breaking down text into smaller units"
text_fa = "توکن‌سازی فرآیند شکستن متن به واحدهای کوچکتر است"

## 1- Character Tokenization

In [102]:
# Unpacking a str yields one element per Unicode code point, so the zero-width
# non-joiner (U+200C) inside the Persian sentence becomes a token of its own.
character_tokens = [*text_fa]
print("Tokens:", character_tokens)

Tokens: ['ت', 'و', 'ک', 'ن', '\u200c', 'س', 'ا', 'ز', 'ی', ' ', 'ف', 'ر', 'آ', 'ی', 'ن', 'د', ' ', 'ش', 'ک', 'س', 'ت', 'ن', ' ', 'م', 'ت', 'ن', ' ', 'ب', 'ه', ' ', 'و', 'ا', 'ح', 'د', 'ه', 'ا', 'ی', ' ', 'ک', 'و', 'چ', 'ک', 'ت', 'ر', ' ', 'ا', 'س', 'ت']


In [103]:
# The vocabulary is the set of distinct characters; sorting it makes the
# ordering (and every index derived from it) reproducible between runs.
Vocab = sorted({*character_tokens})
print("Vocab:", Vocab)
print("\nVocab Size:", len(Vocab))

Vocab: [' ', 'آ', 'ا', 'ب', 'ت', 'ح', 'د', 'ر', 'ز', 'س', 'ش', 'ف', 'م', 'ن', 'ه', 'و', 'چ', 'ک', 'ی', '\u200c']

Vocab Size: 20


In [104]:
# Map every distinct character to its rank in the sorted vocabulary.
vocab_mappings = dict(
    (token, idx) for idx, token in enumerate(sorted(set(character_tokens)))
)
print("Token Mappings:", vocab_mappings)

Token Mappings: {' ': 0, 'آ': 1, 'ا': 2, 'ب': 3, 'ت': 4, 'ح': 5, 'د': 6, 'ر': 7, 'ز': 8, 'س': 9, 'ش': 10, 'ف': 11, 'م': 12, 'ن': 13, 'ه': 14, 'و': 15, 'چ': 16, 'ک': 17, 'ی': 18, '\u200c': 19}


In [105]:
# Encode the character sequence as integer ids by looking each token up in
# the token -> index table.
vocab_indices = list(map(vocab_mappings.__getitem__, character_tokens))
print("Token Indices:", vocab_indices)

Token Indices: [4, 15, 17, 13, 19, 9, 2, 8, 18, 0, 11, 7, 1, 18, 13, 6, 0, 10, 17, 9, 4, 13, 0, 12, 4, 13, 0, 3, 14, 0, 15, 2, 5, 6, 14, 2, 18, 0, 17, 15, 16, 17, 4, 7, 0, 2, 9, 4]


## 2- Word Tokenization

In [106]:
# Word tokenization: str.split() with no arguments splits on runs of whitespace.
word_tokens = text.split()
print("tokens:", word_tokens)

# Distinct words, in sorted order, form the vocabulary.
Vocab = sorted(set(word_tokens))
print("\nVocab:", Vocab)
print("\nVocab Size:", len(Vocab))

# Vocab is already sorted and de-duplicated, so each word's id is its position.
vocab_mappings = dict(zip(Vocab, range(len(Vocab))))
print("\nToken Mappings:", vocab_mappings)

# Encode the word sequence as vocabulary indices.
vocab_indices = [vocab_mappings[word] for word in word_tokens]
print("\nToken Indices:", vocab_indices)

tokens: ['Tokenization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'text', 'into', 'smaller', 'units']

Vocab: ['Tokenization', 'breaking', 'down', 'into', 'is', 'of', 'process', 'smaller', 'text', 'the', 'units']

Vocab Size: 11

Token Mappings: {'Tokenization': 0, 'breaking': 1, 'down': 2, 'into': 3, 'is': 4, 'of': 5, 'process': 6, 'smaller': 7, 'text': 8, 'the': 9, 'units': 10}

Token Indices: [0, 4, 9, 6, 5, 1, 2, 8, 3, 7, 10]


In [4]:
# Word tokenization of the Persian sentence: split on runs of whitespace.
word_tokens = text_fa.split()

# Distinct words, in sorted order, form the vocabulary.
Vocab = sorted(set(word_tokens))
print("\nVocab:", Vocab)
print("\nVocab Size:", len(Vocab))

# Vocab is already sorted and de-duplicated, so each word's id is its position.
vocab_mappings = dict(zip(Vocab, range(len(Vocab))))
print("\nToken Mappings:", vocab_mappings)

# Encode the word sequence as vocabulary indices.
vocab_indices = [vocab_mappings[word] for word in word_tokens]
print("\nToken Indices:", vocab_indices)


Vocab: ['است', 'به', 'توکن\u200cسازی', 'شکستن', 'فرآیند', 'متن', 'واحدهای', 'کوچکتر']

Vocab Size: 8

Token Mappings: {'است': 0, 'به': 1, 'توکن\u200cسازی': 2, 'شکستن': 3, 'فرآیند': 4, 'متن': 5, 'واحدهای': 6, 'کوچکتر': 7}

Token Indices: [2, 4, 3, 5, 1, 6, 7, 0]


## 3-Sub-Word Tokenization
    There are three main implementations:
    
- 1. Byte-Pair Encoding Tokenization

- 2. Word-Piece Tokenization

- 3. Unigram Tokenization

In [5]:
# ! pip install transformers

https://huggingface.co/docs/transformers/v4.45.1/en/model_doc/auto#transformers.AutoTokenizer

In [107]:
from transformers import AutoTokenizer
bert_tokenizer   = AutoTokenizer.from_pretrained("bert-base-uncased") # WordPiece 
gpt_tokenizer    = AutoTokenizer.from_pretrained("gpt2") # Byte-Pair Encoding (BPE)
xlnet_tokenizer  = AutoTokenizer.from_pretrained("xlnet-base-cased") # Unigram

In [109]:
bert_tokenizer.tokenize('testing tokenization')

['testing', 'token', '##ization']

In [44]:
bert_tokenizer('testing tokenization', return_tensors="pt")

{'input_ids': tensor([[  101,  5604, 19204,  3989,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [49]:
bert_tokenizer.encode('testing tokenization')

[101, 5604, 19204, 3989, 102]

In [50]:
bert_tokenizer.decode([101, 5604, 19204, 3989, 102])

'[CLS] testing tokenization [SEP]'

In [52]:
bert_tokenizer.vocab_size

30522

In [8]:
gpt_tokenizer.vocab_size

50257

In [9]:
xlnet_tokenizer.vocab_size

32000

In [10]:
# bert_tokenizer.vocab

In [45]:
bert_tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [111]:
# bert_tokenizer.vocab

In [47]:
text = "Tokenization is the process of breaking down text into smaller units"
text_fa = "توکن‌سازی فرآیند شکستن متن به واحدهای کوچکتر است"

- ## 3.1- WordPiece Tokenization

In [13]:
bert_tokens = bert_tokenizer.tokenize(text)
print(bert_tokens) 

['token', '##ization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'text', 'into', 'smaller', 'units']


In [14]:
inputs = bert_tokenizer(text, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  101, 19204,  3989,  2003,  1996,  2832,  1997,  4911,  2091,  3793,
          2046,  3760,  3197,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [15]:
input_ids = inputs['input_ids']  # Token IDs
print(input_ids)
# BERT adds automatically:
# 101 is the special [CLS] token  
# 102 is the [SEP] token  

tensor([[  101, 19204,  3989,  2003,  1996,  2832,  1997,  4911,  2091,  3793,
          2046,  3760,  3197,   102]])


In [16]:
print(bert_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['[CLS]', 'token', '##ization', 'is', 'the', 'process', 'of', 'breaking', 'down', 'text', 'into', 'smaller', 'units', '[SEP]']


In [17]:
bert_tokens_fa = bert_tokenizer.tokenize(text_fa)
print('Token: ',bert_tokens_fa) 
inputs = bert_tokenizer(text_fa, return_tensors="pt")
input_ids = inputs['input_ids']  # Token IDs
print('\nIDs : ',input_ids)

Token:  ['ت', '##و', '##ک', '##ن', '##س', '##ا', '##ز', '##ی', 'ف', '##ر', '##ا', '##ی', '##ن', '##د', 'ش', '##ک', '##س', '##ت', '##ن', 'م', '##ت', '##ن', 'ب', '##ه', 'و', '##ا', '##ح', '##د', '##ه', '##ا', '##ی', 'ک', '##و', '##چ', '##ک', '##ت', '##ر', 'ا', '##س', '##ت']

IDs :  tensor([[  101,  1273, 29836, 29841, 15915, 29824, 25573, 29823, 24830,  1291,
         17149, 25573, 24830, 15915, 15394,  1283, 29841, 29824, 29817, 15915,
          1295, 29817, 15915,  1271, 14157,  1298, 25573, 29820, 15394, 14157,
         25573, 24830,  1304, 29836, 29840, 29841, 29817, 17149,  1270, 29824,
         29817,   102]])


In [18]:
print(bert_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['[CLS]', 'ت', '##و', '##ک', '##ن', '##س', '##ا', '##ز', '##ی', 'ف', '##ر', '##ا', '##ی', '##ن', '##د', 'ش', '##ک', '##س', '##ت', '##ن', 'م', '##ت', '##ن', 'ب', '##ه', 'و', '##ا', '##ح', '##د', '##ه', '##ا', '##ی', 'ک', '##و', '##چ', '##ک', '##ت', '##ر', 'ا', '##س', '##ت', '[SEP]']


- ## 3.2- Byte-Pair Encoding (BPE) Tokenization

In [19]:
gpt_token = gpt_tokenizer.tokenize(text)
print('Token: ',gpt_token) 
inputs = gpt_tokenizer(text, return_tensors="pt")
input_ids = inputs['input_ids']  # Token IDs
print('\nIDs : ',input_ids)

Token:  ['Token', 'ization', 'Ġis', 'Ġthe', 'Ġprocess', 'Ġof', 'Ġbreaking', 'Ġdown', 'Ġtext', 'Ġinto', 'Ġsmaller', 'Ġunits']

IDs :  tensor([[30642,  1634,   318,   262,  1429,   286,  7163,   866,  2420,   656,
          4833,  4991]])


In [20]:
print(gpt_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['Token', 'ization', 'Ġis', 'Ġthe', 'Ġprocess', 'Ġof', 'Ġbreaking', 'Ġdown', 'Ġtext', 'Ġinto', 'Ġsmaller', 'Ġunits']


In [21]:
gpt_token = gpt_tokenizer.tokenize(text_fa)
print('Token: ',gpt_token)
inputs = gpt_tokenizer(text_fa, return_tensors="pt")
input_ids = inputs['input_ids']  # Token IDs
print('\nIDs : ',input_ids)

Token:  ['Øª', 'ÙĪ', 'Ú', '©', 'ÙĨ', 'âĢ', 'Į', 'Ø³', 'Ø§Ø', '²', 'Û', 'Į', 'ĠÙ', 'ģ', 'Ø±', 'Ø', '¢', 'Û', 'Į', 'ÙĨ', 'Ø¯', 'ĠØ', '´', 'Ú', '©', 'Ø³', 'Øª', 'ÙĨ', 'ĠÙħ', 'Øª', 'ÙĨ', 'ĠØ', '¨', 'Ùĩ', 'ĠÙĪ', 'Ø§Ø', 'Ń', 'Ø¯', 'Ùĩ', 'Ø§', 'Û', 'Į', 'Ġ', 'Ú', '©', 'ÙĪ', 'Ú', 'Ĩ', 'Ú', '©', 'Øª', 'Ø±', 'Ġ', 'Ø§Ø', '³', 'Øª']

IDs :  tensor([[41486, 30335,   150,   102, 23338,   447,   234, 45692, 34247,   110,
           151,   234, 18923,   223, 26897,   148,    95,   151,   234, 23338,
         38843, 17550,   112,   150,   102, 45692, 41486, 23338, 47048, 41486,
         23338, 17550,   101, 29519, 42092, 34247,   255, 38843, 29519, 12919,
           151,   234,   220,   150,   102, 30335,   150,   228,   150,   102,
         41486, 26897,   220, 34247,   111, 41486]])


In [22]:
print(gpt_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['Øª', 'ÙĪ', 'Ú', '©', 'ÙĨ', 'âĢ', 'Į', 'Ø³', 'Ø§Ø', '²', 'Û', 'Į', 'ĠÙ', 'ģ', 'Ø±', 'Ø', '¢', 'Û', 'Į', 'ÙĨ', 'Ø¯', 'ĠØ', '´', 'Ú', '©', 'Ø³', 'Øª', 'ÙĨ', 'ĠÙħ', 'Øª', 'ÙĨ', 'ĠØ', '¨', 'Ùĩ', 'ĠÙĪ', 'Ø§Ø', 'Ń', 'Ø¯', 'Ùĩ', 'Ø§', 'Û', 'Į', 'Ġ', 'Ú', '©', 'ÙĪ', 'Ú', 'Ĩ', 'Ú', '©', 'Øª', 'Ø±', 'Ġ', 'Ø§Ø', '³', 'Øª']


- ## 3.3- Unigram Tokenization

In [23]:
xlnet_token = xlnet_tokenizer.tokenize(text)
print('Token: ',xlnet_token)
inputs = xlnet_tokenizer(text, return_tensors="pt")
input_ids = inputs['input_ids']  # Token IDs
print('\nIDs : ',input_ids)

Token:  ['▁To', 'ken', 'ization', '▁is', '▁the', '▁process', '▁of', '▁breaking', '▁down', '▁text', '▁into', '▁smaller', '▁units']

IDs :  tensor([[ 324, 4190, 1822,   27,   18,  465,   20, 4610,  151, 1758,   91, 2171,
         2043,    4,    3]])


In [24]:
print(xlnet_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['▁To', 'ken', 'ization', '▁is', '▁the', '▁process', '▁of', '▁breaking', '▁down', '▁text', '▁into', '▁smaller', '▁units', '<sep>', '<cls>']


In [25]:
xlnet_token = xlnet_tokenizer.tokenize(text_fa)
print('Token: ',xlnet_token)
inputs = xlnet_tokenizer(text_fa, return_tensors="pt")
input_ids = inputs['input_ids']  # Token IDs
print('\nIDs : ',input_ids)

Token:  ['▁', 'توکن', '▁', 'س', 'ا', 'زی', '▁', 'فر', 'ا', 'یند', '▁', 'شکستن', '▁', 'متن', '▁', 'به', '▁', 'و', 'ا', 'حده', 'ا', 'ی', '▁', 'کوچکتر', '▁', 'ا', 'ست']

IDs :  tensor([[   17,     0,    17,     0, 23200,     0,    17,     0, 23200,     0,
            17,     0,    17,     0,    17,     0,    17,     0, 23200,     0,
         23200,     0,    17,     0,    17, 23200,     0,     4,     3]])


In [26]:
print(xlnet_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['▁', '<unk>', '▁', '<unk>', 'ا', '<unk>', '▁', '<unk>', 'ا', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', 'ا', '<unk>', 'ا', '<unk>', '▁', '<unk>', '▁', 'ا', '<unk>', '<sep>', '<cls>']


## Great Farsi Tokenizer

In [46]:
from transformers import   AutoTokenizer
bert_farsi_tokenizer = AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-zwnj-base')

 #### ParsBERT : https://github.com/hooshvare/parsbert

In [28]:
bertfa__token = bert_farsi_tokenizer.tokenize(text_fa)
print('Token: ',bertfa__token)

inputs = bert_farsi_tokenizer(text_fa, return_tensors="pt")
input_ids = inputs['input_ids']  # Token IDs
print('\nIDs : ',input_ids)

Token:  ['توکن', '[ZWNJ]', 'سازی', '[UNK]', 'شکستن', 'متن', 'به', 'واحدهای', 'کوچکتر', 'است']

IDs :  tensor([[    2, 12762,     9,  2606,     1, 10054,  3092,  1923,  6136,  7673,
          1933,     3]])


In [29]:
print(bert_farsi_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['[CLS]', 'توکن', '[ZWNJ]', 'سازی', '[UNK]', 'شکستن', 'متن', 'به', 'واحدهای', 'کوچکتر', 'است', '[SEP]']


In [30]:
bertfa__token = bert_farsi_tokenizer.tokenize(text)
print('Token: ',bertfa__token)
inputs = bert_farsi_tokenizer(text, return_tensors="pt")
input_ids = inputs['input_ids']  # Token IDs
print('\nIDs : ',input_ids)

Token:  ['Tok', '##en', '##ization', 'is', 'the', 'process', 'of', 'break', '##ing', 'down', 'text', 'into', 'sm', '##alle', '##r', 'un', '##its']

IDs :  tensor([[    2, 28962,  2834, 16425,  9122,  5166, 26353,  5025, 40402,  3600,
         26871, 27267, 31370, 24516, 36487,  1139, 16760, 20174,     3]])


In [31]:
print(bert_farsi_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['[CLS]', 'Tok', '##en', '##ization', 'is', 'the', 'process', 'of', 'break', '##ing', 'down', 'text', 'into', 'sm', '##alle', '##r', 'un', '##its', '[SEP]']


In [53]:
print(bert_farsi_tokenizer.encode(text))

[2, 28962, 2834, 16425, 9122, 5166, 26353, 5025, 40402, 3600, 26871, 27267, 31370, 24516, 36487, 1139, 16760, 20174, 3]


In [54]:
print(bert_farsi_tokenizer.decode([2, 28962, 2834, 16425, 9122, 5166, 26353, 5025, 40402, 3600, 26871, 27267, 31370, 24516, 36487, 1139, 16760, 20174, 3]))

[CLS] Tokenization is the process of breaking down text into smaller units [SEP]


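 ## Final Note

Token IDs are only meaningful to the tokenizer that produced them. In the cells below, the IDs produced by the XLNet tokenizer are decoded with each of the other tokenizers, which yields unrelated tokens; only the XLNet tokenizer itself recovers the original text.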

In [32]:
xlnet_token = xlnet_tokenizer.tokenize(text)
print('Token: ',xlnet_token)
inputs = xlnet_tokenizer(text, return_tensors="pt")
input_ids = inputs['input_ids']  # Token IDs
print('\nIDs : ',input_ids)

Token:  ['▁To', 'ken', 'ization', '▁is', '▁the', '▁process', '▁of', '▁breaking', '▁down', '▁text', '▁into', '▁smaller', '▁units']

IDs :  tensor([[ 324, 4190, 1822,   27,   18,  465,   20, 4610,  151, 1758,   91, 2171,
         2043,    4,    3]])


In [33]:
print(bert_farsi_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['ˀ', 'مدتی', '##∴', '[U18]', '[U9]', 'ћ', '[U11]', 'ماج', 'L', '##း', '[U82]', 'هستند', 'کنند', '[MASK]', '[SEP]']


In [34]:
print(bert_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['[unused319]', 'leg', '定', '[unused26]', '[unused17]', '[unused460]', '[unused19]', 'economy', '[unused146]', '仁', '[unused90]', 'name', 'when', '[unused3]', '[unused2]']


In [35]:
print(gpt_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['ad', 'Ġhair', 'Ġarg', '<', '3', 'Ġhis', '5', 'Ġsolution', 'Û', 'ape', '|', 'ills', 'IT', '%', '$']


In [36]:
print(xlnet_tokenizer.convert_ids_to_tokens(input_ids.squeeze()))

['▁To', 'ken', 'ization', '▁is', '▁the', '▁process', '▁of', '▁breaking', '▁down', '▁text', '▁into', '▁smaller', '▁units', '<sep>', '<cls>']


# Using in Dataset

In [None]:
# Dataset Class
import torch
# DataLoader is imported here as well because the batching cell that follows
# this one relies on it.
from torch.utils.data import DataLoader, Dataset


class WordDataset(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once in ``__init__``. Item ``i`` is the pair
    ``(ids[i : i + seq_length], ids[i + 1 : i + seq_length + 1])``, i.e. the
    target is the input window shifted right by one token.

    Args:
        text: Raw text to tokenize.
        seq_length: Number of token ids in every input/target window
            (must be at least 1).
        tokenizer: Callable that, given ``text``, returns a mapping with an
            ``'input_ids'`` sequence, and that exposes ``get_vocab()``.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, text, seq_length, tokenizer):
        if seq_length < 1:
            raise ValueError(f"seq_length must be >= 1, got {seq_length}")

        self.text = text
        self.seq_length = seq_length
        self.tokenizer = tokenizer

        # Tokenize and convert to token IDs.
        # NOTE(review): the tokenizer is called with its default arguments, so
        # any special tokens it adds (e.g. [CLS]/[SEP]) become part of the id
        # stream -- confirm that this is intended.
        self.tokens = self.tokenizer(text)['input_ids']

        # Use the tokenizer's vocabulary directly
        self.vocab_size = len(self.tokenizer.get_vocab())

        # Prepare data as a sequence of indices
        self.data = self.tokens

    def __len__(self):
        """Return the number of full (input, target) windows; never negative."""
        # A text with no more than seq_length tokens has no complete window.
        # Clamping keeps len() from raising on a negative value.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(input, target)`` pair as ``torch.long`` tensors.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``idx`` is out of range. Raising here is what lets
                plain iteration (``for item in dataset``) terminate, because
                the class defines ``__getitem__`` but not ``__iter__``.
        """
        size = len(self)
        if idx < 0:
            idx = idx + size
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for dataset of size {size}")

        stop = idx + self.seq_length
        seq = self.data[idx:stop]
        target = self.data[idx + 1:stop + 1]
        return torch.tensor(seq, dtype=torch.long), torch.tensor(target, dtype=torch.long)

In [73]:
# Persian tokenizer used to turn the corpus into token ids
tokenizer = AutoTokenizer.from_pretrained('HooshvareLab/bert-fa-zwnj-base')

# Read the whole corpus into memory as a single string
with open('ali_karimi.txt', encoding='utf-8') as f:
    ali_karimi = f.read()

# Window length (in tokens) and number of windows per batch
seq_length, batch_size = 100, 64

# Build the sliding-window dataset and batch it in its original order
dataset = WordDataset(text=ali_karimi, seq_length=seq_length, tokenizer=tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)


Token indices sequence length is longer than the specified maximum sequence length for this model (14657 > 512). Running this sequence through the model will result in indexing errors


In [86]:
x,y = next(iter(dataloader))
x

tensor([[    2,  3305, 12176,  ...,  1114,   595, 11537],
        [ 3305, 12176,   115,  ...,   595, 11537,   223],
        [12176,   115,  3906,  ..., 11537,   223,   166],
        ...,
        [ 6096,   623,   595,  ...,  9993,   592,  1923],
        [  623,   595,  7996,  ...,   592,  1923,  2179],
        [  595,  7996,  1916,  ...,  1923,  2179,   209]])

In [92]:
item0 = (x[0]) 
item0

tensor([    2,  3305, 12176,   115,  3906,     9,  1983,  1923,  2118, 10420,
        12176, 16065,  2235,   592,   166,   694,   168,  3898,   595,  2284,
        10068,   116,  4801,     1,  3500,  4409,  2141,   623,  1925,  8622,
         4435,  2765,  3017,  1933,   121,  2025,  1921,  3255,  3500,  3699,
            9,  1970,  3135,     1,  1969,  1925,  1996, 16237,  1980, 11309,
          590,     1,  2183,  1921,  4518,     9,  1941,  2141,   590,  7018,
          590,  6096,   623,   595,  7996,  1916,  1937,  2039,   121, 12176,
         1932,  1923,     9,  2179,  1961,  2183,     9,  2109, 32820,     9,
         3152,  3303,  1924,     9,  1993,   590,  1927,  7301,   209, 17393,
          223,   166,   695,   168,   623,   209, 20059,  1114,   595, 11537])

In [93]:
print(tokenizer.convert_ids_to_tokens(item0))

['[CLS]', 'علی', 'کریمی', '(', 'زاده', '[ZWNJ]', 'شده', 'به', 'نام', 'محمدعلی', 'کریمی', 'پاشا', '##کی', '؛', '[', '۳', ']', '۱۷', 'آ', '##بان', '۱۳۵۷', ')', 'بازیکن', '[UNK]', 'فوتبال', 'اهل', 'ایران', 'و', 'از', 'مخالفان', 'نظام', 'جمهوری', 'اسلامی', 'است', '.', 'او', 'در', 'دوران', 'فوتبال', 'حرفه', '[ZWNJ]', 'ای', '۱۸', '[UNK]', 'خود', 'از', 'سال', '۱۳۷۵', 'تا', '۱۳۹۳', '،', '[UNK]', 'بازی', 'در', 'لیگ', '[ZWNJ]', 'های', 'ایران', '،', 'امارات', '،', 'قطر', 'و', 'آ', '##لم', '##ان', 'را', 'دارد', '.', 'کریمی', 'که', 'به', '[ZWNJ]', 'عنوان', 'یک', 'بازی', '[ZWNJ]', 'ساز', 'دریبل', '[ZWNJ]', 'زن', 'شناخته', 'می', '[ZWNJ]', 'شود', '،', 'با', 'لقب', '«', 'جادوگر', '»', '[', '۴', ']', 'و', '«', 'مارادونا', '##ی', 'آ', '##سیا']
