# Persian Model 

## Import

In [16]:
#! pip install tokenizers scikit-learn --user 
#! pip install hazm --user 
#! pip install tiktoken --user 
#! pip install transformers --user
%load_ext autoreload
%autoreload 2

from tokenizers import Tokenizer
from tokenizers import trainers
from tokenizers.normalizers import StripAccents, Lowercase, Sequence
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer, UnigramTrainer
from tokenizers.models import BPE, Unigram
from transformers import  AutoTokenizer  #pipeline, GPT2LMHeadModel

from hazm import * 
import re

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Using device: cuda


## Main Part

In [5]:
from src.helper import get_cleaned_text, clean_pers_text_replace

# Input text file and output folder referenced by the training cells.
text_path = "content/fas_news_2020_100K/fas_news_2020_100K-sentences.txt"
path_to_save_folder = "model/train_data_pers"

# Read the file at `text_path` and clean it with `clean_pers_text_replace`;
# the training cells pass the resulting `raw_text` to the trainer.
raw_text = get_cleaned_text(
    text_path,
    clean_pers_text_replace,
)
# Disabled experiment, kept for reference:
# enc_text = tokenizer.encode(raw_text)



In:  100000  lines seperators replaced
Total lines replaced 95195
Total lines replaced 40842
Total lines replaced 1591


In [14]:
from src.dataset import GPTDataset, create_dataloader

# Tokenizer loaded by its Hugging Face model id.
tokenizer = AutoTokenizer.from_pretrained('bolbolzaban/gpt2-persian')

# Hyperparameters; the model-construction cells read vocab_size, embedding_dim
# and context_length from here.
batch_size = 128
context_length = 32  # context size used for training
# Hard-coded vocabulary size (previously tokenizer.n_vocab).
# NOTE(review): confirm 30000 covers every token id this tokenizer can emit.
vocab_size = 30000
embedding_dim = 128

# Build the train / dev / test loaders from the cleaned text.
(
    train_dataloader,
    dev_dataloader,
    test_dataloader,
) = create_dataloader(
    raw_text,
    tokenizer=tokenizer,
    allowed_special=False,
    batch_size=batch_size,
    context_length=context_length,
    shuffle=True,
)

 Create Dataset 2720000 / 2733504

## Training

In [23]:
from src.model import RegularizedLanguageModel
from src.trainComplete import TrainComplete

# Trainer configured with the input file, the output folder and the shared tokenizer.
trainclass = TrainComplete(
    text_path=text_path,
    path_to_save_folder=path_to_save_folder,
    tokenizer=tokenizer,
    allowed_special=False,
)

# Baseline model sized by the notebook-level hyperparameters.
model = RegularizedLanguageModel(
    vocab_size, embedding_dim, context_length, dropout=0.2
).to(device)

# Hand the trainer the same embedding_dim / context_length the model was built
# with instead of repeating them as literals, so the two cannot drift apart when
# a hyperparameter is changed. The batch size stays explicit because it is
# encoded in the run name.
trainclass.train(
    model,
    vocab_size,
    device,
    raw_text,
    "pers_standardLinear_ep4_batchsize64",
    print_every=75,
    evaluate_every=10000,
    optimizer=None,
    criterion=None,
    batch_size=64,
    embedding_dim=embedding_dim,
    context_length=context_length,
    num_epochs=4,
)

 Create Dataset 2720000 / 2733504Epoch [1/4], Step [0/34168], Loss: 10.7367
Validation perplexity: 41141.40421065184
Epoch [1/4], Step [75/34168], Loss: 8.7246
Epoch [1/4], Step [150/34168], Loss: 7.3658
Epoch [1/4], Step [225/34168], Loss: 6.9710
Epoch [1/4], Step [300/34168], Loss: 6.9824
Epoch [1/4], Step [375/34168], Loss: 6.8438
Epoch [1/4], Step [450/34168], Loss: 6.6495
Epoch [1/4], Step [525/34168], Loss: 6.7453
Epoch [1/4], Step [600/34168], Loss: 6.5350
Epoch [1/4], Step [675/34168], Loss: 6.5044
Epoch [1/4], Step [750/34168], Loss: 6.4446
Epoch [1/4], Step [825/34168], Loss: 6.2616
Epoch [1/4], Step [900/34168], Loss: 6.2418
Epoch [1/4], Step [975/34168], Loss: 6.2203
Epoch [1/4], Step [1050/34168], Loss: 6.5285
Epoch [1/4], Step [1125/34168], Loss: 6.3570
Epoch [1/4], Step [1200/34168], Loss: 6.2309
Epoch [1/4], Step [1275/34168], Loss: 6.3496
Epoch [1/4], Step [1350/34168], Loss: 6.2279
Epoch [1/4], Step [1425/34168], Loss: 6.1301
Epoch [1/4], Step [1500/34168], Loss: 6.13

## Generate Text 

In [None]:
from src.model import generate_text

# Starting text handed to generate_text for every sample.
start_text = " من در راه"

# Print ten generations from the current `model` for the same starting text.
# NOTE(review): context_length=20 differs from the 32 used when building and
# training the models in this notebook — confirm generate_text expects this value.
for sample_idx in range(10):
    generated_text = generate_text(
        model,
        tokenizer,
        start_text,
        device=device,
        context_length=20,
    )
    print(generated_text)

# More Training Cells 

##  Training

### Evening Training RUN

In [None]:
from src.model import RegularizedLanguageModel
from src.trainComplete import TrainComplete
from src.helper import clean_pers_text_replace, get_cleaned_text, clean_pers_remove, clean_text_pers_both

# Clean the text with `clean_pers_remove` under a dedicated name. Rebinding the
# shared `raw_text` here would leave every later cell that reads `raw_text`
# training on this variant without its run name saying so.
raw_text_remove = get_cleaned_text(text_path, clean_pers_remove)

# Trainer configured with the input file, the output folder and the shared tokenizer.
trainclass = TrainComplete(
    text_path=text_path,
    path_to_save_folder=path_to_save_folder,
    tokenizer=tokenizer,
    allowed_special=False,
)

# Baseline model sized by the notebook-level hyperparameters.
model = RegularizedLanguageModel(
    vocab_size, embedding_dim, context_length, dropout=0.2
).to(device)

# embedding_dim / context_length are passed as the same variables the model was
# built with (rather than repeated literals) so they cannot drift apart.
trainclass.train(
    model,
    vocab_size,
    device,
    raw_text_remove,
    "pers_standardLinearNotRelu_ep4_evaluate10000_preprocessingRemove",
    print_every=75,
    evaluate_every=10000,
    optimizer=None,
    criterion=None,
    batch_size=32,
    embedding_dim=embedding_dim,
    context_length=context_length,
    num_epochs=4,
)

In [None]:
from src.model import RegularizedLanguageModel
from src.trainComplete import TrainComplete
from src.helper import clean_pers_text_replace, get_cleaned_text, clean_pers_remove, clean_text_pers_both

# Clean the text with `clean_text_pers_both` under a dedicated name, so the
# shared `raw_text` is never left pointing at this variant if training is
# interrupted part-way through the cell.
raw_text_both = get_cleaned_text(text_path, clean_text_pers_both)

# Trainer configured with the input file, the output folder and the shared tokenizer.
trainclass = TrainComplete(
    text_path=text_path,
    path_to_save_folder=path_to_save_folder,
    tokenizer=tokenizer,
    allowed_special=False,
)

# Baseline model sized by the notebook-level hyperparameters.
model = RegularizedLanguageModel(
    vocab_size, embedding_dim, context_length, dropout=0.2
).to(device)

# embedding_dim / context_length are passed as the same variables the model was
# built with (rather than repeated literals) so they cannot drift apart.
trainclass.train(
    model,
    vocab_size,
    device,
    raw_text_both,
    "pers_standardLinearNotRelu_ep4_evaluate10000_preprocessingBoth",
    print_every=75,
    evaluate_every=10000,
    optimizer=None,
    criterion=None,
    batch_size=32,
    embedding_dim=embedding_dim,
    context_length=context_length,
    num_epochs=4,
)

# Re-establish the `clean_pers_text_replace` text as `raw_text` for the cells
# that follow, in case a previously executed cell rebound it to another variant.
raw_text = get_cleaned_text(text_path, clean_pers_text_replace)

### Current Training Run

In [None]:
from src.trainComplete import TrainComplete
from src.attentionModel import LanguageModelWithAttention

# Hyperparameters for this attention run.
context_length = 32
embedding_dim = 128
attention_dim = 64
hidden_dim = 64
num_heads = 4

# Trainer created with is_attention_training=True for the attention model.
trainclass = TrainComplete(
    text_path=text_path,
    path_to_save_folder=path_to_save_folder,
    tokenizer=tokenizer,
    allowed_special=False,
    is_attention_training=True,
)

# Build the attention model, then move it to the selected device.
model = LanguageModelWithAttention(
    vocab_size,
    embedding_dim,
    attention_dim,
    context_length,
    hidden_dim,
    num_heads,
    dropout=0.2,
)
model = model.to(device)

# 10 epochs at batch size 32; validation is requested every 10000 steps.
trainclass.train(
    model,
    vocab_size,
    device,
    raw_text,
    "pers_attention_standard_dropout_ep10_eval10000",
    print_every=75,
    evaluate_every=10000,
    optimizer=None,
    criterion=None,
    batch_size=32,
    embedding_dim=embedding_dim,
    context_length=context_length,
    num_epochs=10,
)

 Create Dataset 2720000 / 2733504 2200000 / 2733504Started Training
Epoch [1/10], Step [0/68337], Loss: 10.2985
Validation perplexity: 29479.28083287785
Epoch [1/10], Step [75/68337], Loss: 7.2926
Epoch [1/10], Step [150/68337], Loss: 7.2566
Epoch [1/10], Step [225/68337], Loss: 7.1984
Epoch [1/10], Step [300/68337], Loss: 7.0778
Epoch [1/10], Step [375/68337], Loss: 7.0588
Epoch [1/10], Step [450/68337], Loss: 7.0533
Epoch [1/10], Step [525/68337], Loss: 7.0211
Epoch [1/10], Step [600/68337], Loss: 6.9933
Epoch [1/10], Step [675/68337], Loss: 7.1365
Epoch [1/10], Step [750/68337], Loss: 6.8697
Epoch [1/10], Step [825/68337], Loss: 6.8120
Epoch [1/10], Step [900/68337], Loss: 6.8621
Epoch [1/10], Step [975/68337], Loss: 6.9619
Epoch [1/10], Step [1050/68337], Loss: 6.8166
Epoch [1/10], Step [1125/68337], Loss: 6.7020
Epoch [1/10], Step [1200/68337], Loss: 6.5050
Epoch [1/10], Step [1275/68337], Loss: 6.8385
Epoch [1/10], Step [1350/68337], Loss: 6.6040
Epoch [1/10], Step [1425/68337], 

In [None]:
from src.trainComplete import TrainComplete
from src.attentionModel import LanguageModelWithAttention

# Hyperparameters for this attention run.
context_length = 32
embedding_dim = 128
attention_dim = 64
hidden_dim = 64
num_heads = 4

# Trainer created with is_attention_training=True for the attention model.
trainclass = TrainComplete(
    text_path=text_path,
    path_to_save_folder=path_to_save_folder,
    tokenizer=tokenizer,
    allowed_special=False,
    is_attention_training=True,
)

# Build the attention model, then move it to the selected device.
model = LanguageModelWithAttention(
    vocab_size,
    embedding_dim,
    attention_dim,
    context_length,
    hidden_dim,
    num_heads,
    dropout=0.2,
)
model = model.to(device)

# 5 epochs at batch size 16; validation is requested every 10000 steps.
trainclass.train(
    model,
    vocab_size,
    device,
    raw_text,
    "pers_attention_standard_dropout_batchsize16_ep5_eval10000",
    print_every=75,
    evaluate_every=10000,
    optimizer=None,
    criterion=None,
    batch_size=16,
    embedding_dim=embedding_dim,
    context_length=context_length,
    num_epochs=5,
)

In [None]:
from src.trainComplete import TrainComplete
from src.attentionModel import LanguageModelWithAttention

# Hyperparameters for this attention run.
context_length = 32
embedding_dim = 128
attention_dim = 64
hidden_dim = 64
num_heads = 4

# Trainer created with is_attention_training=True for the attention model.
trainclass = TrainComplete(
    text_path=text_path,
    path_to_save_folder=path_to_save_folder,
    tokenizer=tokenizer,
    allowed_special=False,
    is_attention_training=True,
)

# Build the attention model, then move it to the selected device.
model = LanguageModelWithAttention(
    vocab_size,
    embedding_dim,
    attention_dim,
    context_length,
    hidden_dim,
    num_heads,
    dropout=0.2,
)
model = model.to(device)

# 5 epochs at batch size 64; validation is requested every 10000 steps.
trainclass.train(
    model,
    vocab_size,
    device,
    raw_text,
    "pers_attention_standard_dropout_batchsize64_ep5_eval10000",
    print_every=75,
    evaluate_every=10000,
    optimizer=None,
    criterion=None,
    batch_size=64,
    embedding_dim=embedding_dim,
    context_length=context_length,
    num_epochs=5,
)