### This notebook runs everything in one go: train the model, save it wherever you want, resume training, view the predictions, etc.


In [1]:
# Imported here because this cell runs before the notebook's import cell.
import torch

# Device string handed to the model, the training call and the sampling call.
# Use the first CUDA device when one is usable; otherwise fall back to the CPU
# so the notebook still runs on machines without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [2]:
from baseline_preprocess_input import *
from models import *
from train import *


In [3]:
# Read the whole corpus into memory as a single string.
with open('data/sme-freecorpus.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Individual symbols to strip. Several of them (e.g. 'π', '½', '文') count as
# word characters for the regex engine, so the punctuation pass further down
# would not remove them; the tab is whitespace and would survive it as well.
special_chars = (
    "¶", '•', '□', '§', '\uf03d', 'π', '●', 'µ', 'º',
    '文', '中', '⅞', '½', '⅓', '¾', '¹', '³', '\t',
)
# One deletion table instead of a chain of .replace() calls.
text = text.translate(str.maketrans('', '', ''.join(special_chars)))

# Drop ASCII digit runs.
text = re.sub(r'[0-9]+', '', text)

# Drop Russian letters (the corpus contains some Russian text).
# NOTE(review): the range А-Яа-я does not cover Ё/ё; left unchanged here so
# the resulting vocabulary stays the same.
text = re.sub(r"[А-Яа-я]", '', text)

# Drop punctuation: everything that is neither a word character nor whitespace.
text = re.sub(r"[^\w\s]", "", text)

# Build the vocabulary and the two lookup tables:
#   int2char: integer index -> character
#   char2int: character -> integer index
chars = tuple(sorted(set(text)))
int2char = {index: symbol for index, symbol in enumerate(chars)}
char2int = {symbol: index for index, symbol in enumerate(chars)}

# Encode the cleaned text as an array of vocabulary indices.
encoded = np.array([char2int[symbol] for symbol in text])

In [4]:
# --- network architecture ---
n_hidden = 756
n_layers = 3
emb_dim = 128
bidirectional = False
use_embeddings = True
is_gru = False

# --- default values ---
# NOTE(review): neither `drop_prob` nor `lr` is passed to LSTM(...) below.
drop_prob = 0.5
lr = 0.001

# Load one of the models from models.py.
model = LSTM(
    chars,
    device,
    is_gru,
    bidirectional,
    use_embeddings,
    emb_dim,
    n_hidden,
    n_layers,
)
print(model)

LSTM(
  (emb): Embedding(224, 128)
  (rnn): LSTM(128, 756, num_layers=3, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=756, out_features=224, bias=True)
)


In [5]:
# Batching and schedule for this run.
batch_size = 64
seq_length = 300
n_epochs = 1  # kept small because this is only a test run

# Train the model on the encoded corpus, starting from scratch
# (resume_from_saved=False).
train_and_save(
    model,
    encoded,
    device,
    model_name='lstm_test.pt',
    epochs=n_epochs,
    batch_size=batch_size,
    seq_length=seq_length,
    lr=0.0001,
    resume_from_saved=False,
    bidirectional=bidirectional,
    use_embeddings=use_embeddings,
)

Starting to train lstm_test.pt...
Epoch: 1... Loss: 1.4159...


In [5]:
# Adam optimizer over the model's parameters; it is passed to load_checkpoint
# in the next cell.
opt = torch.optim.Adam(
    model.parameters(),
    lr=1e-4,
)

In [7]:
from predict import * 

# load_checkpoint returns six values; only the first one is kept (rebound to
# `model`), the other five are discarded.
model, _, _, _, _, _ = load_checkpoint("lstm_test.pt", model, opt)

# Sample from the model, primed with 'ja' (size argument 2000, top_k=5).
# Left as the cell's last expression so the result is displayed as output.
show_sample(
    model,
    2000,
    device,
    use_embeddings=use_embeddings,
    prime='ja',
    top_k=5,
)

'ja laht dea    ddera din   mahea má láli di geat galat  ma de lusa meast  didin  ge muoht  d   lasi did lu dea   mahuolidasea gea mera mi ddi didi ge gin  la  ddinin d lasalaha gu  latas litalá lera  mahkkkitahtiddas  mea goa lári d da   d la  dd ditta d  geastta mala dit  leatidina leralea maleatahuva láláhkta gatas mite dasi da  ge  d   linda da  dalaláht mide gas dasease luola  den ddddas  ddini lit la lui da   dalalat dati guseas lasteattahtasi lera mahtina d lase  la di lele mustt mahkii luohkuoa  me  govvva dalusalaht liinddelahkalahus maláreláhkahuolea dasaláin lasidditat dea lalas  meahat merealuvat  ddeaheliteahat mahas dahelahta mitt  ga  geahtas lat luselu diiit  guovole ddasaha  luvdeahkid     li deatat luoli mita  di  mea ga ga den   gi leaht la ga da ma lasididahuohala miin da  guorea goli  di ma luohkuidaht goa  ddin muoht gi delá gii  di guoluolat  lahtin mu midaliiiden  ddit gidea mi  geatiti laht den  gelin  meatida láida lálái gii   gat ma   ddena di ma ma dalaleatt