# Train a language model from scratch

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
from fastai.text import *

In [None]:
# Make GPU 0 the current CUDA device for this session.
torch.cuda.set_device(0)

In [None]:
# Release unused GPU memory cached by PyTorch (useful when re-running the notebook in the same kernel).
torch.cuda.empty_cache() 

Before continuing with this notebook, run the batch script `prepare_wiki.bat` to download and extract the Wikipedia content in the desired language.

## Preparing the data

We start by creating data for the Language Model (LM). The LM's goal is to learn the structure of a language. It learns the language by trying to predict the next word given a set of previous words (n-grams). Since the LM does not perform classification, labels can be ignored.

In [None]:
# Language code: selects the data folder and the spaCy model checked below.
lang = "it"

In [None]:
# Folder holding the Wikipedia data for the selected language.
DATA_PATH = Path('../lm/data/wiki') / lang
# parents=True also creates missing intermediate folders instead of raising FileNotFoundError.
DATA_PATH.mkdir(parents=True, exist_ok=True)

In [None]:
# Create the `models` sub-folder of DATA_PATH if it is not there yet.
MODEL_PATH = DATA_PATH / 'models'
MODEL_PATH.mkdir(exist_ok=True)

Let's use a smaller dataset to understand the concepts. If you wish to run on the full dataset, set `debug` to an empty string.

In [None]:
# Suffix inserted into every input/output file name below (e.g. train_debug.csv); use "" for the full dataset.
debug = "_debug"

### 1. Tokenization

In this section, we start cleaning up the messy text. There are 2 main activities we need to perform:

1. Clean up extra spaces, tab chars, new line chars and other characters and replace them with standard ones
2. Use the awesome [spacy](http://spacy.io) library to tokenize the data. Since spacy does not provide a parallel/multicore version of the tokenizer, the fastai library adds this functionality. This parallel version can use all the cores of your CPU and runs much faster than the serial version of the spacy tokenizer. Note that `get_texts` below creates the tokenizer with `n_cpus=1`, so this notebook tokenizes on a single core.

Tokenization is the process of splitting the text into separate tokens so that each token can be assigned a unique index. This means we can convert the text into integer indexes our models can use.

We read the data in chunks of an appropriate size (`chunksize`), because the tokenization process is memory intensive.

Spacy supports the following languages: 'en', 'de', 'es', 'pt', 'fr', 'it', 'nl'. **If you want to train a LM in an unsupported language, you need to provide and implement a proper tokenizer.**

In [None]:
# Check that the spaCy model for `lang` can be loaded; if not, print the command that installs it.
try:
    spacy.load(lang)
except OSError:
    print(f'spacy tokenization model is not installed for {lang}.')
    # Languages outside the supported list fall back to the 'xx' code.
    if lang not in ('en', 'de', 'es', 'pt', 'fr', 'it', 'nl'):
        lang = 'xx'
    print(f'Command: python -m spacy download {lang}')

In [None]:
# See: https://stackoverflow.com/questions/753052/strip-html-from-strings-in-python
class HTMLTextExtractor(html.parser.HTMLParser):
    "HTML parser that keeps only the text content of the markup fed to it."

    def __init__(self):
        super().__init__()
        # Text fragments in the order the parser reports them.
        self.result = []

    def handle_data(self, d):
        "Collect one run of text found outside of tags."
        self.result.append(d)

    def get_text(self):
        "Return all text seen so far as a single string."
        # `feed()` may hold back trailing text (e.g. "R&B" at the very end, which could be
        # a character reference cut in half). `close()` forces the parser to flush it, so
        # that no text is silently dropped.
        self.close()
        return ''.join(self.result)

    def error(self, message):
        "Ignore parser errors instead of raising."
        return

In [None]:
def html_to_text(html):
    "Return the text content of `html` with the tags removed; return `html` unchanged if parsing fails."
    s = HTMLTextExtractor()
    try:
        s.feed(html)
        # Flush text the parser may still be buffering (e.g. a trailing "R&B").
        s.close()
        return s.get_text()
    except Exception:
        # Best effort: keep the raw input on parse errors, but let
        # KeyboardInterrupt/SystemExit propagate (a bare `except:` would swallow them).
        return html

In [None]:
def custom_tagstrip(x:str) -> str:
    "Strip html markup from `x` by delegating to `html_to_text`."
    stripped = html_to_text(x)
    return stripped

In [None]:
def get_texts(df, n_lbls=1, lang='en'):
    "Tokenize the text columns of `df`; return `(tokens, labels)`. A one-column `df` has no labels."
    n_cols = len(df.columns)
    prefix = f'\n{BOS} {FLD} 1 '
    if n_cols == 1:
        # Only a text column: nothing to use as a label.
        labels = []
        texts = prefix + df[0].astype(str)
    else:
        # The first `n_lbls` columns are labels, the remaining ones are text fields.
        labels = df.iloc[:,range(n_lbls)].values.astype(np.int64)
        texts = prefix + df[n_lbls].astype(str)
        for col in range(n_lbls+1, n_cols):
            texts += f' {FLD} {col-n_lbls} ' + df[col].astype(str)

    # Strip html first, then apply fastai's default pre-tokenization rules.
    rules = [custom_tagstrip] + defaults.text_pre_rules
    tokenizer = Tokenizer(lang=lang, pre_rules=rules, n_cpus=1)
    return tokenizer.process_all(texts), list(labels)

In [None]:
def get_all(df, n_lbls=1, lang='en'):
    "Run `get_texts` on every chunk yielded by `df` and concatenate the tokens and labels."
    all_tok, all_labels = [], []
    for chunk_idx, chunk in enumerate(df):
        print(chunk_idx)  # progress: index of the chunk being processed
        chunk_tok, chunk_labels = get_texts(chunk, n_lbls, lang)
        all_tok.extend(chunk_tok)
        all_labels.extend(chunk_labels)
    return all_tok, all_labels

In [None]:
# Number of CSV rows read (and tokenized) per chunk.
chunksize=24000

In [None]:
# Folder for the intermediate files written below (tokens, ids, vocabulary).
tmp_path = (DATA_PATH/'tmp')
tmp_path.mkdir(exist_ok=True)

In [None]:
# With `chunksize` set, read_csv returns an iterator of DataFrame chunks, not a single DataFrame.
# The iterator can be consumed only once: re-run this cell before calling get_all again.
df_trn = pd.read_csv(DATA_PATH/f'train{debug}.csv', header=None, chunksize=chunksize)

In [None]:
# Pass `lang` by keyword: the second positional parameter of get_all is `n_lbls`,
# so a positional `lang` would leave the tokenizer on its default language ('en').
tok_trn, trn_labels = get_all(df_trn, lang=lang)

In [None]:
# Token lists differ in length, so store them as an explicit object array
# (recent NumPy versions refuse to build ragged arrays implicitly).
np.save(tmp_path/f'train_tok{debug}.npy', np.array(tok_trn, dtype=object))
np.save(tmp_path/f'train_lbl{debug}.npy', trn_labels)

In [None]:
# Write the tokenized training texts as plain text, tokens separated by single spaces.
# `writelines` adds no separator of its own between documents.
trn_joined = [' '.join(o) for o in tok_trn]
# The `with` block closes (and flushes) the file even if writing fails.
with open(tmp_path/f'train_joined{debug}.txt', 'w', encoding='utf-8') as f:
    f.writelines(trn_joined)

In [None]:
# With `chunksize` set, read_csv returns an iterator of DataFrame chunks, not a single DataFrame.
# The iterator can be consumed only once: re-run this cell before calling get_all again.
df_val = pd.read_csv(DATA_PATH/f'val{debug}.csv', header=None, chunksize=chunksize)

In [None]:
# Pass `lang` by keyword: the second positional parameter of get_all is `n_lbls`,
# so a positional `lang` would leave the tokenizer on its default language ('en').
tok_val, val_labels = get_all(df_val, lang=lang)

In [None]:
# Token lists differ in length, so store them as an explicit object array
# (recent NumPy versions refuse to build ragged arrays implicitly).
np.save(tmp_path/f'valid_tok{debug}.npy', np.array(tok_val, dtype=object))
np.save(tmp_path/f'valid_lbl{debug}.npy', val_labels)

In [None]:
# Write the tokenized validation texts as plain text, tokens separated by single spaces.
# `writelines` adds no separator of its own between documents.
val_joined = [' '.join(o) for o in tok_val]
# The `with` block closes (and flushes) the file even if writing fails.
with open(tmp_path/f'valid_joined{debug}.txt', 'w', encoding='utf-8') as f:
    f.writelines(val_joined)

### 2. Mapping tokens to ids

Forward model

In [None]:
# At most `max_vocab` of the most common tokens are considered for the vocabulary.
max_vocab = 60000
# A token is kept only if it occurs MORE than `min_freq` times (the filter below uses `c > min_freq`).
min_freq = 2

In [None]:
print("Loading tokenized texts...")
# The token files hold Python lists inside object arrays; NumPy >= 1.16.3 only loads those
# with allow_pickle=True. These files were written by this notebook, so unpickling them is fine.
trn_tok = np.load(tmp_path/f'train_tok{debug}.npy', allow_pickle=True)
val_tok = np.load(tmp_path/f'valid_tok{debug}.npy', allow_pickle=True)

print("Calculating token frequencies...")
# The vocabulary is built from the training set only.
freq = Counter(p for o in trn_tok for p in o)

print("Filtering tokens...")
itos = [o for o,c in freq.most_common(max_vocab) if c>min_freq]
# After the two inserts: index 0 is '_unk_' and index 1 is '_pad_'.
itos.insert(0, '_pad_')
itos.insert(0, '_unk_')
# Tokens missing from the vocabulary map to 0, i.e. '_unk_'.
stoi = collections.defaultdict(lambda:0, {v:k for k,v in enumerate(itos)})

print(f'Total tokens: {len(itos)-2}+2')

print(f'Mapping training set...')
# Rows have different lengths, hence the explicit object dtype.
trn_lm = np.array([[stoi[o] for o in p] for p in trn_tok], dtype=object)
np.save(tmp_path/f'train_ids{debug}.npy', trn_lm)

print(f'Mapping validation set...')
val_lm = np.array([[stoi[o] for o in p] for p in val_tok], dtype=object)
np.save(tmp_path/f'valid_ids{debug}.npy', val_lm)

print(f'Saving token-id map...')
# The `with` block makes sure the pickle file is flushed and closed.
with open(tmp_path/f'itos{debug}.pkl', 'wb') as f:
    pickle.dump(itos, f)

Backward model

In [None]:
def _partition_cols(a,idxs):
  i=0
  for idx in idxs:
      yield a[i:idx]
      i=idx
  yield a[i:]

In [None]:
def partition_cols(a,idxs):
    "List version of `_partition_cols`: the slices of `a` cut at the positions in `idxs`."
    return [*_partition_cols(a,idxs)]

In [None]:
def reverse_flds(t, fld_id):
    "Reverse the ids in `t` field by field, where each field starts at an occurrence of `fld_id`."
    ids = np.array(t)
    marker_pos = np.nonzero(ids==fld_id)[0]
    # Drop the slice in front of the first field marker; every remaining slice is one field.
    fields = partition_cols(ids, marker_pos)[1:]
    flipped = []
    # Fields are emitted last-to-first; each keeps its first two ids in place
    # and has the rest of its ids reversed.
    for fld in reversed(fields):
        flipped.append(np.concatenate([fld[:2], fld[:1:-1]]))
    return np.concatenate(flipped)

In [None]:
# Output files for the reversed (backward-model) id sequences.
bwd_trn_path = tmp_path/f'train_ids_bwd{debug}.npy'
bwd_val_path = tmp_path/f'valid_ids_bwd{debug}.npy'

In [None]:
# Plain token -> id lookup; the id of the field marker tells reverse_flds where fields start.
stoi = {s: i for i, s in enumerate(itos)}
fld_id = stoi[FLD]

print("Reversing training ids order...")
# Reversed rows have different lengths, hence the explicit object dtype
# (recent NumPy versions refuse to build ragged arrays implicitly).
bwd_trn = np.array([reverse_flds(o, fld_id) for o in trn_lm], dtype=object)
np.save(bwd_trn_path, bwd_trn)

print("Reversing validation ids order...")
bwd_val = np.array([reverse_flds(o, fld_id) for o in val_lm], dtype=object)
np.save(bwd_val_path, bwd_val)

### 3. Pre-train the language model

We have to use a special kind of TextDataBunch for the language model, that ignores the labels (that's why we put 0 everywhere), will shuffle the texts at each epoch before concatenating them all together (only for training, we don't shuffle for the validation set) and will send batches that read that text in order with targets that are the next word in the sentence.

In [None]:
# Batch size handed to TextLMDataBunch.load below.
bs = 35

In [None]:
# Build the language-model data bunch from files under DATA_PATH.
# NOTE(review): the id/vocab files above are written with the `{debug}` suffix in their names;
# confirm that `TextLMDataBunch.load` looks for those names when `debug` is not empty.
data = TextLMDataBunch.load(DATA_PATH, bs=bs)

In [None]:
# Create the learner for `data`; all keyword values are passed straight to fastai's
# `language_model_learner` (see its documentation for the meaning of each one).
learn = language_model_learner(data, bptt = 70, emb_sz = 400, nh = 1150, nl = 3,
                               drop_mult = 0.05, alpha = 2, beta = 1)

In [None]:
# Use `accuracy` as the only metric on the learner.
learn.metrics = [accuracy]

In [None]:
# Optimizer factory: Adam with betas=(0.8, 0.99) instead of Adam's default betas.
learn.opt_func = partial(optim.Adam, betas=(0.8, 0.99))

Let's find an optimal learning rate:

In [None]:
# Hyper-parameters for the learning-rate search and the training run.
# `lr` and `cycle_len` are reassigned in later cells before `fit_one_cycle` is called.
lr = 1e-1
wd = 1e-3
clip = 0.15
cycle_len = 12

In [None]:
# Store the gradient-clipping value and the weight decay on the learner.
# NOTE(review): confirm that the installed fastai version actually reads `learn.clip`;
# `wd` is also passed explicitly to `fit_one_cycle` further down.
learn.clip = clip
learn.wd = wd

In [None]:
# Run the learning-rate finder; its results are plotted in the next cell.
learn.lr_find()

In [None]:
# Plot the recorded learning-rate search; skip_end=0 keeps the last points in the plot.
learn.recorder.plot(skip_end=0)

Let's train our model!

In [None]:
# Value passed as `cyc_len` to fit_one_cycle below (replaces the 12 set earlier).
cycle_len = 3

In [None]:
# Upper end of the learning-rate slice built in the next cell.
max_lr = 1e-2

In [None]:
# Learning-rate range from max_lr / 2.6**4 up to max_lr, passed as `max_lr` to fit_one_cycle.
# NOTE(review): presumably spread by fastai across the layer groups - confirm for this version.
lr = slice(max_lr/(2.6**4), max_lr)

In [None]:
# Train with fastai's `fit_one_cycle` for `cycle_len` epochs using the settings chosen above.
learn.fit_one_cycle(cyc_len = cycle_len, 
                    max_lr = lr, # learning rate (the slice defined above)
                    div_factor = 20, # factor to discount from max
                    moms = (0.8, 0.7), # momentums
                    pct_start = 0.1, # where the peak is at 
                    wd = wd # weight decay
                    )

Once we have trained our LM, we can save it for use in a transfer-learning scenario for other tasks, such as document classification, sentiment analysis, or anything else related to NLP. In particular, we want to save the whole neural network **excluding** the last layer, which serves only the dummy classification task (next-word prediction) used to build the language model.

In [None]:
# Save the encoder on its own (for transfer learning) and the complete model
# (reloaded in the next section).
learn.save_encoder('enc_lstm')
learn.save('model_lstm')

### 4. Verify the quality of the model

In [None]:
# Reload the complete model saved above under the name 'model_lstm'.
learn.load("model_lstm")

In [None]:
# Settings for the text-generation check below.
TEXT = "questo è"    # prompt handed to learn.predict
N_WORDS = 15         # second argument of learn.predict
N_SENTENCES = 5      # how many times learn.predict is called

In [None]:
# Call learn.predict N_SENTENCES times on the same prompt and print each result on its own line.
print("\n".join(learn.predict(TEXT, N_WORDS, temperature=0.8) for _ in range(N_SENTENCES)))