<a href="https://colab.research.google.com/github/daniloaleixo/PT-Language-Model/blob/master/PT_Language_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PT Language Model


Based on the fast.ai course, using ULMFiT.

### First Configs

In [0]:
  # Fetch the fast.ai Colab setup script and pipe it straight into bash
  # (the recorded output shows it updating fastai).
  # NOTE(review): this runs a remote script without inspection — confirm the source is trusted.
  !curl -s https://course.fast.ai/setup/colab | bash


Updating fastai...
Done.


In [1]:
from google.colab import drive

# Mount Google Drive (forcing a remount); the data directory used below
# is created under `base_dir`.
drive.mount('/content/gdrive', force_remount=True)

root_dir = "/content/gdrive/My Drive/"
base_dir = f"{root_dir}fastai-v3/"

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


### Loading 


In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

from fastai import *
from fastai.text import *

In [0]:

# Batch size for the language-model DataBunch; the learning rate further down
# is scaled relative to it. Alternative values kept for reference: 48, 24.
bs = 64

In [0]:
# Make CUDA device 0 the current device for this session.
torch.cuda.set_device(0)

In [0]:
# Data directory for the Portuguese Wikipedia dump. Joining with `/` instead of
# string concatenation keeps the result correct whether or not `base_dir`
# ends with a trailing slash.
path = Path(base_dir) / 'data/ptwiki'

In [0]:
# Wikipedia language code; used to build the dump name (f'{lang}wiki') and file-name prefixes.
lang = 'pt'

In [0]:
# Ensure the data directory exists, creating missing parents as needed.
path.mkdir(parents=True, exist_ok=True)

# Language-model file names (presumably weights and vocab — confirm against where they are consumed).
lm_fns = [f'{lang}_{suffix}' for suffix in ('wt', 'wt_vocab')]

In [8]:
# Display the resolved data directory.
path

PosixPath('/content/gdrive/My Drive/fastai-v3/data/ptwiki')

## Portuguese Wikipedia Model

### Download Data

Here we will download the data, split it into one file per article, and then tokenize it.

In [0]:
from fastai.basics import *
import re


def get_wiki(path,lang):
    """Download the latest `{lang}` Wikipedia dump and extract it to `path/{lang}wiki`.

    Steps (each skipped when its result is already on disk):
      1. download and bunzip `{lang}wiki-latest-pages-articles.xml.bz2`;
      2. clone the `wikiextractor` repository into `path`;
      3. run `WikiExtractor.py` over the XML and move its single output file
         (`text/AA/wiki_00`) to `path/{lang}wiki`, removing the `text` folder.

    Args:
        path: `Path` of the working/data directory.
        lang: Wikipedia language code, e.g. ``'pt'``.

    Returns:
        None. Prints progress messages as it goes.

    Raises:
        subprocess.CalledProcessError: if `git clone` or the extractor exits with
            a non-zero status (previously such failures were ignored and only
            surfaced later as a missing-file error from `shutil.move`).
    """
    import subprocess, sys

    name = f'{lang}wiki'
    if (path/name).exists():
        print(f"{path/name} already exists; not downloading")
        return

    xml_fn = f"{lang}wiki-latest-pages-articles.xml"
    zip_fn = f"{xml_fn}.bz2"

    if not (path/xml_fn).exists():
        print("downloading...")
        download_url(f'https://dumps.wikimedia.org/{name}/latest/{zip_fn}', path/zip_fn)
        print("unzipping...")
        bunzip(path/zip_fn)

    # Run the external tools with `cwd=path` rather than changing the process-wide
    # working directory: this also keeps a relative `path` resolving correctly.
    if not (path/'wikiextractor').exists():
        subprocess.run(['git', 'clone', 'https://github.com/attardi/wikiextractor.git'],
                       cwd=str(path), check=True)
    print("extracting...")
    # NOTE(review): assumes the cloned repo still ships `WikiExtractor.py` at its
    # top level with these flags — confirm, or pin the clone to a known commit.
    subprocess.run([sys.executable, 'wikiextractor/WikiExtractor.py',
                    '--processes', '4', '--no_templates',
                    '--min_text_length', '1800', '--filter_disambig_pages',
                    '--log_file', 'log', '-b', '100G', '-q', xml_fn],
                   cwd=str(path), check=True)
    # `-b 100G` makes the extractor write everything into one file.
    shutil.move(str(path/'text/AA/wiki_00'), str(path/name))
    shutil.rmtree(path/'text')


def split_wiki(path,lang):
    """Split the extracted dump `path/{lang}wiki` into one text file per article.

    Every `<doc id=...>` header line starts a new article; the following lines
    (up to the next header) are written to `path/docs/<title>.txt`, with `/` in
    the title replaced by `_`. Articles whose title is longer than 150
    characters are skipped entirely.

    Args:
        path: `Path` of the directory holding the extracted dump.
        lang: Wikipedia language code, e.g. ``'pt'``.

    Returns:
        The `path/docs` directory. If it already exists nothing is split and
        it is returned as is.

    Raises:
        IndexError: if a `<doc id=...>` line does not match the expected header
            format.
    """
    dest = path/'docs'
    name = f'{lang}wiki'
    if dest.exists():
        print(f"{dest} already exists; not splitting")
        return dest

    dest.mkdir(exist_ok=True, parents=True)
    title_re = re.compile(rf'<doc id="\d+" url="https://{lang}.wikipedia.org/wiki\?curid=\d+" title="([^"]+)">')
    f = None  # file of the article currently being copied; None while skipping

    # Explicit UTF-8 so accented text round-trips regardless of the platform locale.
    with (path/name).open(encoding='utf-8') as lines:
        try:
            for i,l in enumerate(lines):
                if i%100000 == 0: print(i)
                if l.startswith('<doc id="'):
                    # Close the previous article first so a skipped article can
                    # never have its body appended to the previous file.
                    if f:
                        f.close()
                        f = None
                    title = title_re.findall(l)[0].replace('/','_')
                    if len(title)>150: continue
                    f = (dest/f'{title}.txt').open('w', encoding='utf-8')
                elif f:
                    f.write(l)
        finally:
            # Also covers an empty dump or an error raised mid-way.
            if f: f.close()
    return dest

In [0]:
# Download and extract the dump; returns early if `path/{lang}wiki` already exists.
get_wiki(path,lang)


/content/gdrive/My Drive/fastai-v3/data/ptwiki/ptwiki already exists; not downloading


In [0]:
# List the contents of the data directory.
path.ls()

[PosixPath('/content/gdrive/My Drive/fastai-v3/data/ptwiki/ptwiki-latest-pages-articles.xml.bz2'),
 PosixPath('/content/gdrive/My Drive/fastai-v3/data/ptwiki/ptwiki-latest-pages-articles.xml'),
 PosixPath('/content/gdrive/My Drive/fastai-v3/data/ptwiki/wikiextractor'),
 PosixPath('/content/gdrive/My Drive/fastai-v3/data/ptwiki/log'),
 PosixPath('/content/gdrive/My Drive/fastai-v3/data/ptwiki/ptwiki')]

In [0]:
!head -n4 {path}/{name}

head: cannot open '{path}/{name}' for reading: No such file or directory


In [0]:
# Split the dump into one .txt file per article under `path/docs`
# (prints the line counter every 100,000 lines).
dest = split_wiki(path,lang)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000
2400000
2500000
2600000
2700000
2800000
2900000
3000000
3100000
3200000
3300000
3400000
3500000
3600000
3700000
3800000
3900000
4000000
4100000
4200000
4300000
4400000
4500000
4600000
4700000
4800000
4900000
5000000
5100000
5200000
5300000
5400000
5500000
5600000
5700000
5800000
5900000
6000000
6100000
6200000
6300000
6400000
6500000
6600000
6700000
6800000
6900000
7000000
7100000


In [0]:
# Show the first five per-article files.
dest.ls()[:5]

[PosixPath('/content/gdrive/My Drive/fastai-v3/data/ptwiki/docs/Astronomia.txt'),
 PosixPath('/content/gdrive/My Drive/fastai-v3/data/ptwiki/docs/América Latina.txt'),
 PosixPath('/content/gdrive/My Drive/fastai-v3/data/ptwiki/docs/Albino Forjaz de Sampaio.txt'),
 PosixPath('/content/gdrive/My Drive/fastai-v3/data/ptwiki/docs/Anno Domini.txt'),
 PosixPath('/content/gdrive/My Drive/fastai-v3/data/ptwiki/docs/Aquiles.txt')]

### Create pretrained model

#### Tokenize

Google Colab cannot handle the tokenization step because of RAM limits, so I ran it on my own computer and then uploaded the result to Google Drive.

In [0]:
# Folder with the per-article text files. Derived from `path` instead of
# repeating the absolute Drive location, so it follows any change to `base_dir`.
dest = path/'docs'

In [0]:
# Build the language-model DataBunch from the article files in `dest`:
# random 10% validation split (fixed seed 42), language-model labels,
# batch size `bs`, and a single worker.
data = (TextList.from_folder(dest)
            .split_by_rand_pct(0.1, seed=42)
            .label_for_lm()           
            .databunch(bs=bs, num_workers=1))

In [0]:
# Save the DataBunch as '<lang>_databunch' so it can be reloaded with `load_data`.
data.save(f'{lang}_databunch')
# Vocabulary size and number of training documents.
# NOTE(review): the recorded output (16, 1) suggests only one training document
# was found — confirm the docs folder was fully populated when this ran.
len(data.vocab.itos),len(data.train_ds)

(16, 1)

#### Create model

In [0]:
# Run this when the saved DataBunch is already on Drive (tokenization was done
# off-Colab because of the RAM issues described above).
# NOTE(review): the `load_data` cell below reads from `path`, not `dest` — confirm
# whether this assignment is still needed.
dest = "/content/gdrive/My Drive/fastai-v3/data/ptwiki/"

In [0]:
# Reload the saved '<lang>_databunch' from `path` with batch size `bs`.
data = load_data(path, f'{lang}_databunch', bs=bs)

In [0]:
# AWD-LSTM language model trained from scratch (pretrained=False), with dropout
# multiplier 0.5, converted to half-precision training via to_fp16().
learn = language_model_learner(data, AWD_LSTM, drop_mult=0.5, pretrained=False).to_fp16()

In [0]:
# Base learning rate of 1e-2, scaled linearly by the batch size relative to 48.
lr = 1e-2 * (bs/48)

Now let's train it for 10 epochs and see how it does.

In [0]:
# Epoch 1: unfreeze the model, train one cycle at `lr`, and checkpoint as 'fit_1'.
learn.unfreeze()
learn.fit_one_cycle(1, lr, moms=(0.8,0.7))
learn.save('fit_1')

epoch,train_loss,valid_loss,accuracy,time


In [0]:
# Epoch 2: one more cycle with the same settings, checkpointed as 'fit_2'.
learn.fit_one_cycle(1, lr, moms=(0.8,0.7))
learn.save('fit_2')

In [0]:
# Epoch 3: one more cycle with the same settings, checkpointed as 'fit_3'.
learn.fit_one_cycle(1, lr, moms=(0.8,0.7))
learn.save('fit_3')