<a href="https://colab.research.google.com/github/ft-Azad/Language-Modeling/blob/main/LanguageModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [None]:
!pip install -q torchmetrics
!pip install -q torchdata==0.6.1
!pip install 'portalocker>=2.0.0'
!pip install torchtext==0.15.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m868.8/868.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m49.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.0/21.0 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m849.3/849.3 kB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.1/557.1 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━

In [None]:
import numpy as np
import matplotlib.pyplot as plt

import torchtext
from torchtext.datasets import WikiText2
from torchdata.datapipes.iter import IterableWrapper, Mapper
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset

from torch import optim
from torch.nn import functional as F

import tqdm
import torchmetrics as tm

In [None]:
# Report the versions of the main libraries used in this notebook.
for module in (np, torch, torchtext, tqdm):
    print(f'{module.__name__} --> {module.__version__}')

numpy --> 1.25.2
torch --> 2.0.0+cu117
torchtext --> 0.15.1+cpu
tqdm --> 4.66.4


# Utils

In [None]:
# !git clone https://github.com/ft-Azad/Language-Modeling

In [None]:
# %cd ..

In [None]:
# %ls

In [None]:
class AverageMeter:
    """Track the latest value and the running weighted average of a metric.

    Attributes:
        val: Value passed to the most recent ``update`` call.
        sum: Weighted sum of all values seen since the last ``reset``.
        count: Total weight seen since the last ``reset``.
        avg: ``sum / count``, or 0 while ``count`` is still 0.
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Reset all statistics to zero."""
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average.

        Args:
            val: Latest measurement.
            n: Weight of the measurement (defaults to 1).
        """
        self.val = val
        self.sum += val * n
        self.count += n
        # Guard against ZeroDivisionError when only zero-weight updates
        # (n=0) have been recorded so far.
        self.avg = self.sum / self.count if self.count else 0

In [None]:
def num_trainable_params(model):
    """Return the number of trainable parameters of ``model``, in millions.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable / 1e6

# Dataset

- Loading Data

In [None]:
# train_iter, valid_iter, test_iter = WikiText2('/content/')

In [None]:
# Colab-only step: attach Google Drive at /content/drive so that files stored
# there are reachable from the notebook.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!unzip -q "/content/drive/MyDrive/LanguageModel/Data/wikitext-2-v1.zip" -d '/content/'

In [None]:
def read_lines(path):
    """Return the lines of the text file at ``path`` without line endings.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as file:
        return file.read().splitlines()


# Wrap each split's list of lines in an IterableWrapper datapipe.
train_iter = IterableWrapper(read_lines('/content/wikitext-2/wiki.train.tokens'))
test_iter = IterableWrapper(read_lines('/content/wikitext-2/wiki.test.tokens'))
valid_iter = IterableWrapper(read_lines('/content/wikitext-2/wiki.valid.tokens'))

In [None]:
# Create an explicit iterator over the training datapipe so single lines can
# be pulled with next(); the bare name on the last line displays the object.
train_iter_ = iter(train_iter)
train_iter_

<generator object IterableWrapperIterDataPipe.__iter__ at 0x7c9d42c52570>

In [None]:
# Pull the next line from the training iterator to inspect the raw text.
next(train_iter_)

' '

- Build Vocabulary

In [None]:
# Toy example: tokenize two sample sentences with torchtext's 'basic_english'
# tokenizer. list(map(tokenizer, txt)) is equivalent to the comprehension
# [tokenizer(line) for line in txt]; only one of them needs to be evaluated.
txt = ['@Azad hi Azad! 1 n2 3 #45', 'how are are you?']
tokenizer = get_tokenizer('basic_english')
list(map(tokenizer, txt))

[['@azad', 'hi', 'azad', '!', '1', 'n2', '3', '#45'],
 ['how', 'are', 'are', 'you', '?']]

In [None]:
# Build a toy vocabulary from the tokenized sample sentences and use the
# special token's index as the default for out-of-vocabulary lookups.
# NOTE(review): '<ukn>' looks like a typo for the '<unk>' spelling used for
# the real vocabulary; it is harmless here because it is used consistently.
vocab = build_vocab_from_iterator(
    (tokenizer(line) for line in txt),
    specials=['<ukn>'],
    min_freq=1,
)
vocab.set_default_index(vocab['<ukn>'])
vocab.get_stoi()

{'n2': 11,
 'how': 10,
 'you': 12,
 'hi': 9,
 'azad': 8,
 '@azad': 7,
 '?': 6,
 '3': 5,
 '1': 4,
 '#45': 3,
 '!': 2,
 'are': 1,
 '<ukn>': 0}

In [None]:
# Look up indices for a few tokens; 'hello' does not occur in the sample
# sentences, so it falls back to the default index (that of '<ukn>').
vocab(['hi', 'azad', '<ukn>', 'hello', 'Hi'.lower()])

[9, 8, 0, 0, 9]

In [None]:
# Build the real vocabulary from the tokenized training split; lookups of
# tokens that are not in it return the index of '<unk>'.
tokenizer = get_tokenizer('basic_english')
vocab = build_vocab_from_iterator(
    (tokenizer(line) for line in train_iter),
    specials=['<unk>'],
)
vocab.set_default_index(vocab['<unk>'])

In [None]:
# Vocabulary size (number of entries, including the '<unk>' special token).
len(vocab)

28782

In [None]:
# Persist the vocabulary twice: in the working directory and on Google Drive.
for vocab_path in ('vocab.pt', '/content/drive/MyDrive/LanguageModel/vocab.pt'):
    torch.save(vocab, vocab_path)