<a href="https://colab.research.google.com/github/darveenvijayan/nanoGPT/blob/master/nanoGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q torch numpy transformers datasets tiktoken wandb tqdm

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.1/519.1 kB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m71.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m100.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

# Get Data

In [None]:
import requests
import re
from bs4 import BeautifulSoup
from typing import Union
import json
import os
import logging


def get_meditations():
    """
    Imports the meditations by Marcus Aurelius.

    Downloads the George Long translation from classics.mit.edu, removes the
    header (everything up to and including the translator line), the footer
    (from "THE END" onwards), the dashed separator rules and the "BOOK ..."
    headings, then splits what is left into paragraphs.

    Returns:
        list[str]: one entry per non-empty paragraph, with inner newlines
        replaced by spaces and surrounding whitespace stripped.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the translator marker is not present in the download.
    """
    # import Meditations by Marcus Aurelius
    response = requests.get(
        'http://classics.mit.edu/Antoninus/meditations.mb.txt', timeout=30)
    # fail loudly on an error page instead of parsing it as if it were the book
    response.raise_for_status()

    # remove everything before and including "Translated by George Long"
    marker = 'Translated by George Long'
    _, found, data = response.text.partition(marker)
    if not found:
        raise ValueError(f"marker {marker!r} not found in downloaded text")

    # remove "----" lines (any run of two or more dashes)
    data = re.sub(r'-{2,}', '', data)

    # remove "BOOK ..." lines, for this we use regular expressions
    data = re.sub(r'BOOK [A-Z]+\n', '', data)

    # remove "THE END" and all that follows it
    data = data.split("THE END")[0]

    # split into paragraphs, flatten each one onto a single line and strip it
    lessons = [x.replace('\n', ' ').strip() for x in data.split('\n\n')]

    # remove empty samples; str.replace() takes no regex, so whitespace-only
    # paragraphs are detected by checking what is left after stripping
    lessons = [x for x in lessons if x]

    print(f"We have {len(lessons)} stoic lessons from Marcus Aurelius")
    return lessons


def get_letters():
    """
    Imports 'Epistulae Morales Ad Lucilium' by Seneca

    Reads the Wikisource index page, follows every link whose text looks like
    "Letter <number>", and collects the paragraphs of each letter page.

    Returns:
        list[dict]: one record per retained paragraph, with the keys
        'title' (link text), 'href' (link target as found on the index page)
        and 'text' (the cleaned paragraph).

    Raises:
        requests.HTTPError: if any page request answers with an error status.
    """
    base_url = "https://en.wikisource.org"
    # page containing links to all of Seneca's letters
    src = f"{base_url}/wiki/Moral_letters_to_Lucilius"

    def fetch_soup(url):
        # download a page and parse it into a BeautifulSoup object
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")

    # pulls a letter from its webpage (the text within <p> elements)
    def pull_letter(http):
        print(f"Pulling {http.split('/')[-1]}")
        page = fetch_soup(http)

        # build text contents within all p elements
        txt = '\n'.join(x.text for x in page.find_all('p'))
        # collapse any run of spaces into a single space
        txt = re.sub(r' {2,}', ' ', txt)
        # remove webpage references ('[1]', '[2]', etc)
        txt = re.sub(r'\[[0-9]+\]', '', txt)
        # remove the numbered bullet points Seneca uses ('1. ', '2. ', etc);
        # the dot is escaped so that text such as '65, ' is left untouched
        txt = re.sub(r'[0-9]+\. ', '', txt)
        # split by double newlines, then strip and drop short lines
        return [x.strip() for x in txt.split('\n\n') if len(x.strip()) > 40]

    # RegEx for finding 'Letter 12', 'Letter 104' etc
    letters_regex = re.compile(r"^Letter\s+[0-9]{1,3}$")

    # initialize data
    letters = []
    # loop through all links on the index page
    for link in fetch_soup(src).find_all('a'):
        # confirm we want this data
        if len(link.contents) == 0:
            continue
        title = str(link.contents[0])
        if not letters_regex.match(title):
            continue
        href = link.get('href')
        if not href:
            # an anchor without a target cannot be followed
            continue
        # get text content from letter and append one record per paragraph
        for text in pull_letter(f"{base_url}{href}"):
            letters.append({'title': title, 'href': href, 'text': text})
    return letters


# Fetch and clean the Meditations corpus (this performs an HTTP request).
meditations = get_meditations()
# Seneca's letters are currently not loaded; uncomment to fetch them as well.
# letters = get_letters()

We have 507 stoic lessons from Marcus Aurelius


# Tokenizer logic

In [None]:
import nltk
# word_tokenize needs the Punkt sentence-tokenizer data. Newer NLTK releases
# look it up under 'punkt_tab' while older ones use 'punkt', so fetch both.
nltk.download('punkt')
nltk.download('punkt_tab')

# Tokenize the whole corpus at word level.
text = " ".join(meditations)
words = nltk.word_tokenize(text)
# Add a single-space token so that the vocabulary built from `words`
# contains the separator that encode() inserts between words.
words.append(" ")

# `words` holds word/punctuation tokens, not characters.
print(f"Number of word tokens: {len(words)}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Length of words (characters): 50308


In [None]:
# Let's look at the first 100 word tokens
print(words[:100])

['From', 'my', 'grandfather', 'Verus', 'I', 'learned', 'good', 'morals', 'and', 'the', 'government', 'of', 'my', 'temper', '.', 'From', 'the', 'reputation', 'and', 'remembrance', 'of', 'my', 'father', ',', 'modesty', 'and', 'a', 'manly', 'character', '.', 'From', 'my', 'mother', ',', 'piety', 'and', 'beneficence', ',', 'and', 'abstinence', ',', 'not', 'only', 'from', 'evil', 'deeds', ',', 'but', 'even', 'from', 'evil', 'thoughts', ';', 'and', 'further', ',', 'simplicity', 'in', 'my', 'way', 'of', 'living', ',', 'far', 'removed', 'from', 'the', 'habits', 'of', 'the', 'rich', '.', 'From', 'my', 'great-grandfather', ',', 'not', 'to', 'have', 'frequented', 'public', 'schools', ',', 'and', 'to', 'have', 'had', 'good', 'teachers', 'at', 'home', ',', 'and', 'to', 'know', 'that', 'on', 'such', 'things', 'a']


In [None]:
# Vocabulary: every distinct token of the corpus, in sorted order
set_words = sorted(set(words))
vocab_size = len(set_words)
# show the vocabulary as one '|'-separated line, followed by its size
print(*set_words, sep="|")
print(vocab_size)

 |!|'|''|'s|'scape|(|)|,|-|.|...|:|;|?|A|About|Above|Accordingly|Accustom|Acquire|Adapt|Add|Adorn|Aesculapius|After|Again|Agathon|Agrippa|Alciphron|Alexander|All|Altogether|Always|Am|Among|An|And|Another|Antisthenes|Antoninus|Any|Apollo|Apollonius|Apply|Archimedes|Are|Areius|Art|As|Asia|At|Athenians|Athenodotus|Athens|Athos|Attend|Augustus|Avoid|Bacchius|Back|Baiae|Be|Because|Begin|Benedicta|Besides|Body|Book|Both|Bread|Brutus|But|By|Cadicianus|Caesar|Caeso|Caius|Camillus|Capreae|Carnuntum|Carry|Cast|Cato|Catullinus|Catulus|Cecrops|Celer|Certainly|Chaldaei|Charax|Chaurias|Christians|Chrysippus|Circus|Cithaeron|Clotho|Come|Confine|Conformably|Consequently|Consider|Constantly|Contemplate|Cosmos|Crates|Crito|Croesus|Cynic|Dear|Death|Demetrius|Democritus|Dialectic|Different|Diogenes|Diognetus|Dion|Diotimus|Direct|Divide|Do|Does|Domitius|Dost|Draw|Dwelling|Dye|Eighth|Either|Empedocles|Enough|Enter|Ephesians|Epictetus|Epicurus|Epitynchanus|Equanimity|Ethic|Eudaemon|Eudaemonia|Eudoxus|Euphrat

In [None]:
import itertools

# Lookup tables for the very simple word-level tokenizer:
# a token's id is its position in set_words.
word2int = {wo: i for i, wo in enumerate(set_words)}

# inverse table: id -> token string
int2word = dict(enumerate(set_words))

def encode(sent):
  """
  Convert a string into a list of vocabulary ids.

  The string is split on whitespace; each chunk is tokenized with
  nltk.word_tokenize and a single ' ' token is placed *between* chunks
  (not after the last one), so decode(encode(s)) does not gain a trailing
  space. An empty or whitespace-only string gives an empty list.

  Raises:
    KeyError: if a token (or the ' ' separator) is not in word2int.
  """
  tokens = []
  for position, chunk in enumerate(sent.split()):
    if position:
      # separator only between chunks
      tokens.append(' ')
    tokens.extend(nltk.word_tokenize(chunk))

  ids = []
  for token in tokens:
    if token not in word2int:
      raise KeyError(f"token {token!r} is not in the vocabulary")
    ids.append(word2int[token])
  return ids

def decode(tokens):
  """Turn a sequence of vocabulary ids back into text by concatenating their token strings."""
  pieces = (int2word[token_id] for token_id in tokens)
  return "".join(pieces)

# Round-trip check: encode a sample string, then decode the ids back to text.
print(encode("wasting grandfather!"))
print(decode(encode("wasting grandfather!")))

[4148, 0, 1994, 1, 0]
wasting grandfather! 


# Training

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the pretrained "gpt2" tokenizer and causal language model by name.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")



def get_predictions(model,tokenizer,sentence):
    """
    Run `sentence` through `model` and return the first element of its output.

    The sentence is encoded with `tokenizer.encode(..., return_tensors="pt")`
    and the forward pass runs under `torch.no_grad()`, so the result carries
    no autograd history.
    """
    token_ids = tokenizer.encode(sentence, return_tensors="pt")
    # inference only: no gradients needed
    with torch.no_grad():
        return model(token_ids)[0]

def get_next_word_probabilities(model,tokenizer,sentence, top_k=500):
    """
    Return the most likely next tokens after `sentence` with their probabilities.

    Args:
        model, tokenizer, sentence: passed straight to get_predictions().
        top_k: maximum number of candidates to return. It is clamped to the
            size of the last dimension of the predictions, so a vocabulary
            smaller than `top_k` yields every token instead of an error.

    Returns:
        list[tuple[str, float]]: (decoded token with surrounding whitespace
        stripped, softmax probability), ordered from most to least likely.
    """
    # Get the model predictions for the sentence.
    predictions = get_predictions(model,tokenizer,sentence)

    # Scores for the position after the last input token (first batch item).
    next_token_logits = predictions[0, -1, :]

    # torch.topk raises if k exceeds the number of entries, so clamp it.
    k = min(top_k, next_token_logits.shape[-1])
    top_indices = torch.topk(next_token_logits, k).indices

    # Probabilities over the whole vocabulary, then keep only the top k.
    probabilities = torch.nn.functional.softmax(next_token_logits, dim=-1)
    top_probabilities = probabilities[top_indices].tolist()

    # Decode the top k candidates back to words.
    top_tokens = [tokenizer.decode([idx]).strip() for idx in top_indices.tolist()]

    return list(zip(top_tokens, top_probabilities))


In [None]:
# Display the tokenizer's repr to inspect its configuration.
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

In [None]:
# Total number of elements summed over all of the model's parameter tensors.
print("num parameters : ",sum(t.numel() for t in model.parameters()))

num parameters :  124439808


In [None]:
# No trailing space in the prompt: GPT-2's byte-level BPE attaches a space to
# the word that follows it, so a prompt ending in " " makes the model predict
# what comes after a lone space token instead of the next word.
sentence = "I enjoy walking in"

# Encode the sentence using the tokenizer.
inputs = tokenizer.encode(sentence, return_tensors="pt")

# Pass into the model. This is inference only, so skip autograd bookkeeping
# (the same way get_predictions does).
with torch.no_grad():
    outputs = model(inputs)
predictions = outputs[0]

predictions.shape

torch.Size([1, 5, 50257])

In [None]:

# Scores for whatever token would follow the prompt:
# first batch item, last sequence position.
next_token_candidates_tensor = predictions[0, -1]

# Indices of the k highest-scoring candidates.
top_k = 10
topk_candidates_indexes = next_token_candidates_tensor.topk(top_k).indices.tolist()

# Turn the scores into probabilities over all candidates.
all_candidates_probabilities = torch.softmax(next_token_candidates_tensor, dim=-1)

# Keep only the probabilities of the top k candidates.
topk_candidates_probabilities = [
    all_candidates_probabilities[idx].item() for idx in topk_candidates_indexes
]

# Decode each candidate id back to text, dropping surrounding whitespace.
topk_candidates_tokens = []
for idx in topk_candidates_indexes:
    topk_candidates_tokens.append(tokenizer.decode([idx]).strip())

# Show the top k candidates paired with their probabilities.
list(zip(topk_candidates_tokens, topk_candidates_probabilities))

[('', 0.2895686626434326),
 ('vern', 0.2207818478345871),
 ('iced', 0.1783422827720642),
 ('urch', 0.020242610946297646),
 ('________', 0.019794294610619545),
 ('____', 0.019121358171105385),
 ('urn', 0.015532216988503933),
 ('ike', 0.011696101166307926),
 ('�', 0.010702322237193584),
 ('irc', 0.010284041985869408)]

In [None]:
from tqdm.notebook import tqdm

# Now we train the model

batch_size = 32
# NOTE(review): the optimizer is built over `model`'s parameters, but the loss
# below comes from `m` - confirm which network is meant to be trained.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

with tqdm(range(10000)) as steps:

  for step in steps:
    # sample a batch of data
    # TODO: `xb` / `yb` (and `m`) are never assigned in the visible notebook;
    # the batch-sampling step belongs here.

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Report the loss only after it has been computed for this step; reading
    # it at the top of the loop fails on the first iteration.
    steps.set_description(f"Loss {loss.item()}")

  0%|          | 0/10000 [00:00<?, ?it/s]

NameError: ignored

In [None]:

# Draw one batch for the "test" split and show the shapes of inputs and targets.
# NOTE(review): `get_batch` is not defined anywhere in the visible notebook -
# this cell presumably relied on state from an earlier session; confirm and
# add its definition so the notebook survives Restart & Run All.
x, y = get_batch("test")



print(x.shape,y.shape)

torch.Size([4, 8]) torch.Size([4, 8])


In [None]:
# Ask `m` for 500 new tokens starting from the first-element slice of
# `val_data`, then turn the resulting ids back into text with decode().
# NOTE(review): `m` and `val_data` are not defined in the visible notebook, and
# the recorded run ended in an IndexError - confirm the expected shape of `idx`
# and that the generated ids fall inside the decode() vocabulary.
print(decode(m.generate(idx=val_data[:1], max_new_tokens=500)[0].tolist()))


IndexError: ignored