Using https://github.com/agrechnev/hugging_examples/blob/master/train_gpt2_torch1.py and some writeup at https://www.it-jim.com/blog/training-and-fine-tuning-gpt-2-and-gpt-3-models-using-hugging-face-transformers-and-openai-api/

In [1]:
!pip install git+https://github.com/huggingface/transformers wandb pandas torch 'transformers[torch]'

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-w_lxb8jc
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-w_lxb8jc
  Resolved https://github.com/huggingface/transformers to commit a6e6b1c622d8d08e2510a82cb6266d7b654f1cbf
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting wandb
  Downloading wandb-0.15.8-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers==4.32.0.dev0)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Collec

In [2]:
import pandas as pd
def load_data(URL):
  quotes=pd.read_csv(URL,header=None)
  quotes=quotes.fillna('')
  return quotes
quotes=load_data("https://huggingface.co/datasets/Crapp/sadQuotes/raw/main/quotes.csv")

In [65]:
quote=quotes.iloc[:][1]
quote=quote.str.split('.').explode()
type(quote)
quote="\n".join(quote)

In [67]:
str2file = open("quotes.txt", "w")
str2file.write(quote)
str2file.close()
#quote.to_csv(r'quotes.txt', header=None, index=None, sep=' ', mode='w')

In [68]:
!head quotes.txt

What is happening in your Mind is not Reality; it is important to differentiate between the two

Once you cultivate Equanimity within, every cell in your body will respond by generating Sweetness

Life is a limited amount of Time and Energy
 Let us use it for maximum Impact

What is good for the soil is always good for your body because your body is just an embodiment of soil

Our intention is to make this Planet into a Temple where everyone walks with a certain Grace and reverence to Life


In [69]:
import sys
import numpy as np
import torch
import torch.utils.data
import transformers
import tqdm
MODEL_NAME = 'gpt2'
TEXT_CORPUS = 'quotes.txt'
DEVICE = 'cuda'
TOKEN_ENDOFTEXT = 50256  # '<|endoftext|>
BLOCK_LEN = 512

In [70]:
    model = transformers.GPT2LMHeadModel.from_pretrained(MODEL_NAME)
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

In [71]:
    model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [72]:
def break_text_to_pieces(text_path: str, tokenizer: transformers.PreTrainedTokenizer, block_len: int = 512) -> list[str]:
    """Read a file and convert it to tokenized blocks, edding <|endoftext|> to each block"""
    with open(text_path) as f:
        text = f.read()
    chunk_len0 = block_len - 1  # Leave space for a TOKEN_ENDOFTEXT
    tokens = tokenizer.encode(text)
    blocks = []
    pos = 0
    while pos < len(tokens):
        chunk = tokens[pos: pos + chunk_len0]
        chunk.append(TOKEN_ENDOFTEXT)
        blocks.append(chunk)
        pos += chunk_len0

    if len(blocks[-1]) < block_len:
        del blocks[-1]

    return blocks

In [73]:
def prepare_dsets(text_path: str, tokenizer: transformers.PreTrainedTokenizer, block_len: int):
    """Read the text, prepare the datasets """
    data = break_text_to_pieces(text_path, tokenizer, block_len)
    data_train, data_val = train_val_split(data, 0.1)
    return MyDset(data_train), MyDset(data_val)

In [74]:
def train_val_split(data: list[str], ratio: float):
    n = len(data)
    assert n >= 2
    n_val = max(1, int(n * ratio))
    return data[n_val:], data[:n_val]

In [75]:
class MyDset(torch.utils.data.Dataset):
    """A custom dataset that serves 1024-token blocks as input_ids == labels"""
    def __init__(self, data: list[list[int]]):
        self.data = []
        for d in data:
            input_ids = torch.tensor(d, dtype=torch.int64)
            attention_mask = torch.ones(len(d), dtype=torch.int64)
            self.data.append({'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': input_ids})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        return self.data[idx]


In [77]:
    import locale
    locale.getpreferredencoding = lambda: "UTF-8"
    dset_train, dset_val = prepare_dsets(TEXT_CORPUS, tokenizer, BLOCK_LEN)
    loader_train = torch.utils.data.DataLoader(dset_train, batch_size=1)
    loader_val = torch.utils.data.DataLoader(dset_val, batch_size=1)

In [79]:
def train_one(model: torch.nn.Module, loader: torch.utils.data.DataLoader, optimizer: torch.optim.Optimizer):
    """Standard PyTorch training, one epoch"""
    model.train()
    losses = []
    for batch in tqdm.tqdm(loader):
        for k, v in batch.items():
            batch[k] = v.to(DEVICE)
        optimizer.zero_grad()
        out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        # loss, logits, past_key_values
        loss = out['loss']
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    return np.mean(losses)

In [80]:
def val_one(model: torch.nn.Module, loader: torch.utils.data.DataLoader):
    """Standard PyTorch eval, one epoch"""
    model.eval()
    losses = []
    for batch in tqdm.tqdm(loader):
        for k, v in batch.items():
            batch[k] = v.to(DEVICE)
        with torch.no_grad():
            out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        # loss, logits, past_key_values
        loss = out['loss']
        losses.append(loss.item())
    return np.mean(losses)

In [81]:
import wandb
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [87]:
wandb.init(
    project="sadGPT",
    config={
    "learning_rate": 1e-3,
    "architecture": "gpt2",
    "dataset": "https://huggingface.co/datasets/Crapp/sadQuotes/raw/main/quotes.csv",
    "epochs": 20,
    })

In [88]:
for epoch in range(20):
        loss_train = train_one(model, loader_train, optimizer)
        loss_val = val_one(model, loader_val)
        print(f'{epoch} : loss_train={loss_train}, loss_val={loss_val}')
        wandb.log({"loss_train": loss_train, "loss_val": loss_val})

100%|██████████| 135/135 [00:26<00:00,  5.14it/s]
100%|██████████| 14/14 [00:00<00:00, 18.60it/s]


0 : loss_train=0.11798471010945462, loss_val=5.875938756125314


100%|██████████| 135/135 [00:27<00:00,  4.98it/s]
100%|██████████| 14/14 [00:00<00:00, 19.19it/s]


1 : loss_train=0.11066545162487913, loss_val=5.8920718261173795


100%|██████████| 135/135 [00:26<00:00,  5.14it/s]
100%|██████████| 14/14 [00:00<00:00, 20.00it/s]


2 : loss_train=0.1105152759011145, loss_val=5.818116835185459


100%|██████████| 135/135 [00:26<00:00,  5.12it/s]
100%|██████████| 14/14 [00:00<00:00, 19.52it/s]


3 : loss_train=0.11273168915951694, loss_val=5.97877311706543


100%|██████████| 135/135 [00:26<00:00,  5.07it/s]
100%|██████████| 14/14 [00:00<00:00, 20.00it/s]


4 : loss_train=0.10116661519364074, loss_val=6.063339846474784


100%|██████████| 135/135 [00:26<00:00,  5.10it/s]
100%|██████████| 14/14 [00:00<00:00, 19.74it/s]


5 : loss_train=0.10057174577205269, loss_val=6.130635874611991


100%|██████████| 135/135 [00:26<00:00,  5.11it/s]
100%|██████████| 14/14 [00:00<00:00, 19.47it/s]


6 : loss_train=0.10709198848516853, loss_val=6.137974534715925


100%|██████████| 135/135 [00:26<00:00,  5.09it/s]
100%|██████████| 14/14 [00:00<00:00, 19.24it/s]


7 : loss_train=0.10492534209732655, loss_val=6.0036464759281705


100%|██████████| 135/135 [00:26<00:00,  5.09it/s]
100%|██████████| 14/14 [00:00<00:00, 19.28it/s]


8 : loss_train=0.10781721814914987, loss_val=5.948036432266235


100%|██████████| 135/135 [00:26<00:00,  5.10it/s]
100%|██████████| 14/14 [00:00<00:00, 19.70it/s]


9 : loss_train=0.10292802505471088, loss_val=6.065492766244071


100%|██████████| 135/135 [00:26<00:00,  5.10it/s]
100%|██████████| 14/14 [00:00<00:00, 19.60it/s]


10 : loss_train=0.0947672523006245, loss_val=5.894904204777309


100%|██████████| 135/135 [00:26<00:00,  5.09it/s]
100%|██████████| 14/14 [00:00<00:00, 19.43it/s]


11 : loss_train=0.09057616765300433, loss_val=5.903630903788975


100%|██████████| 135/135 [00:26<00:00,  5.08it/s]
100%|██████████| 14/14 [00:00<00:00, 19.76it/s]


12 : loss_train=0.09394439652010246, loss_val=6.159625632422311


100%|██████████| 135/135 [00:26<00:00,  5.09it/s]
100%|██████████| 14/14 [00:00<00:00, 19.15it/s]


13 : loss_train=0.09549259576532576, loss_val=5.984661476952689


100%|██████████| 135/135 [00:26<00:00,  5.10it/s]
100%|██████████| 14/14 [00:00<00:00, 19.22it/s]


14 : loss_train=0.0873871274292469, loss_val=6.17333367892674


100%|██████████| 135/135 [00:26<00:00,  5.09it/s]
100%|██████████| 14/14 [00:00<00:00, 19.75it/s]


15 : loss_train=0.09377681061073585, loss_val=6.156057255608695


100%|██████████| 135/135 [00:26<00:00,  5.09it/s]
100%|██████████| 14/14 [00:00<00:00, 19.24it/s]


16 : loss_train=0.10323801382824227, loss_val=6.163485527038574


100%|██████████| 135/135 [00:26<00:00,  5.09it/s]
100%|██████████| 14/14 [00:00<00:00, 19.51it/s]


17 : loss_train=0.0944123996766629, loss_val=6.115964242390224


100%|██████████| 135/135 [00:26<00:00,  5.08it/s]
100%|██████████| 14/14 [00:00<00:00, 19.47it/s]


18 : loss_train=0.08316223000486692, loss_val=6.0909091745104105


100%|██████████| 135/135 [00:26<00:00,  5.09it/s]
100%|██████████| 14/14 [00:00<00:00, 19.69it/s]

19 : loss_train=0.08470383475500125, loss_val=6.1462615217481344





In [92]:
    if loss_val <= loss_train:
        wandb.alert(
            title='Validation lower that Train?',
            text=f'Val {loss_val} is below the Train losss {loss_train}',
        )
        print('Alert triggered')

In [93]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
loss_train,█▇▆▇▅▄▆▅▆▅▃▂▃▃▂▃▅▃▁▁
loss_val,▂▂▁▄▆▇▇▅▄▆▃▃█▄███▇▆▇

0,1
loss_train,0.0847
loss_val,6.14626


In [95]:
        model.save_pretrained('./trained_model/')
        tokenizer.save_pretrained('./trained_model/')

('./trained_model/tokenizer_config.json',
 './trained_model/special_tokens_map.json',
 './trained_model/vocab.json',
 './trained_model/merges.txt',
 './trained_model/added_tokens.json',
 './trained_model/tokenizer.json')

In [148]:
    prompt = 'Purpose of Life?'
    #prompt = 'Purpose of sex?' #memorized!
    batch = tokenizer([prompt], return_tensors='pt')
    for k, v in batch.items():
        batch[k] = v.to(DEVICE)
    out = model.generate(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], max_length=50)
    outGen=tokenizer.batch_decode(out.cpu())
    print(outGen)
    outGenFirst = '. '.join(outGen[0].split('\n')[:-1])
    print(outGenFirst.strip(prompt))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Purpose of Life?\nAre you doing it all the time\n But if you do it with involvement and pride, it’s a quality of involvement, not an act\n\nIn positive exchange, you give whatever you can without caring what']
. Are you doing it all the time.  But if you do it with involvement and pride, it’s a quality of involvement, not an act.


In [126]:
from transformers import pipeline
generator = pipeline('text-generation', model='gpt2')
def generate(text):
    result = generator(text, max_length=300, num_return_sequences=1)
    #result = generator(text, max_length=30, num_return_sequences=1)
    return result[0]["generated_text"]

In [128]:
generate(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"Purpose of Life? (1993)\n\nDangerous to Children, Not Kids(1989)\n\nDangerous To Adults, Not Children(1993)\n\nDevoted, Kind, Compassionate, Love a Lot(1991) Part 1(1992, 1993, 1994)(1993,1994)\n\nDogma Is Harmful, Loving Ourselves. (1991)\n\nDevoted, Kind, Compassionate, Love a Lot(1991) Part 2(1992)\n\nDevoted, Kind, Compassionate, Love a Lot(1991) Part 3(1992)\n (1994, 1994)\n\nDr. Lacey's Dictionary of Sex, Drugs, and Relationships(1991)\n\nDr. Lacey's Dictionary of Sex, Drugs, and Relationships(1991) Part 4(1993)\n\nDangerous to Children, Not Kids(1991)\n\nDangerous To Adults, Not Children(1993)\n\nDeeply Awkward, Kind, Compassionate, Like Love a Lot(1992)\n\nDevoted, Kind, Compassionate, Love a Lot(1992) Part 5(1992)\n\nDevoted, Kind, Compassionate, Love a Lot(1992) Part 6(1992)\n\nDeeply Awkward, Kind, Compassionate, Like Love a Lot(1992) Part 7(1992)\n\nDeeply Awkward, Kind, Compassionate"

In [134]:
!pip install huggingface_hub



In [135]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [138]:
model.push_to_hub('sadGPTwandb')

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Crapp/sadGPTwandb/commit/bf80955ada95bbfacd20fef4c4a44ddb5b99ddd6', commit_message='Upload model', commit_description='', oid='bf80955ada95bbfacd20fef4c4a44ddb5b99ddd6', pr_url=None, pr_revision=None, pr_num=None)

In [140]:
!ls -ltrh trained_model

total 478M
-rw-r--r-- 1 root root  912 Aug  6 19:48 config.json
-rw-r--r-- 1 root root  124 Aug  6 19:48 generation_config.json
-rw-r--r-- 1 root root 475M Aug  6 19:48 pytorch_model.bin
-rw-r--r-- 1 root root  234 Aug  6 19:48 tokenizer_config.json
-rw-r--r-- 1 root root   99 Aug  6 19:48 special_tokens_map.json
-rw-r--r-- 1 root root 780K Aug  6 19:48 vocab.json
-rw-r--r-- 1 root root 446K Aug  6 19:48 merges.txt
-rw-r--r-- 1 root root 2.1M Aug  6 19:48 tokenizer.json


In [142]:
tokenizer.push_to_hub('sadGPTwandb')

CommitInfo(commit_url='https://huggingface.co/Crapp/sadGPTwandb/commit/5ed506004a63aab9a84220b952cdf13fc6199472', commit_message='Upload tokenizer', commit_description='', oid='5ed506004a63aab9a84220b952cdf13fc6199472', pr_url=None, pr_revision=None, pr_num=None)

In [143]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Crapp/sadGPTwandb")

Downloading (…)lve/main/config.json:   0%|          | 0.00/912 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [145]:
pipe(prompt)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Purpose of Life?\nEvery life is a possibility\n If you want to keep that possibility open, never ever form an opinion of any sort on anyone\n\nWhen you eat, you take in a part of the earth\n How we treat it'}]

In [146]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Crapp/sadGPTwandb")
model = AutoModelForCausalLM.from_pretrained("Crapp/sadGPTwandb")

In [149]:
    prompt = 'Purpose of Life?'
    #prompt = 'Purpose of sex?' #memorized!
    batch = tokenizer([prompt], return_tensors='pt')
    for k, v in batch.items():
        batch[k] = v#.to(DEVICE)
    out = model.generate(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], max_length=50)
    outGen=tokenizer.batch_decode(out.cpu())
    print(outGen)
    outGenFirst = '. '.join(outGen[0].split('\n')[:-1])
    print(outGenFirst.strip(prompt))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Purpose of Life?\nAre you doing it all the time\n But if you do it with involvement and pride, it’s a quality of involvement, not an act\n\nIn positive exchange, you give whatever you can without caring what']
. Are you doing it all the time.  But if you do it with involvement and pride, it’s a quality of involvement, not an act.
