### Load Dataset

### Read data

In [1]:
import pandas as pd

In [2]:
!ls /kaggle/input/shakespeare-plays

Shakespeare_data.csv  alllines.txt  william-shakespeare-black-silhouette.jpg


In [3]:
df = pd.read_csv('/kaggle/input/shakespeare-plays/Shakespeare_data.csv')
df = df.dropna() # drop lines which doesn't correspond to player, lines like "ACT I"
df = df.reset_index()

In [4]:
print(f'df.shape = {df.shape}')

first_test_elem = df.shape[0] * 0.85

train_df = df[df.index < first_test_elem]
test_df = df[df.index >= first_test_elem]

print(f'train_df.shape = {train_df.shape}')
print(f'test_df.shape = {test_df.shape}')

# simple idea just to consider data as a continius stream of player lines

train_df = train_df[['PlayerLine']]
test_df = test_df[['PlayerLine']]

def write_text(df, file_name):
    with open(file_name, 'w') as fout:
        fout.write(' '.join(df['PlayerLine'].tolist()))

write_text(train_df, 'train.txt')
write_text(test_df, 'test.txt')

df.shape = (105152, 7)
train_df.shape = (89380, 7)
test_df.shape = (15772, 7)


### Transformers baseline

In [5]:
!pip install transformers[torch] accelerate



Imports

In [6]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import math



Load model and tokenizer

In [7]:
# Load the pre-trained GPT-2 model and tokenizer
model_name = "gpt2"  # Choose an appropriate model size
# model_name = "garipovroma/gpt_2_shakespeare_finetuned-1"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Define train and val datasets

In [8]:
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='train.txt',  # Replace with the actual path
    block_size=400  # Adjust the block size as needed
)

val_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='test.txt',  # Replace with the actual path
    block_size=400  # Adjust the block size as needed
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)



Training args

In [9]:
# Fine-tuning arguments
training_args = TrainingArguments(
    output_dir="./shakespeare_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,  # Adjust the number of epochs as needed
    per_device_train_batch_size=8,  # Adjust batch size as needed
    per_device_eval_batch_size=8,
    evaluation_strategy="steps",
    save_steps=50,
    eval_steps=10,
    logging_steps=10,  # Adjust logging frequency
    remove_unused_columns=False,  # Needed for custom metrics
#     save_total_limit=3,  # Limit the number of checkpoints saved
)



In [10]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Calculate perplexity
    loss = torch.nn.functional.cross_entropy(predictions.view(-1, predictions.size(-1)), labels.view(-1))
    perplexity = math.exp(loss.item())

    # Convert predictions and labels to lists of strings
    predictions = [str(np.argmax(pred)) for pred in predictions]
    labels = [str(label.item()) for label in labels]

    # Calculate BLEU score
    reference = [labels]  # Reference is a list of lists
    hypothesis = predictions
    bleu_score = sentence_bleu(reference, hypothesis, smoothing_function=SmoothingFunction().method1)

    return {"perplexity": perplexity, "bleu": bleu_score}

Trainer

In [11]:
# Create a Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
#     compute_metrics=compute_metrics
)

Run train loop

In [12]:
# Fine-tune the model
trainer.train()

# Save the model after training
model.save_pretrained("./shakespeare_finetuned")

# You can continue to generate text using the fine-tuned model as needed


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss,Validation Loss
10,4.6654,4.479091
20,4.5958,4.437127
30,4.4794,4.402711
40,4.4387,4.38535
50,4.4554,4.36745
60,4.4451,4.351386
70,4.4063,4.338783
80,4.4233,4.326476
90,4.3844,4.321807
100,4.3861,4.3123




In [13]:
8a9d0ec84480172dec73ed7432e692e34fe76fcf

SyntaxError: invalid decimal literal (1185902887.py, line 1)

In [15]:
model.push_to_hub("gpt_2_shakespeare_finetuned-2-400", use_auth_token="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")
tokenizer.push_to_hub("gpt_2_shakespeare_finetuned-2-400", use_auth_token="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")



pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]



CommitInfo(commit_url='https://huggingface.co/garipovroma/gpt_2_shakespeare_finetuned-2-400/commit/d562f8e87bb73613f5f81bd2f2471fd08bd59180', commit_message='Upload tokenizer', commit_description='', oid='d562f8e87bb73613f5f81bd2f2471fd08bd59180', pr_url=None, pr_revision=None, pr_num=None)

In [16]:
213

213

In [13]:
model.push_to_hub("gpt_2_shakespeare_finetuned-1", use_auth_token="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")
tokenizer.push_to_hub("gpt_2_shakespeare_finetuned-1", use_auth_token="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")



pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]



CommitInfo(commit_url='https://huggingface.co/garipovroma/gpt_2_shakespeare_finetuned-1/commit/c2f2c4023e225822a8cf1c2ec7be19e34f4fe0c8', commit_message='Upload tokenizer', commit_description='', oid='c2f2c4023e225822a8cf1c2ec7be19e34f4fe0c8', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# !zip shakespeare_finetuned_400_0.zip -r shakespeare_finetuned

In [13]:
!cd shakespeare_finetuned && ls

checkpoint-100	checkpoint-250	checkpoint-400	generation_config.json
checkpoint-150	checkpoint-300	checkpoint-50	pytorch_model.bin
checkpoint-200	checkpoint-350	config.json	runs


In [14]:
!zip shakespeare_finetuned.zip -r shakespeare_finetuned/

  adding: shakespeare_finetuned/ (stored 0%)
  adding: shakespeare_finetuned/checkpoint-250/ (stored 0%)
  adding: shakespeare_finetuned/checkpoint-250/scheduler.pt (deflated 49%)
  adding: shakespeare_finetuned/checkpoint-250/training_args.bin (deflated 49%)
  adding: shakespeare_finetuned/checkpoint-250/optimizer.pt (deflated 8%)
  adding: shakespeare_finetuned/checkpoint-250/generation_config.json (deflated 24%)
  adding: shakespeare_finetuned/checkpoint-250/rng_state.pth (deflated 28%)
  adding: shakespeare_finetuned/checkpoint-250/config.json (deflated 51%)
  adding: shakespeare_finetuned/checkpoint-250/pytorch_model.bin (deflated 7%)
  adding: shakespeare_finetuned/checkpoint-250/trainer_state.json (deflated 82%)
  adding: shakespeare_finetuned/checkpoint-300/ (stored 0%)
  adding: shakespeare_finetuned/checkpoint-300/scheduler.pt (deflated 48%)
  adding: shakespeare_finetuned/checkpoint-300/training_args.bin (deflated 49%)
  adding: shakespeare_finetuned/checkpoint-300/optimizer

In [15]:
!ls -lh

total 7.0M
-rw-r--r--  1 root root 447K Sep 26 10:09 cached_lm_GPT2Tokenizer_400_test.txt
-rw-r--r--  1 root root    0 Sep 26 10:09 cached_lm_GPT2Tokenizer_400_test.txt.lock
-rw-r--r--  1 root root 2.5M Sep 26 10:09 cached_lm_GPT2Tokenizer_400_train.txt
-rw-r--r--  1 root root    0 Sep 26 10:09 cached_lm_GPT2Tokenizer_400_train.txt.lock
drwxr-xr-x 11 root root 4.0K Sep 26 10:27 shakespeare_finetuned
-rw-r--r--  1 root root 615K Sep 26 10:08 test.txt
-rw-r--r--  1 root root 3.5M Sep 26 10:08 train.txt
drwxr-xr-x  3 root root 4.0K Sep 26 10:09 wandb


Thread WriterThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal_util.py", line 49, in run
    self._run()
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal_util.py", line 100, in _run
    self._process(record)
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/internal.py", line 380, in _process
    self._wm.write(record)
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/writer.py", line 154, in write
    write_handler(record)
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/writer.py", line 135, in _write
    self._write_record(record)
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/writer.py", line 109, in _write_record
    ret = self._ds.write(record)
  File "/opt/conda/lib/python3.10/site-packages/wandb/sdk/internal/datastore.py", line 289, in write
    ret = self._write_data(s)
  File "/opt/conda/lib/python3.10/site-pac

In [None]:
!ls

In [None]:
# tokenizer.push_to_hub("gpt_2_shakespeare_finetuned", use_auth_token="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")

In [None]:
# train_df

In [21]:
model = GPT2LMHeadModel.from_pretrained("shakespeare_finetuned/checkpoint-400/")

In [19]:
!ls -a shakespeare_finetuned/checkpoint-400

.	     generation_config.json  rng_state.pth	 training_args.bin
..	     optimizer.pt	     scheduler.pt
config.json  pytorch_model.bin	     trainer_state.json


In [22]:
# Move the model to CUDA (GPU) if available


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

from transformers import pipeline, set_seed
generator = pipeline('text-generation', model=model, tokenizer=tokenizer, device=device)
set_seed(42)
generator("To be or not to be", max_length=120, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "To be or not to be, no true soldier? So, if she be not worthy of the soldier's command, yet be worthy of him: here is another letter. Ay, yet I had rather than a little. Ay, yet I had rather than a dozen, than a soldier, yet a slave, yet a fool, yet a slave, yet a fool, yet a fool, yet a fool, yet a fool, yet a fool, yet a fool, yet a fool, yet a fool, yet a fool, yet a fool, yet a fool, yet a fool, yet"},
 {'generated_text': "To be or not to be? What means't you here for? What do you say of a man that will do her harm? Well I told her, I have to look after that you do. My lord, I am your father, good sir. Nay, I'll give you the truth of this matter, my father is your husband and I shall take no interest in her if he will not. What if he is the heir? Nay, we'll take no interest in him: he must lose the king with his wife and he shall be heir of that king. So do'ster"},
 {'generated_text': "To be or not to be: this cannot be. I have come so far, Lord Talbot, that I 

In [28]:
generator("The weather today is fine!", max_length=120, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "The weather today is fine! A good deal of this weather: some of it is that dares not touch the soil, and so no doubt comes from that the dusky sun bears a little light: It bears so little, indeed, in a pinch as is that much, being most heavy on one foot. When this news comes, that the world was no longer the world, let me tell my noble brother that this morning is well. Now do you know, my lord? I, my lady, here confess'd that the world had no soul to endure so much as this, so my"},
 {'generated_text': 'The weather today is fine! If we hold that peace, it is so, I am sure, we will make for York, and henceforth we shall be glad. But what say your peers! Come, your lordship, my lords, there is the clerk, who hath gone with us to the court: but, my lord, my noble lord, he hath not gone. Let it be known to you, gentleman, that, in this state of peril, I think the better way may be known. And to his good master, your noble lord, I fear you well: I am'},
 {'generated_te

In [29]:
len("The weather today is fine! A good deal of this weather: some of it is that dares not touch the soil, and so no doubt comes from that the dusky sun bears a little light: It bears so little, indeed, in a pinch as is that much, being most heavy on one foot. When this news comes, that the world was no longer the world, let me tell my noble brother that this morning is well. Now do you know, my lord? I, my lady, here confess'd that the world had no soul to endure so much as this, so my")

485