# Codeathon 3, DS 6050, Justin Roberts (jrr4n)

## Setup

In [1]:
!pip install -q --upgrade keras-hub
!pip install -q --upgrade keras  # Upgrade to Keras 3.

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/644.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m634.9/644.1 kB[0m [31m20.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m644.1/644.1 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import os
import keras_hub
import keras
import tensorflow as tf

import tensorflow.data as tf_data
import tensorflow.strings as tf_strings

In [5]:
# An empty device list is falsy, so this reports whether TensorFlow lists any GPU device.
gpu_devices = tf.config.list_physical_devices('GPU')
print("GPU is", "available" if gpu_devices else "not available")

GPU is available


## Settings & hyperparameters

In [6]:
# Data
BATCH_SIZE = 64
MIN_STRING_LEN = 512  # Lines must be strictly longer than this to be kept (the filter uses `>`)
SEQ_LEN = 128  # Length of training sequences, in tokens

# Model
EMBED_DIM = 256
FEED_FORWARD_DIM = 128
# NOTE(review): EMBED_DIM (256) is not evenly divisible by NUM_HEADS (3) — confirm this is intended.
NUM_HEADS = 3
NUM_LAYERS = 2
VOCAB_SIZE = 5000  # Limits parameters in model.

# Training
EPOCHS = 5

# Inference
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed.
NUM_TOKENS_TO_GENERATE = 80

In [7]:
import os
import zipfile
import keras

# Everything is downloaded and extracted relative to the current working directory.
cwd = os.getcwd()

# Download the archive without extracting it; get_file returns the local path of the archive.
file_path = keras.utils.get_file(
    fname="simplebooks.zip",
    origin="https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip",
    extract=False,  # Extraction is done manually below
    cache_dir=cwd,  # Keep the download under the current working directory
)

# Extract the zip file manually into the current working directory.
with zipfile.ZipFile(file_path, "r") as zip_ref:
    zip_ref.extractall(cwd)

# `data_dir` (rather than `dir`) avoids shadowing the Python builtin of the same name.
data_dir = os.path.join(cwd, "simplebooks/")
corpus_dir = os.path.join(data_dir, "simplebooks-92-raw")


def _load_long_lines(path):
    """Read `path` line by line, keep lines longer than MIN_STRING_LEN, and batch them."""
    return (
        tf_data.TextLineDataset(path)
        .filter(lambda line: tf_strings.length(line) > MIN_STRING_LEN)
        .batch(BATCH_SIZE)
    )


# Training split: shuffling is applied after batching, so whole batches are shuffled.
raw_train_ds = _load_long_lines(os.path.join(corpus_dir, "train.txt")).shuffle(
    buffer_size=256
)

# Validation split: same filtering and batching, no shuffling.
raw_val_ds = _load_long_lines(os.path.join(corpus_dir, "valid.txt"))

print(f"Dataset extracted to: {data_dir}")

Downloading data from https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip
[1m282386239/282386239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 0us/step
Dataset extracted to: /content/simplebooks/


In [8]:
# Learn a WordPiece vocabulary from the batched training text.
# The reserved tokens are listed explicitly so they are part of the vocabulary.
reserved_tokens = ["[PAD]", "[UNK]", "[BOS]"]
vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds,
    vocabulary_size=VOCAB_SIZE,
    lowercase=True,
    reserved_tokens=reserved_tokens,
)

In [9]:
# Tokenizer built on the learned vocabulary; it is configured with SEQ_LEN as its sequence length.
tokenizer_options = {
    "vocabulary": vocab,
    "sequence_length": SEQ_LEN,
    "lowercase": True,
}
tokenizer = keras_hub.tokenizers.WordPieceTokenizer(**tokenizer_options)

In [10]:
# The packer prepends the [BOS] token id and packs sequences to SEQ_LEN.
bos_token_id = tokenizer.token_to_id("[BOS]")
start_packer = keras_hub.layers.StartEndPacker(
    sequence_length=SEQ_LEN,
    start_value=bos_token_id,
)


def preprocess(inputs):
    """Turn a batch of raw strings into a (features, labels) pair.

    The labels are the tokenizer output itself; the features are the same
    token ids passed through `start_packer`, which adds the start token.
    """
    token_ids = tokenizer(inputs)
    return start_packer(token_ids), token_ids


# Tokenize both splits into (features, labels) pairs, then prefetch so that
# input preparation can overlap with training.
train_ds = raw_train_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE)
train_ds = train_ds.prefetch(tf_data.AUTOTUNE)

val_ds = raw_val_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE)
val_ds = val_ds.prefetch(tf_data.AUTOTUNE)

In [11]:
# Variable-length sequences of int32 token ids.
inputs = keras.layers.Input(shape=(None,), dtype="int32")

# Token + position embedding (created with mask_zero=True).
embedding_layer = keras_hub.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)
x = embedding_layer(inputs)

# Stack NUM_LAYERS decoder blocks. Each is called with a single argument,
# which skips cross-attention.
decoder_options = {"num_heads": NUM_HEADS, "intermediate_dim": FEED_FORWARD_DIM}
for _ in range(NUM_LAYERS):
    decoder_layer = keras_hub.layers.TransformerDecoder(**decoder_options)
    x = decoder_layer(x)

# Project every position onto the vocabulary; the outputs are raw logits.
outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)

# Both the loss and the perplexity metric are configured to consume logits;
# perplexity additionally masks token id 0.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
perplexity = keras_hub.metrics.Perplexity(from_logits=True, mask_token_id=0)
model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])

In [12]:
# Display the Keras summary of the model built in the previous cell.
model.summary()

## Training

In [13]:
# Train for EPOCHS epochs on the tokenized training set, validating on val_ds after each epoch.
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)

Epoch 1/5




   2443/Unknown [1m142s[0m 51ms/step - loss: 4.9906 - perplexity: 174.7564

  self.gen.throw(typ, value, traceback)


[1m2444/2444[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 53ms/step - loss: 4.9901 - perplexity: 174.6750 - val_loss: 4.1867 - val_perplexity: 65.9519
Epoch 2/5
[1m2444/2444[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 49ms/step - loss: 4.1755 - perplexity: 65.1365 - val_loss: 4.1095 - val_perplexity: 61.0230
Epoch 3/5
[1m2444/2444[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 49ms/step - loss: 4.0333 - perplexity: 56.4843 - val_loss: 4.0263 - val_perplexity: 56.0888
Epoch 4/5
[1m2444/2444[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 49ms/step - loss: 3.9607 - perplexity: 52.5219 - val_loss: 4.0093 - val_perplexity: 55.1708
Epoch 5/5
[1m2444/2444[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 49ms/step - loss: 3.9200 - perplexity: 50.4315 - val_loss: 3.9870 - val_perplexity: 54.0164


<keras.src.callbacks.history.History at 0x79f3c0284340>

## Inference

In [14]:
# Start from an empty string: the packer layer adds the [BOS] token for us,
# so the prompt is [BOS] followed by padding.
empty_prompt = tokenizer([""])
prompt_tokens = start_packer(empty_prompt)
prompt_tokens

<tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int32)>

In [15]:

def next(prompt, cache, index):
    """Return the next-token logits for the samplers.

    Runs `model` on the whole prompt and takes the logits at position
    `index - 1`. No hidden states are produced (returned as None; they are
    only needed for contrastive search) and `cache` is passed back unchanged.

    NOTE: this name shadows Python's builtin `next`; it is kept because the
    sampler calls in this notebook pass it as `next=next`.
    """
    all_logits = model(prompt)
    step_logits = all_logits[:, index - 1, :]
    return step_logits, None, cache


### Greedy search

In [16]:
sampler = keras_hub.samplers.GreedySampler()
# index=1: start sampling immediately after the [BOS] token.
output_tokens = sampler(next=next, prompt=prompt_tokens, index=1)
txt = tokenizer.detokenize(output_tokens)
print(f"Greedy search generated text: \n{txt}\n")

Greedy search generated text: 
['[BOS] " i don \' t know , " said the doctor , " but i \' m afraid i \' m afraid i \' m going to do it . i \' m afraid i \' m going to do it . i \' m going to be a good place , and i \' m going to get a good deal of trouble . i \' m going to be a good place , and i \' m going to get a good deal of trouble . i \' m going to get a good deal of trouble . i \' m going to get a good deal of trouble . i \' m going to get a good deal of trouble . i \' m']



### Beam search

In [17]:
# Beam search with 10 beams.
sampler = keras_hub.samplers.BeamSampler(num_beams=10)
# index=1: start sampling immediately after the [BOS] token.
output_tokens = sampler(next=next, prompt=prompt_tokens, index=1)
txt = tokenizer.detokenize(output_tokens)
print(f"Beam search generated text: \n{txt}\n")

Beam search generated text: 
['[BOS] " i don \' t know , " he said , " but i don \' t know what i can do . i don \' t know , but i don \' t know what to do . i don \' t know what to do , but i don \' t know . i don \' t know what to do , but i don \' t know what to do . i don \' t know what to do , but i don \' t know what to do . i don \' t know what to do , but i don \' t know . i don \' t know what to do . i don \' t know what to do .']



### Random search

In [18]:
# Seed the sampler so that re-running the notebook reproduces the same sample.
sampler = keras_hub.samplers.RandomSampler(seed=42)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,  # Start sampling immediately after the [BOS] token.
)
txt = tokenizer.detokenize(output_tokens)
print(f"Random search generated text: \n{txt}\n")

Random search generated text: 
['[BOS] " turn your sword well , " said lang question , chatterer by this time , for he makes up other way along until he could be drivenge or an inch of hab cavalry . of course he was so lucky for his determined experience with the sun . he was not prescitably able to follow him , and he finally got the whip on the rope , and potter was i in closet . and then he made up his mind to proceed from the whip . it was a pigeon with an understanding that he had two arrows when he bent and gloat on , a trip on any account']



### Top-K search

In [19]:
# Sample only from the 10 most likely tokens at each step.
# Seed the sampler so that re-running the notebook reproduces the same sample.
sampler = keras_hub.samplers.TopKSampler(k=10, seed=42)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,  # Start sampling immediately after the [BOS] token.
)
txt = tokenizer.detokenize(output_tokens)
print(f"Top-K search generated text: \n{txt}\n")

Top-K search generated text: 
['[BOS] " yes , " said mr . brenton ; " but you \' re not the least doubt of you , and it is a bad habitual to say goodby , " " " and a few minutes later he had a good time . " he will go to the house . but he \' ll have a good time when he was to wait for a few minutes . you know that he was not a very long , and it had been a chapter , when he saw the young man standing at the edge of the forest , and he had to wait for some time , for he had seen them , and his father had']



### Top-P search

In [20]:
# Sample from the smallest set of tokens whose cumulative probability reaches p=0.5.
# Seed the sampler so that re-running the notebook reproduces the same sample.
sampler = keras_hub.samplers.TopPSampler(p=0.5, seed=42)
output_tokens = sampler(
    next=next,
    prompt=prompt_tokens,
    index=1,  # Start sampling immediately after the [BOS] token.
)
txt = tokenizer.detokenize(output_tokens)
print(f"Top-P search generated text: \n{txt}\n")

Top-P search generated text: 
['[BOS] but the esther sparks of flanders , who was conquered and put to death , had been in the battle of the american army . the army had been gathered about the army and marched to england . the french army , as the king , the prince , who had arrived at calcutta , had sent a messenger to him . the duke of seccano , and the news that the whole of the english had to leave the country , and had been received with great force . the french were now seized with an army , and as the spaniards , as it was , they']



### Using callbacks for text generation

In [21]:

class TopKTextGenerator(keras.callbacks.Callback):
    """A callback to generate text from a trained model using top-k.

    At the end of every epoch it samples from the notebook-level `model`
    (through the `next` function), starting from `prompt_tokens`, and prints
    the detokenized text.

    Args:
        k: number of highest-probability tokens the sampler chooses from.
    """

    def __init__(self, k):
        # Initialise the base Callback first so its own attributes are set up
        # before Keras attaches the model and params to this instance.
        super().__init__()
        self.sampler = keras_hub.samplers.TopKSampler(k)

    def on_epoch_end(self, epoch, logs=None):
        """Generate and print one sample; `epoch` and `logs` are not used."""
        output_tokens = self.sampler(
            next=next,
            prompt=prompt_tokens,
            index=1,  # Start sampling immediately after the [BOS] token.
        )
        txt = tokenizer.detokenize(output_tokens)
        print(f"Top-K search generated text: \n{txt}\n")


text_generation_callback = TopKTextGenerator(k=10)

# Dummy training loop to demonstrate the callback: two epochs over a single batch.
model.fit(
    train_ds.take(1),
    verbose=2,
    epochs=2,
    callbacks=[text_generation_callback],
)

Epoch 1/2
Top-K search generated text: 
['[BOS] " i was not so much afraid ; he thought it was very strange and very strange to me . i was afraid , but it was very strange and very strange to him i had , but there was something in my mind . i told him that he was a good man , and he would not like to speak to him . then he said : that is that he was going to tell me . then he told me to be a boy , and i would not like him . and when the old man was angry , he had to do his work and he wanted to take a good care of his own . but the boy was so frightened']

1/1 - 13s - 13s/step - loss: 3.6836 - perplexity: 39.8201
Epoch 2/2
Top-K search generated text: 
['[BOS] " you see , and , " he said , " we are going to be a little more than a hundred times , when a collection of a large sparition of concealing his country and his friends to the capital , and there are few of those who live in apology , for you to live in the same country . you remember that your uncle has got a very good many thin

<keras.src.callbacks.history.History at 0x79f342bb1f90>

# **Experimenting with GPT-2**

In [1]:
!git clone https://github.com/openai/gpt-2.git

Cloning into 'gpt-2'...
remote: Enumerating objects: 239, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 239 (delta 1), reused 1 (delta 0), pack-reused 233 (from 1)[K
Receiving objects: 100% (239/239), 4.38 MiB | 18.24 MiB/s, done.
Resolving deltas: 100% (125/125), done.


In [1]:
# Switch into the cloned repository; the following cells use paths relative to it.
# NOTE: run this once per session — a second run from inside gpt-2 would look for gpt-2/gpt-2.
%cd gpt-2

/content/gpt-2


In [2]:
# One-off step: with the working directory already changed to gpt-2 (cell above), uncomment the
# line below to force-reinstall the requirements file. After it has run once, comment it out
# ("hash it out") again and use the plain requirements install in the next cell instead.
#!pip install -r requirements.txt --force-reinstall

Collecting fire>=0.1.3 (from -r requirements.txt (line 1))
  Using cached fire-0.7.0-py3-none-any.whl
Collecting regex==2017.4.5 (from -r requirements.txt (line 2))
  Using cached regex-2017.4.5-cp310-cp310-linux_x86_64.whl
Collecting requests==2.21.0 (from -r requirements.txt (line 3))
  Using cached requests-2.21.0-py2.py3-none-any.whl.metadata (5.5 kB)
Collecting tqdm==4.31.1 (from -r requirements.txt (line 4))
  Using cached tqdm-4.31.1-py2.py3-none-any.whl.metadata (38 kB)
Collecting chardet<3.1.0,>=3.0.2 (from requests==2.21.0->-r requirements.txt (line 3))
  Using cached chardet-3.0.4-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting idna<2.9,>=2.5 (from requests==2.21.0->-r requirements.txt (line 3))
  Using cached idna-2.8-py2.py3-none-any.whl.metadata (8.9 kB)
Collecting urllib3<1.25,>=1.21.1 (from requests==2.21.0->-r requirements.txt (line 3))
  Using cached urllib3-1.24.3-py2.py3-none-any.whl.metadata (36 kB)
Collecting certifi>=2017.4.17 (from requests==2.21.0->-r require

In [2]:
!pip install -r requirements.txt



In [3]:
# Fetch the 117M checkpoint files with the repository's download script (run from inside gpt-2).
!python3 download_model.py 117M

Fetching checkpoint: 1.00kit [00:00, 517kit/s]                                                      
Fetching encoder.json: 1.04Mit [00:01, 566kit/s]                                                    
Fetching hparams.json: 1.00kit [00:00, 538kit/s]                                                    
Fetching model.ckpt.data-00000-of-00001: 498Mit [01:17, 6.40Mit/s]                                  
Fetching model.ckpt.index: 6.00kit [00:00, 4.94Mit/s]                                               
Fetching model.ckpt.meta: 472kit [00:01, 283kit/s]                                                  
Fetching vocab.bpe: 457kit [00:01, 316kit/s]                                                        


In [4]:
!pip install gpt-2-simple

Collecting gpt-2-simple
  Downloading gpt_2_simple-0.8.1.tar.gz (26 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting toposort (from gpt-2-simple)
  Downloading toposort-1.10-py3-none-any.whl.metadata (4.1 kB)
Downloading toposort-1.10-py3-none-any.whl (8.5 kB)
Building wheels for collected packages: gpt-2-simple
  Building wheel for gpt-2-simple (setup.py) ... [?25l[?25hdone
  Created wheel for gpt-2-simple: filename=gpt_2_simple-0.8.1-py3-none-any.whl size=24556 sha256=9168c0a712452a409fdb955b361c21fb7f213ef013f4eca12f6306dcdafcd4de
  Stored in directory: /root/.cache/pip/wheels/df/6a/fe/10d3223f78d1ac3e4c83bb4c5e2d28dfb1789c2fb4cc7ea8d0
Successfully built gpt-2-simple
Installing collected packages: toposort, gpt-2-simple
Successfully installed gpt-2-simple-0.8.1 toposort-1.10


In [2]:
# Sanity check: list the files that were downloaded for the 117M model.
!ls models/117M


checkpoint    hparams.json		      model.ckpt.index	vocab.bpe
encoder.json  model.ckpt.data-00000-of-00001  model.ckpt.meta


In [3]:
# Show the checkpoint index file, which names the checkpoint to be loaded.
!cat models/117M/checkpoint

model_checkpoint_path: "model.ckpt"
all_model_checkpoint_paths: "model.ckpt"


In [4]:
import gpt_2_simple as gpt2
import tensorflow as tf
import os

In [5]:
# Start the session object that the gpt-2-simple load/generate calls below operate on.
sess = gpt2.start_tf_sess()

In [6]:
# Directory containing the downloaded model folders (e.g. models/117M).
model_dir = 'models'

# Load the pretrained 117M weights into the session.
gpt2.load_gpt2(
    sess,
    model_name='117M',
    checkpoint_dir=model_dir,
)


Loading pretrained model models/117M/model.ckpt


In [7]:
# Generate one unconditional sample with the library's default generation settings.
# run_name and checkpoint_dir together point at models/117M.
gpt2.generate(sess, run_name='117M', checkpoint_dir=model_dir)

The problem is, may it not be a problem at all.

The big problem? We're all confused about what goes on inside of a car. Why is it so hard to get a driver's license when you can get a driver's license at a DMV? Why does it take two years for a car to get going?

Well, it's a little harder to get a driver's license than it is to get an auto insurance policy. But if you're living in Missouri, you're not going to be stuck with a $1,000 car insurance policy.

The problem is, it's not just cost.

Many people are seeing this as a problem, and it's time for us to start talking about it.

I have a problem with the idea that someone should pay for a car insurance policy.

I have a problem with the idea that people should be allowed to choose between a car insurance policy and driving privileges, and that's not a good idea.

What would happen if some of us decided to get our own car insurance?

I would lose my job, my ability to drive a car, my health, my freedom to drive a car.

I would lose my

In [8]:
# Generation settings for the "Once upon a time" prompt.
# NOTE(review): both top_k and top_p are set — confirm how gpt-2-simple combines them.
story_settings = dict(
    run_name='117M',
    checkpoint_dir=model_dir,
    length=100,                 # Number of tokens in the generated text
    temperature=0.7,            # Controls creativity (lower = more focused, higher = more creative)
    top_k=40,                   # Limits sampling to the top-k tokens (for coherence)
    top_p=0.9,                  # Cumulative probability to control coherence and diversity
    prefix="Once upon a time",  # Text prompt to start the generation
    nsamples=3,                 # Number of samples to generate
    batch_size=1,               # Number of samples to generate in parallel
    return_as_list=True,        # If True, returns the output as a list
)
gpt2.generate(sess, **story_settings)

["Once upon a time, you could think of the other planet as the world of the sun. The planet was never more than a satellite of the Sun. That's why we see it as a ring of stars. When the sun went out, it didn't go in and out.\n\nNow, to the question: Why is the universe so small? Because the universe is so small. And it's easy to say that. If you look at the stars, the stars are large. But if you look at",
 "Once upon a time, the humans had all of the energy of a giant, but they were all too close to the giants' homes. The humans were about to become the world's first humans, and when the Humans stopped the giant, they had to wait for the Humans to return.\n\nThe humans could not get through this. As they had seen from the humans, the humans had no choice but to wait. They would have to make it back to the humans, but they were still going to have to",
 'Once upon a time, the street was still and even now, it was a place of constant walking and walking.\n\nIt was a place where the sun s

In [10]:
# Same experiment with a shorter length and a much lower temperature.
# NOTE(review): both top_k and top_p are set — confirm how gpt-2-simple combines them.
data_science_settings = dict(
    run_name='117M',
    checkpoint_dir=model_dir,
    length=50,                             # Number of tokens in the generated text
    temperature=0.2,                       # Controls creativity (lower = more focused, higher = more creative)
    top_k=50,                              # Limits sampling to the top-k tokens (for coherence)
    top_p=0.9,                             # Cumulative probability to control coherence and diversity
    prefix="I love data science because",  # Text prompt to start the generation
    nsamples=3,                            # Number of samples to generate
    batch_size=1,                          # Number of samples to generate in parallel
    return_as_list=False,                  # If True, returns the output as a list
)
gpt2.generate(sess, **data_science_settings)

I love data science because it's so easy to understand and understand. It's a lot easier to understand than to understand the data.

I love data science because it's so easy to understand and understand. It's a lot easier to understand than to understand the data
I love data science because it's easy to understand and it's easy to understand.

I love data science because it's easy to understand and it's easy to understand.

I love data science because it's easy to understand and it's easy to understand.
I love data science because it's easy to understand and it's easy to understand how to use it. But I'm not sure that it's the right way to do it.

I think that the best way to do it is to have a database of all the


### Fine-tuning GPT-2

In [13]:
import requests

url = "https://www.gutenberg.org/files/84/84-0.txt"  # Example: Frankenstein by Mary Shelley

# Use a timeout so a stalled connection cannot hang the notebook, and fail loudly on an
# HTTP error instead of silently fine-tuning on an error page.
response = requests.get(url, timeout=60)
response.raise_for_status()

Frankenstein_book = response.text


In [14]:
# Write the book as UTF-8 explicitly: the text contains non-ASCII characters (curly quotes,
# dashes), and the platform's default encoding is not guaranteed to handle them.
with open('fine_tuning_data.txt', 'w', encoding='utf-8') as f:
    f.write(Frankenstein_book)

In [34]:
# Clear the default TensorFlow graph left over from the earlier session before building a new one.
tf.compat.v1.reset_default_graph()
# Fresh session dedicated to fine-tuning, with the pretrained 117M weights loaded into it.
finetune_sess = gpt2.start_tf_sess()
gpt2.load_gpt2(finetune_sess, model_name='117M', checkpoint_dir=model_dir)


Loading pretrained model models/117M/model.ckpt


In [35]:
# Fine-tune the 117M model on the downloaded book for 1000 steps, printing the loss every
# 10 steps, printing a sample every 200 steps and saving a checkpoint every 500 steps.
gpt2.finetune(finetune_sess,
               'fine_tuning_data.txt',
               model_name='117M',
               steps=1000,
               restore_from='latest',  # resume from this run's latest checkpoint; gpt-2-simple also accepts 'fresh' to start from the base model — confirm against its docs
               run_name='unique_run_name',
               print_every=10,
               sample_every=200,
               save_every=500,
               reuse=True)  # NOTE(review): presumably needed because load_gpt2 already built the model in this graph — confirm


Loading checkpoint models/117M/model.ckpt


  0%|                                                                        | 0/1 [00:00<?, ?it/s]

Loading dataset...


100%|████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.36it/s]


dataset has 106176 tokens
Training...
[10 | 25.98] loss=3.22 avg=3.22
[20 | 48.59] loss=2.67 avg=2.94
[30 | 71.37] loss=3.20 avg=3.03
[40 | 93.85] loss=2.77 avg=2.96
[50 | 116.32] loss=3.07 avg=2.99
[60 | 138.91] loss=2.69 avg=2.93
[70 | 161.53] loss=2.88 avg=2.93
[80 | 184.10] loss=2.51 avg=2.87
[90 | 206.65] loss=2.52 avg=2.83
[100 | 229.16] loss=2.64 avg=2.81
[110 | 251.70] loss=2.45 avg=2.78
[120 | 274.25] loss=2.28 avg=2.73
[130 | 296.81] loss=2.03 avg=2.68
[140 | 319.37] loss=2.24 avg=2.64
[150 | 341.94] loss=1.77 avg=2.58
[160 | 364.50] loss=1.37 avg=2.50
[170 | 387.05] loss=1.58 avg=2.44
[180 | 409.60] loss=1.53 avg=2.39
[190 | 432.16] loss=1.54 avg=2.34
[200 | 454.74] loss=1.58 avg=2.30
 of our youth, and
they have suffered greatly from my neglect. But, dear Victor,
know me not by the title, or rather the habitation of my
machinations, and do not you suppose that I took delight in my
disposition?”





August 7th, 17—.


My dear girl, I never could so lamentably well as to lea

Instructions for updating:
Use standard file APIs to delete files with this prefix.


In [36]:
prompt = "It was a cold dark night"

# Sample from the fine-tuned run (saved under run_name 'unique_run_name'), seeded with the prompt.
gpt2.generate(
    finetune_sess,
    run_name='unique_run_name',
    prefix=prompt,
    length=100,
    temperature=0.7,
    top_k=40,
    top_p=0.9,
)

It was a cold dark night, and I felt as if I had
been strangled to death. The bolt was aimed at the shoulder, and I heard it
as I sat motionless on the ground, my eyes still fixed on the pistol which was
markeding my neck. The bolt sank, and I beheld a fiend within
my wing; a wretch who had just struck me with his grasp, I dashed with
pain to the ground, and heaped scorn and gnashing of teeth on every


## Experimenting with a second pre-trained model: BERT

In [1]:
!pip install transformers



In [3]:
!pip install transformers datasets scikit-learn


Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

# Load the full IMDb dataset; smaller train/eval subsets are selected further down.
dataset = load_dataset("imdb")

# Pretrained BERT tokenizer and a sequence-classification model built on the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')


def tokenize_function(examples):
    """Tokenize the 'text' field, padding/truncating every example to 128 tokens."""
    encoded = tokenizer(
        examples['text'],
        padding='max_length',
        truncation=True,
        max_length=128,
    )
    return encoded


# Tokenize every split of the dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Work on small shuffled subsets: 2000 training and 500 evaluation examples.
train_subset = tokenized_datasets['train'].shuffle(seed=42).select(range(2000))
eval_subset = tokenized_datasets['test'].shuffle(seed=42).select(range(500))

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=1,
    report_to='none',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
)

trainer.train()

trainer.evaluate()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss
1,No log,0.366503


{'eval_loss': 0.3665033280849457,
 'eval_runtime': 216.8379,
 'eval_samples_per_second': 2.306,
 'eval_steps_per_second': 0.291,
 'epoch': 1.0}

In [3]:
import torch

# First five reviews of the test split, used as a quick qualitative check.
test_samples = dataset['test']['text'][:5]

# Tokenize exactly as during training, then move the tensors to the model's device:
# after training the model may live on a GPU, and CPU inputs would raise a device mismatch.
inputs = tokenizer(
    test_samples,
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors="pt",
).to(model.device)

model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    # Bring the logits back to the CPU so the printed tensors look the same on any device.
    logits = outputs.logits.cpu()

# Convert logits to class probabilities and pick the most likely class per review.
predictions = torch.nn.functional.softmax(logits, dim=-1)
predicted_labels = torch.argmax(predictions, dim=1)

for i, text in enumerate(test_samples):
    print(f"\nReview: {text}")
    print(f"Predicted label: {'Positive' if predicted_labels[i] == 1 else 'Negative'}")
    print(f"Confidence scores: {predictions[i]}")



Review: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have