In [6]:
# torch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import matplotlib.pyplot as plt
from tqdm import tqdm

import requests
import os
import re
import collections
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

torch.manual_seed(305)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

SMALL_ITERS = 1000
LARGE_ITERS = 2000
EVAL_ITERS = 100
CONTEXT_WINDOW_SIZE = 256

In [2]:
input_file_path = 'data/full_shakespeare.txt'

if not os.path.exists(input_file_path):
    data_url = 'https://cs.stanford.edu/people/karpathy/char-rnn/shakespeare_input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r') as f:
    data = f.read()
print(f"length of dataset in characters: {len(data):,}")

length of dataset in characters: 4,573,338


In [None]:
vocab_size = 2000

# Define a BPE model
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
tokenizer.decoder = decoders.ByteLevel()

# Trainer with desired vocab size
trainer = trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["<unk>"])
# data should be an iterator over your text lines or documents
tokenizer.train_from_iterator([data], trainer=trainer)

# Encode and decode functions
def encode_bpe(text):
    return tokenizer.encode(text).ids

def decode_bpe(ids):
    return tokenizer.decode(ids)

# Example usage:
train_text = data[:int(len(data) * 0.9)]
val_text = data[int(len(data) * 0.9):]

train_tokens = encode_bpe(train_text)
val_tokens = encode_bpe(val_text)

import torch
train_data = torch.tensor(train_tokens)
val_data = torch.tensor(val_tokens)

print(f"train has {len(train_data):,} tokens")
print(f"val has {len(val_data):,} tokens")




train has 1,478,711 tokens
val has 163,429 tokens


In [30]:
# function for getting batches of data
def get_batch(split, context_window_size, device, batch_size=32):
    """
    generate a small batch of data of inputs x and targets y

    Args:
        split: 'train' or 'val'
        device: 'cpu' or 'cuda' (should be 'cuda' if available)
    """
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - context_window_size, (batch_size,))
    x = torch.stack([data[i:i+context_window_size] for i in ix])
    y = torch.stack([data[i+1:i+context_window_size+1] for i in ix])
    x = x.to(device)
    y = y.to(device)
    return x, y

# helper function for tracking loss during training
# given to you
@torch.no_grad()
def estimate_loss(model, eval_iters, context_window_size, device, use_focal_loss=False):
    """
    Args:
      model: model being evaluated
      eval_iters: number of batches to average over
      context_window_size: size of the context window
      device: 'cpu' or 'cuda' (should be 'cuda' if available)
    """
    out = {}
    for split in ['train', 'val']:
        losses_by_type = {
            'ce_loss': torch.zeros(eval_iters),
        }
        if use_focal_loss:
            losses_by_type['f_loss'] = torch.zeros(eval_iters)
        
        for k in range(eval_iters):
            X, Y = get_batch(split, context_window_size, device)
            logits, ce_loss, f_loss = model(X, Y)
            losses_by_type['ce_loss'][k] = ce_loss.item()
            if use_focal_loss:
                losses_by_type['f_loss'][k] = f_loss.item()

        out[split] = {'ce_loss': losses_by_type['ce_loss'].mean().item()}
        if use_focal_loss:
            out[split]['f_loss'] = losses_by_type['f_loss'].mean().item()

    return out

In [36]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size, context_window_size, embed_size=384):
        """
        Args:
          head_size: int, size of the head embedding dimension (K)
          context_window_size: int, number of tokens considered in the past for attention (T)
          embed_size: int, size of the token embedding dimension (D)
        """
        super().__init__()
        self.head_size = head_size
        self.key = nn.Linear(embed_size, head_size, bias=False)
        self.query = nn.Linear(embed_size, head_size, bias=False)
        self.value = nn.Linear(embed_size, embed_size, bias=False)

        # not a param of the model, so registered as a buffer
        self.register_buffer('tril', torch.tril(
            torch.ones(context_window_size, context_window_size)))

    def forward(self, x):
        """
        Args:
          x: (B,T,D) tensor of token embeddings

        Returns:
          (B,T,D) tensor of attention-weighted token embeddings
        """
        # TODO: your code here
        B, T, _ = x.shape
        K = self.head_size
        key = self.key(x)
        query = self.query(x)
        value = self.value(x)

        attn_scores = query@key.transpose(-2, -1)
        causal_mask = self.tril[:T, :T][None, :, :]
        attn_scores = attn_scores.masked_fill(causal_mask == 0, float('-inf'))
        attn_weights = torch.softmax(attn_scores / (K ** 0.5), dim=-1)
        return attn_weights@value

class SingleHeadedAttentionLM(nn.Module):

    def __init__(self, vocab_size, context_window_size, head_size, embed_size=384):
      """
      Args:
        vocab_size: int, size of the vocabulary (V)
        context_window_size: int, number of tokens considered in the past for attention (T)
        head_size: int, size of the head embedding dimension (K)
        embed_size: int, size of the token embedding dimension (D)
      """
      super().__init__()
      self.vocab_size = vocab_size
      self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
      self.position_embedding_table = nn.Embedding(context_window_size, embed_size)
      self.context_window_size = context_window_size

      # TODO: your code below
      self.atten_head = Head(head_size, context_window_size, embed_size)
      self.lm_head = nn.Linear(embed_size, vocab_size)

    def forward(self, token_ids, targets=None):
        """
        Args:
          token_ids: (B, T) token ids that make up the context (batch has size B, each entry
                     in the batch has length T)
          targets: (B, T) token ids corresponding to the target of each context in token_ids

        Returns:
          logits: (B, T, V) logits[b,t] gives the length V vector of logits for the next token
                   prediction in string b up to t tokens
          loss: scalar, negative log likelihood of target given context
        """
        B, T = token_ids.shape # (batch size, length)
        tok_emb = self.token_embedding_table(token_ids) # (B,T,D)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,D)
        x = tok_emb + pos_emb # (B,T,D)
        x = self.atten_head(x) # (B,T,D)
        logits = self.lm_head(x) # (B,T,V)

        # TODO: your code here
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, self.vocab_size), targets.view(-1))

        return logits, loss

    @torch.no_grad()
    def generate(self, token_ids, max_new_tokens):
        """
        Args:
          token_ids: (B, T) tensor of token ids to provide as context
          max_new_tokens: int, maximum number of new tokens to generate

        Returns:
          (B, T+max_new_tokens) tensor of context with new tokens appended
        """
        #TODO
        # your code below
        B, T = token_ids.shape
        new_token_ids = token_ids.clone()
        for t in range(max_new_tokens):
            logits = self(new_token_ids)
            new_token = torch.multinomial(F.softmax(logits[:, -1, :], dim=-1), 1)
            new_token_ids = torch.cat([new_token_ids, new_token], dim=1)
        return new_token_ids

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, context_window_size, num_heads, head_size, embed_size=384):
        """
        Args:
            context_window_size: int, number of tokens considered in the past for attention (T)
            num_heads: int, number of heads (H)
            head_size: int, size of the head embedding dimension
            embed_size: int, size of the token embedding dimension
        """
        super().__init__()
        # TODO, your code below
        self.heads = nn.ModuleList([Head(head_size, context_window_size, embed_size) for _ in range(num_heads)])
        self.lm_head = nn.Linear(embed_size*num_heads, embed_size)
        self.num_heads = num_heads

    def forward(self, x):
        # TODO, your code below
        B, T, _ = x.shape
        head_size = x.shape[-1] // self.num_heads
        head_outputs = [head(x) for head in self.heads]
        head_outputs = torch.cat(head_outputs, dim=-1)
        head_outputs = head_outputs.view(B, T, -1)
        return self.lm_head(head_outputs)

class MultiHeadedAttentionLM(nn.Module):

    def __init__(self, vocab_size, context_window_size, embed_size=384, num_heads=6):
      super().__init__()
      self.head_size = embed_size // num_heads
      self.context_window_size = context_window_size
      # TODO: your code below
      self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
      self.position_embedding_table = nn.Embedding(context_window_size, embed_size)
      self.multi_head_attention = MultiHeadAttention(context_window_size, num_heads, self.head_size, embed_size)
      self.lm_head = nn.Linear(embed_size, vocab_size)
      self.vocab_size = vocab_size

    def forward(self, token_ids, targets=None):
        """
        Args:
          token_ids: (B, T) token ids that make up the context (batch has size B, each entry in the
                     batch has length T)
          targets: (B, T) token ids corresponding to the target of each context in token_ids

        Returns:
          logits: (B, T, V), logits[b,t] gives the length V vector of logits for the next token
                  prediction in string b up to t tokens
          loss: scalar, negative log likelihood of target given context
        """
        # TODO: your code below
        loss = None
        B, T = token_ids.shape
        tok_emb = self.token_embedding_table(token_ids)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.multi_head_attention(x)
        logits = self.lm_head(x)
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, self.vocab_size), targets.view(-1))
        return logits, loss

    @torch.no_grad()
    def generate(self, token_ids, max_new_tokens):
        """
        Args:
          token_ids: (B, T) tensor of token ids to provide as context
          max_new_tokens: int, maximum number of new tokens to generate

        Returns:
          (B, T+max_new_tokens) tensor of context with new tokens appended
        """
        # TODO: your code below
        for t in range(max_new_tokens):
            if token_ids.shape[1] > self.context_window_size:
                token_ids = token_ids[:, -self.context_window_size:]
            B, T = token_ids.shape
            logits, loss = self.forward(token_ids)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            new_token = torch.multinomial(probs, 1)
            token_ids = torch.cat([token_ids, new_token], dim=1)
        return token_ids

class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity
        Given to you, you don't need to write any code here!
    """

    def __init__(self, embed_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(embed_size, 4 * embed_size),
            nn.ReLU(),
            nn.Linear(4 * embed_size, embed_size),
        )

    def forward(self, x):
        return self.net(x)

class TransformerBlock(nn.Module):
    """ Transformer block: communication across sequence length, followed by communication across embedding space
        Uses multi-headed attention
    """

    def __init__(self, vocab_size, context_window_size, embed_size=384, num_heads=6):
        super().__init__()
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

        # TODO: your code below
        self.feed_forward = FeedForward(embed_size)
        self.atten_heads = MultiHeadAttention(context_window_size, num_heads, embed_size // num_heads, embed_size)

    def forward(self, x):
        x = x + self.atten_heads(self.ln1(x)) # communication over sequence length
        x = x + self.feed_forward(self.ln2(x)) # communication across embedding space
        return x

def focal_loss(logits, targets, gamma=2.0, alpha=1.0, reduction='mean'):
    ce_loss = F.cross_entropy(logits, targets, reduction='none')
    pt = torch.exp(-ce_loss)
    focal_factor = (1 - pt) ** gamma
    loss = alpha * focal_factor * ce_loss
    
    if reduction == 'mean':
        return loss.mean()
    elif reduction == 'sum':
        return loss.sum()
    else:
        return loss

class TransformerLM(nn.Module):

    def __init__(self, vocab_size, context_window_size, embed_size=384, num_heads=6, n_layers=6):
        """
          Args:
              vocab_size: int, number of tokens in the vocabulary (V)
              context_window_size: int, size of the context window (T)
              embed_size: int, embedding size (D)
              num_heads: int, number of heads (H)
              n_layers: int, number of layers (M)
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, embed_size)
        self.position_embedding_table = nn.Embedding(context_window_size, embed_size)
        self.blocks = nn.Sequential(*[
            TransformerBlock(vocab_size,
                             context_window_size,
                             embed_size=embed_size,
                             num_heads=num_heads)
            for _ in range(n_layers)])

        # final layer norm
        self.ln_f = nn.LayerNorm(embed_size)
        self.lm_head = nn.Linear(embed_size, vocab_size)

        # good initialization
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, token_ids, targets=None):
        """
        Agrgs:
            token_ids: tensor of integers, provides the contet, shape (B, T)
            targets: tensor of integers, provides the tokens we are preidcitng, shape (B, T)
        """
        B, T = token_ids.shape

        # token_ids and targets are both (B, T) tensor of integers
        tok_emb = self.token_embedding_table(token_ids) # (B, T, D)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, D)
        x = tok_emb + pos_emb # (B, T, D)

        # TODO: your code below
        loss = None
        logits = self.blocks(x)
        logits = self.ln_f(logits)
        logits = self.lm_head(logits)
        if targets is not None:
            ce_loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), targets.view(-1))
            f_loss = focal_loss(logits.view(-1, logits.shape[-1]), targets.view(-1), gamma=0.5, alpha=1.0)
        return logits, ce_loss, f_loss

    @torch.no_grad()
    def generate(self, token_ids, max_new_tokens):
        """
        Args:
            token_ids: tensor of integers forming the context, shape (B, T)
            max_new_tokens: int, max number of tokens to generate
        """
        # TOOD, your code below
        self.eval()
        for _ in range(max_new_tokens):
            if token_ids.size(1) > CONTEXT_WINDOW_SIZE:
                token_ids = token_ids[:, -CONTEXT_WINDOW_SIZE:]
            logits, _, _ = self(token_ids)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1) 
            next_token = torch.multinomial(probs, num_samples=1)
            token_ids = torch.cat([token_ids, next_token], dim=1)
        self.train()
        return token_ids



In [37]:
trans = TransformerLM(vocab_size, CONTEXT_WINDOW_SIZE)
tlm = trans.to(device)
learning_rate = 1e-4
# TODO, your code below

optimizer = optim.Adam(tlm.parameters(), lr=learning_rate)

eval_interval = 200

In [None]:
f_loss_list = []
ce_loss_list = []
for it in tqdm(range(LARGE_ITERS)):
    # Evaluate
    if it % eval_interval == 0:
        losses = estimate_loss(tlm, EVAL_ITERS, CONTEXT_WINDOW_SIZE, device, use_focal_loss=True)
        print(f"step {it}: train loss {losses['train']}, val loss {losses['val']}")
    
    # Forward/backward/update
    xb, yb = get_batch('train', CONTEXT_WINDOW_SIZE, device, batch_size = 64)
    logits, ce_loss, f_loss = tlm(xb, yb)
    optimizer.zero_grad()
    ce_loss.backward()
    optimizer.step()
    f_loss_list.append(f_loss.item())
    ce_loss_list.append(ce_loss.item())

print("final CE training loss =", ce_loss_list[-1])
print("final F training loss =", f_loss_list[-1])

  0%|          | 1/2000 [00:06<3:41:50,  6.66s/it]

step 0: train loss {'ce_loss': 7.698077201843262, 'f_loss': 7.6962103843688965}, val loss {'ce_loss': 7.696474552154541, 'f_loss': 7.694606781005859}


 10%|█         | 201/2000 [00:49<1:03:53,  2.13s/it]

step 200: train loss {'ce_loss': 5.164602756500244, 'f_loss': 5.128971099853516}, val loss {'ce_loss': 5.232719898223877, 'f_loss': 5.1980133056640625}


 20%|██        | 401/2000 [01:31<56:03,  2.10s/it]  

step 400: train loss {'ce_loss': 4.694948196411133, 'f_loss': 4.65223503112793}, val loss {'ce_loss': 4.802484035491943, 'f_loss': 4.7613019943237305}


 30%|███       | 601/2000 [02:13<49:01,  2.10s/it]

step 600: train loss {'ce_loss': 4.389745712280273, 'f_loss': 4.34204626083374}, val loss {'ce_loss': 4.550459384918213, 'f_loss': 4.50554084777832}


 40%|████      | 801/2000 [02:56<42:00,  2.10s/it]

step 800: train loss {'ce_loss': 4.173600673675537, 'f_loss': 4.1227874755859375}, val loss {'ce_loss': 4.386159896850586, 'f_loss': 4.3393754959106445}


 50%|█████     | 1001/2000 [03:38<35:00,  2.10s/it]

step 1000: train loss {'ce_loss': 3.976705551147461, 'f_loss': 3.919642925262451}, val loss {'ce_loss': 4.241973876953125, 'f_loss': 4.190536975860596}


 60%|██████    | 1201/2000 [04:20<28:00,  2.10s/it]

step 1200: train loss {'ce_loss': 3.8132996559143066, 'f_loss': 3.7534191608428955}, val loss {'ce_loss': 4.129662990570068, 'f_loss': 4.076267242431641}


 70%|███████   | 1401/2000 [05:03<20:59,  2.10s/it]

step 1400: train loss {'ce_loss': 3.6882379055023193, 'f_loss': 3.6249749660491943}, val loss {'ce_loss': 4.036251544952393, 'f_loss': 3.9799365997314453}


 80%|████████  | 1601/2000 [05:45<13:58,  2.10s/it]

step 1600: train loss {'ce_loss': 3.575105667114258, 'f_loss': 3.5091559886932373}, val loss {'ce_loss': 3.9810876846313477, 'f_loss': 3.922952175140381}


 90%|█████████ | 1801/2000 [06:27<06:58,  2.10s/it]

step 1800: train loss {'ce_loss': 3.491826057434082, 'f_loss': 3.4230458736419678}, val loss {'ce_loss': 3.9159350395202637, 'f_loss': 3.8552422523498535}


100%|██████████| 2000/2000 [07:03<00:00,  4.72it/s]

final training loss = 3.664591073989868





In [39]:
estimate_loss(tlm, EVAL_ITERS, CONTEXT_WINDOW_SIZE, device, use_focal_loss=True)

{'train': {'ce_loss': 3.3895068168640137, 'f_loss': 3.319181203842163},
 'val': {'ce_loss': 3.8796470165252686, 'f_loss': 3.817950487136841}}

In [None]:
f_loss_list = []
ce_loss_list = []
for it in tqdm(range(LARGE_ITERS)):
    # Evaluate
    if it % eval_interval == 0:
        losses = estimate_loss(tlm, EVAL_ITERS, CONTEXT_WINDOW_SIZE, device, use_focal_loss=True)
        print(f"step {it}: train loss {losses['train']}, val loss {losses['val']}")
    
    # Forward/backward/update
    xb, yb = get_batch('train', CONTEXT_WINDOW_SIZE, device, batch_size = 64)
    logits, ce_loss, f_loss = tlm(xb, yb)
    optimizer.zero_grad()
    f_loss.backward()
    optimizer.step()
    f_loss_list.append(f_loss.item())
    ce_loss_list.append(ce_loss.item())

print("final CE training loss =", ce_loss_list[-1])
print("final F training loss =", f_loss_list[-1])

  0%|          | 1/2000 [00:06<3:44:11,  6.73s/it]

step 0: train loss {'ce_loss': 6.214792251586914, 'f_loss': 6.194438934326172}, val loss {'ce_loss': 6.249657154083252, 'f_loss': 6.229870796203613}


 10%|█         | 201/2000 [00:49<1:03:02,  2.10s/it]

step 200: train loss {'ce_loss': 5.032546520233154, 'f_loss': 4.993616104125977}, val loss {'ce_loss': 5.114448070526123, 'f_loss': 5.076656341552734}


 20%|██        | 401/2000 [01:31<56:35,  2.12s/it]  

step 400: train loss {'ce_loss': 4.642302989959717, 'f_loss': 4.595801830291748}, val loss {'ce_loss': 4.772821426391602, 'f_loss': 4.728541374206543}


 30%|███       | 601/2000 [02:13<49:05,  2.11s/it]

step 600: train loss {'ce_loss': 4.3406524658203125, 'f_loss': 4.291020393371582}, val loss {'ce_loss': 4.522536754608154, 'f_loss': 4.476134300231934}


 40%|████      | 801/2000 [02:55<41:59,  2.10s/it]

step 800: train loss {'ce_loss': 4.122776508331299, 'f_loss': 4.068578720092773}, val loss {'ce_loss': 4.346164703369141, 'f_loss': 4.296655178070068}


 50%|█████     | 1001/2000 [03:38<34:59,  2.10s/it]

step 1000: train loss {'ce_loss': 3.9153199195861816, 'f_loss': 3.854647159576416}, val loss {'ce_loss': 4.201709270477295, 'f_loss': 4.147448539733887}


 60%|██████    | 1201/2000 [04:20<27:59,  2.10s/it]

step 1200: train loss {'ce_loss': 3.7825870513916016, 'f_loss': 3.7182061672210693}, val loss {'ce_loss': 4.100528717041016, 'f_loss': 4.043321132659912}


 70%|███████   | 1401/2000 [05:02<20:58,  2.10s/it]

step 1400: train loss {'ce_loss': 3.6546132564544678, 'f_loss': 3.587021827697754}, val loss {'ce_loss': 4.020479202270508, 'f_loss': 3.9605300426483154}


 80%|████████  | 1601/2000 [05:44<14:06,  2.12s/it]

step 1600: train loss {'ce_loss': 3.538849115371704, 'f_loss': 3.4687914848327637}, val loss {'ce_loss': 3.9473252296447754, 'f_loss': 3.8854753971099854}


 90%|█████████ | 1801/2000 [06:27<06:57,  2.10s/it]

step 1800: train loss {'ce_loss': 3.4556074142456055, 'f_loss': 3.3836452960968018}, val loss {'ce_loss': 3.8955559730529785, 'f_loss': 3.8320443630218506}


100%|██████████| 2000/2000 [07:02<00:00,  4.73it/s]

final training loss = 3.664591073989868





In [40]:
start_context = torch.zeros((1, 1), dtype=torch.long, device=device)
uncond_gen = (tlm.generate(start_context, max_new_tokens=CONTEXT_WINDOW_SIZE)[0].tolist())

UnboundLocalError: cannot access local variable 'ce_loss' where it is not associated with a value

In [None]:
print(decode_bpe(uncond_gen))

old toow:--the whitive!
Come, marrylee: say 'tis done.
O, forswidder! I'll do back be dead
HappfVignicious; a soldier and night!' Go
What, of 'tis good
Hapsont kingable
Of last day, and soft. The king comes my sister
'ANDam Pidupusion. Corthy boys, good Since
Even Chiius, and S heartades daughter Wortiides my sad chawith
Turnought with him: the mashood am told me
The fatin of it. 'tis still wife, run in it and
geloilt on what this pel'd, Hector, toward his witness:
And they are half as two deeds willliarch!
He hath already's raidings too: this swa times
Some vain'd his own sound friend to fire;
And, most does you other to me know.

LUCENTIO:

CASSIO:
Kow your cel were she business.

LUCENTIO:
Let your other's sake to fear


In [13]:
xb, yb = get_batch(split='train', context_window_size=CONTEXT_WINDOW_SIZE, device=device)

In [21]:
decode_bpe(tlm.generate(xb[0:1], max_new_tokens=10).tolist()[0])

"They mean to warn us at Philippi here,\nAnswering before we do demand of them.\n\nANTONY:\nTut, I am in their bosoms, and I know\nWherefore they do it: they could be content\nTo visit other places; and come down\nWith fearful bravery, thinking by this face\nTo fasten in our thoughts that they have courage;\nBut 'tis not so.\n\nMessenger:\nPrepare you, generals:\nThe enemy comes on in gallant show;\nTheir bloody sign of battle is hung out,\nAnd something to be done immediately.\n\nANTONY:\nOctavius, lead your battle softly on,\nUpon the left hand of the even field.\n\nOCTAVIUS:\nUpon the right hand I; keep thou the left.\n\nANTONY:\nWhy do you cross me in this exigent?\n\nOCTAVIUS:\nI do not cross you; but I will do so.\n\nBRUTUS:\nThey stand, andiaught those strange su were indeed mother letter straight"