In [1]:
# Props to this sensei
# https://www.youtube.com/watch?v=kCc8FmEb1nY&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=8

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from torch.autograd import Variable
import torch.nn.functional as F


from tqdm import tqdm # progress bar

device = ("cuda" if torch.cuda.is_available() else "mps"
          if torch.backends.mps.is_available() else "cpu")
print(f"Using {device} device")

Using cpu device


## Hyper-parameters

In [50]:
text_file = "tiny-shakespeare.txt"
batch_size = 4 # how many blocks will be given to model
block_size = 8 # context length



## Reading Data

In [4]:
# read file
with open(text_file, "r") as f:
    text = f.read()
text[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [5]:
char_list = sorted(list(set(text)))
char_size = len(char_list)
print(f"All the characters in the text: {''.join(char_list)}")
print(f"Length of the characters: {char_size}")

All the characters in the text: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Length of the characters: 65


## Tokenizer (character based, index/ascii)

In [6]:
class MyTokenizer:
    def __init__(self):
        self.char_to_index = None
        self.index_to_char = None

    def fit(self, char_list):  
        self.char_to_index = {char: idx for idx, char in enumerate(char_list)}
        self.index_to_char = {idx: char for char, idx in self.char_to_index.items()}

    def encode_index(self, input_str):
        return [self.char_to_index[char] for char in input_str]

    def decode_index(self, encoded_list):
        return ''.join([self.index_to_char[idx] for idx in encoded_list])

    @staticmethod
    def ascii_tokenizer(char):
        return ord(char)

    @staticmethod
    def ascii_decoder(ascii_value):
        return chr(ascii_value)

    def encode_combined(self, input_str, use_ascii=False):
        if use_ascii:
            return [self.ascii_tokenizer(char) for char in input_str]
        else:
            return self.encode_index(input_str)

    def decode_combined(self, encoded_list, use_ascii=False):
        if use_ascii:
            return ''.join([self.ascii_decoder(ascii_value) for ascii_value in encoded_list])
        else:
            return self.decode_index(encoded_list)

In [7]:
# Example usage:
tokenizer = MyTokenizer()
tokenizer.fit(char_list)

input_str = "Hello there"
encoded_list_ascii = tokenizer.encode_combined(input_str, use_ascii=True)
decoded_str_ascii = tokenizer.decode_combined(encoded_list_ascii, use_ascii=True)

encoded_list_index = tokenizer.encode_combined(input_str, use_ascii=True)
decoded_str_index = tokenizer.decode_combined(encoded_list_index, use_ascii=True)

print("Original String:", input_str)
print("Encoded List (ASCII):", encoded_list_ascii)
print("Decoded String (ASCII):", decoded_str_ascii)

print("Encoded List (Index):", encoded_list_index)
print("Decoded String (Index):", decoded_str_index)

Original String: Hello there
Encoded List (ASCII): [72, 101, 108, 108, 111, 32, 116, 104, 101, 114, 101]
Decoded String (ASCII): Hello there
Encoded List (Index): [72, 101, 108, 108, 111, 32, 116, 104, 101, 114, 101]
Decoded String (Index): Hello there


In [42]:
# Encode all the data and split into train and val
encoded_data = torch.tensor(tokenizer.encode_combined(text))
n = int( 0.9 * len(encoded_data))

train_data = encoded_data[:n]
val_data = data[n:]

train_data[:block_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [51]:
x = train_data[0:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [52]:
# batching
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == "train" else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y 

xb, yb = get_batch("train") # xb -> input to the transformer
print("inputs: ")
print(xb.shape)
print(xb)

print("targets: ")
print(yb.shape)
print(yb)

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs: 
torch.Size([4, 8])
tensor([[52, 45, 57, 58,  1, 58, 46, 43],
        [47, 52, 49,  6,  0, 18, 53, 56],
        [35, 43, 50, 41, 53, 51, 43,  6],
        [25, 28, 17, 37, 10,  0, 20, 43]])
targets: 
torch.Size([4, 8])
tensor([[45, 57, 58,  1, 58, 46, 43, 51],
        [52, 49,  6,  0, 18, 53, 56,  1],
        [43, 50, 41, 53, 51, 43,  6,  1],
        [28, 17, 37, 10,  0, 20, 43,  1]])
when input is [52] the target: 45
when input is [52, 45] the target: 57
when input is [52, 45, 57] the target: 58
when input is [52, 45, 57, 58] the target: 1
when input is [52, 45, 57, 58, 1] the target: 58
when input is [52, 45, 57, 58, 1, 58] the target: 46
when input is [52, 45, 57, 58, 1, 58, 46] the target: 43
when input is [52, 45, 57, 58, 1, 58, 46, 43] the target: 51
when input is [47] the target: 52
when input is [47, 52] the target: 49
when input is [47, 52, 49] the target: 6
when input is [47, 52, 49, 6] the target: 0
when input is [47, 52, 49, 6, 0] the target: 18
when input is [47, 52

## Bigram Language Model

In [59]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(char_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(tokenizer.decode_combined(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


torch.Size([32, 65])
tensor(4.7353, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


## Data Loader

In [9]:
data = torch.tensor(encoded_data)
data.shape[0]

1115393

In [22]:
class MyDataset(Dataset):
    def __init__(self, encoded_data):
        self.encoded_data = encoded_data

    def __len__(self):
        return len(self.encoded_data)

    def __getitem__(self, idx):
        return self.encoded_data[idx]
        
dataset = MyDataset(data)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# printing the first batch
for batch in dataloader: 
    print(batch)
    break


['cold.', ("feel'st", 'it')]


[18, 47, 56, 57, 58, 1, 15, 47, 58]

# simple NGramModel using pytorch

In [14]:
CONTEXT_SIZE = 2
EMBEDDING_DIM = 10

test_sentence = text.split()

test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()


trigram = [((test_sentence[i], test_sentence[i + 1]), test_sentence[i + 2])for i in range(len(test_sentence) - 2)]

vocb = set(test_sentence)
word_to_idx = {word: i for i, word in enumerate(vocb)}
idx_to_word = {word_to_idx[word]: word for word in word_to_idx}

class NgramModel(nn.Module):
    def __init__(self, vocb_size, context_size, n_dim):
        super(NgramModel, self).__init__()
        self.n_word = vocb_size
        self.embedding = nn.Embedding(self.n_word, n_dim)
        self.linear1 = nn.Linear(context_size * n_dim, 128)
        self.linear2 = nn.Linear(128, self.n_word)

    def forward(self, x):
        emb = self.embedding(x)
        emb = emb.view(1, -1)
        out = self.linear1(emb)
        out = F.relu(out)
        out = self.linear2(out)
        log_prob = F.log_softmax(out)
        return log_prob

ngrammodel = NgramModel(len(word_to_idx), CONTEXT_SIZE, 100)
criterion = nn.NLLLoss()
optimizer = optim.SGD(ngrammodel.parameters(), lr=1e-3)

for epoch in range(100):
    print('epoch: {}'.format(epoch + 1))
    print('*' * 10)
    running_loss = 0
    for data in trigram:
        word, label = data
        word = Variable(torch.LongTensor([word_to_idx[i] for i in word]))
        label = Variable(torch.LongTensor([word_to_idx[label]]))
        # forward
        out = ngrammodel(word)
        loss = criterion(out, label)
        running_loss += loss.item()
        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print('Loss: {:.6f}'.format(running_loss / len(word_to_idx)))


epoch: 1
**********
Loss: 5.353088
epoch: 2
**********
Loss: 5.295852
epoch: 3
**********
Loss: 5.239206
epoch: 4
**********


  log_prob = F.log_softmax(out)


Loss: 5.183216
epoch: 5
**********
Loss: 5.127892
epoch: 6
**********
Loss: 5.073198
epoch: 7
**********
Loss: 5.018655
epoch: 8
**********
Loss: 4.964472
epoch: 9
**********
Loss: 4.910336
epoch: 10
**********
Loss: 4.856346
epoch: 11
**********
Loss: 4.802462
epoch: 12
**********
Loss: 4.748453
epoch: 13
**********
Loss: 4.694087
epoch: 14
**********
Loss: 4.639548
epoch: 15
**********
Loss: 4.584818
epoch: 16
**********
Loss: 4.529846
epoch: 17
**********
Loss: 4.474448
epoch: 18
**********
Loss: 4.418889
epoch: 19
**********
Loss: 4.362829
epoch: 20
**********
Loss: 4.306433
epoch: 21
**********
Loss: 4.249741
epoch: 22
**********
Loss: 4.192754
epoch: 23
**********
Loss: 4.135303
epoch: 24
**********
Loss: 4.077417
epoch: 25
**********
Loss: 4.019286
epoch: 26
**********
Loss: 3.960602
epoch: 27
**********
Loss: 3.901751
epoch: 28
**********
Loss: 3.842317
epoch: 29
**********
Loss: 3.782586
epoch: 30
**********
Loss: 3.722470
epoch: 31
**********
Loss: 3.661944
epoch: 32
********

In [19]:
trigram[3]


tensor([12, 76])

In [15]:
word, label = trigram[3]
word = Variable(torch.LongTensor([word_to_idx[i] for i in word]))
out = ngrammodel(word)
_, predict_label = torch.max(out, 1)
predict_word = idx_to_word[predict_label.item()]
print('real word is {}, predict word is {}'.format(label, predict_word))

real word is thy, predict word is thy


  log_prob = F.log_softmax(out)


## GPT and language models
https://github.com/iVishalr/GPT/tree/main

In [None]:
# https://medium.com/@mingzehe/implement-transformer-via-pytorch-step-by-step-part-2-69f020d580c6

#attention 
def attention(k,q,v):
    # q dim [batch_size,n_heads,length,d_tensor]
    d_tensor = q.size(-1) 
    # assume dim of query/key/value vector should be same 
    # and it should be to make below calculation happen      
    k_t = k.transpose(-2,-1) #[batch_size,n_heads,d_tensor,length]
    score = (q @ k_t)/math.sqrt(d_tensor)
    v= torch.softmax(score,dim=-1) @ v
    return v,score

import copy
def clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_head):
        super(MultiHeadAttention, self).__init__()
  # reduced dim for each Q,K,V, but added up to d_model
        self.d_k = d_model // n_head 
        self.n_head = n_head
        self.attn = None
  # use the attention class defined above
        self.attention = attention() 

  # 3 for K,Q,V, the forth layer is on the top for final attention score
        self.linears = clones(nn.Linear(d_model, d_model), 4) 

    def forward(self, q, k, v):
        samples = q.size(0) #q init as 512x512
    # split tensor by number of heads
        q, k, v = [   lin(x).view(samples, -1, self.n_head, self.d_k).transpose(1, 2)
    # [512,512] => [512,1,8,64] => [512,8,1,64] now we have 8 heads, 
    #length 1 since conv of size 1, dim of 64 for each q,k,v, 
    #ready for input to attention [batch_size, head, length, d_tensor]
            for lin, x in zip(self.linears, (q, k, v)) 
    # we only used 3 first linear layers since zip would 
        ]
        
    # calculate the attention score 
        x, self.attn = attention(q, k, v)

    # concat by view func [512, 8, 1, 64] => [512,1,512] add it back to 512
        x = (x.transpose(1, 2).contiguous().view(samples, -1, self.n_head * self.d_k))
    # now apply the final linear layer copy
        return self.linears[-1](x) 
   

class EncoderLayer(nn.Module):
    def __init__(self,n_head,d_model,hidden):
        super(Encoder_layer, self).__init__()
        self.norm = nn.LayerNorm(layer.size)
        self.attention_layer= MultiHeadAttention(d_model, n_head)
        self.feed_forward_layer= FeedForwardLayer(d_model, hidden)

    def forward(self, x):
        # we make a copy for later residue adding
        _x = x
        # use multi-head attention we defined in part 1
        atten = self.attention_layer(x)
        # add residue and normalize layer
        _atten = _x + self.norm(atten)
        # feed forward layer which we will define later 
        x = self.feed_forward_layer(x)
        return self.norm(x)+_atten

class FeedForwardLayer(nn.Module):
    def __init__(self, d_model, hidden):
        super(FeedForwardLayer, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return x

class Encoder(nn.Module):
    def __init__(self, d_model, hidden, n_head, n_copy):
        super().__init__()
        # n_copy = 6 
        self.layers = clones(EncoderLayer(d_model,hidden,n_head), n_copy)

    def forward(self, x):
        x = layer(x)
        return x


class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len):
        super(PositionalEncoding, self).__init__()
        # init as 515x512 matrix to make adding pos with input possible
        self.encoding = torch.zeros(max_len, d_model)
        # produce 0 to 511 pos index 
        pos = torch.arange(0, max_len)
        # convert to 512x1 size
        pos = pos.float().unsqueeze(dim=1)
        # pick 0,2,4...etc 256 even numbers, 
        # _2i refers to the index in above formula
        _2i = torch.arange(0, d_model, step=2).float()
        # pos index (512,1) divide by _2i (256)
        # broadcasting to (512,256), so every even column apply sin func
        self.encoding[:, 0::2] = torch.sin(pos / (10000 ** (_2i / d_model)))
        # odd column go through cos func
        self.encoding[:, 1::2] = torch.cos(pos / (10000 ** (_2i / d_model)))
        
    def forward(self, x):
        batch_size, seq_len = x.size() 
        #now to apply encoding
        return self.encoding[:seq_len, :]
        


In [None]:
class SimpleLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(SimpleLanguageModel, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        # Output layer
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        # Embedding layer
        embedded = self.embedding(x)
        # LSTM layers
        output, hidden = self.lstm(embedded, hidden)
        # Output layer
        output = self.fc(output)
        return output, hidden

# Example usage:
# Set your vocabulary size, embedding dimension, hidden dimension, and number of LSTM layers
vocab_size = 100  # replace with the actual size of your vocabulary
embedding_dim = 64
hidden_dim = 128
num_layers = 2

# Create an instance of the SimpleLanguageModel
model = SimpleLanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers)

# Print the model architecture
print(model)


In [None]:
# https://pytorch.org/tutorials/intermediate/char_rnn_generation_tutorial.html

from io import open
import glob
import os
import unicodedata
import string

all_letters = string.ascii_letters + " .,;'-"
n_letters = len(all_letters) + 1 # Plus EOS marker

def findFiles(path): return glob.glob(path)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

# Read a file and split into lines
def readLines(filename):
    with open(filename, encoding='utf-8') as some_file:
        return [unicodeToAscii(line.strip()) for line in some_file]

# Build the category_lines dictionary, a list of lines per category
category_lines = {}
all_categories = []
for filename in findFiles('data/names/*.txt'):
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

if n_categories == 0:
    raise RuntimeError('Data not found. Make sure that you downloaded data '
        'from https://download.pytorch.org/tutorial/data.zip and extract it to '
        'the current directory.')

print('# categories:', n_categories, all_categories)
print(unicodeToAscii("O'Néàl"))


class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size

        self.i2h = nn.Linear(n_categories + input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(n_categories + input_size + hidden_size, output_size)
        self.o2o = nn.Linear(hidden_size + output_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, category, input, hidden):
        input_combined = torch.cat((category, input, hidden), 1)
        hidden = self.i2h(input_combined)
        output = self.i2o(input_combined)
        output_combined = torch.cat((hidden, output), 1)
        output = self.o2o(output_combined)
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

import random

# Random item from a list
def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]

# Get a random category and random line from that category
def randomTrainingPair():
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    return category, line

# One-hot vector for category
def categoryTensor(category):
    li = all_categories.index(category)
    tensor = torch.zeros(1, n_categories)
    tensor[0][li] = 1
    return tensor

# One-hot matrix of first to last letters (not including EOS) for input
def inputTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li in range(len(line)):
        letter = line[li]
        tensor[li][0][all_letters.find(letter)] = 1
    return tensor

# ``LongTensor`` of second letter to end (EOS) for target
def targetTensor(line):
    letter_indexes = [all_letters.find(line[li]) for li in range(1, len(line))]
    letter_indexes.append(n_letters - 1) # EOS
    return torch.LongTensor(letter_indexes)

# Make category, input, and target tensors from a random category, line pair
def randomTrainingExample():
    category, line = randomTrainingPair()
    category_tensor = categoryTensor(category)
    input_line_tensor = inputTensor(line)
    target_line_tensor = targetTensor(line)
    return category_tensor, input_line_tensor, target_line_tensor

criterion = nn.NLLLoss()

learning_rate = 0.0005

def train(category_tensor, input_line_tensor, target_line_tensor):
    target_line_tensor.unsqueeze_(-1)
    hidden = rnn.initHidden()

    rnn.zero_grad()

    loss = torch.Tensor([0]) # you can also just simply use ``loss = 0``

    for i in range(input_line_tensor.size(0)):
        output, hidden = rnn(category_tensor, input_line_tensor[i], hidden)
        l = criterion(output, target_line_tensor[i])
        loss += l

    loss.backward()

    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item() / input_line_tensor.size(0)

import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

rnn = RNN(n_letters, 128, n_letters)

n_iters = 100000
print_every = 5000
plot_every = 500
all_losses = []
total_loss = 0 # Reset every ``plot_every`` ``iters``

start = time.time()

for iter in range(1, n_iters + 1):
    output, loss = train(*randomTrainingExample())
    total_loss += loss

    if iter % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, loss))

    if iter % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0


In [None]:
import matplotlib.pyplot as plt

plt.figure()
plt.plot(all_losses)

max_length = 20

# Sample from a category and starting letter
def sample(category, start_letter='A'):
    with torch.no_grad():  # no need to track history in sampling
        category_tensor = categoryTensor(category)
        input = inputTensor(start_letter)
        hidden = rnn.initHidden()

        output_name = start_letter

        for i in range(max_length):
            output, hidden = rnn(category_tensor, input[0], hidden)
            topv, topi = output.topk(1)
            topi = topi[0][0]
            if topi == n_letters - 1:
                break
            else:
                letter = all_letters[topi]
                output_name += letter
            input = inputTensor(letter)

        return output_name

# Get multiple samples from one category and multiple starting letters
def samples(category, start_letters='ABC'):
    for start_letter in start_letters:
        print(sample(category, start_letter))

samples('Russian', 'RUS')

samples('German', 'GER')

samples('Spanish', 'SPA')

samples('Chinese', 'CHI')
