In [None]:
import re
from collections import defaultdict

def get_vocab(data):
    """Build the initial BPE vocabulary from an iterable of text lines.

    Each whitespace-separated word is rewritten as its characters joined by
    single spaces and terminated with the end-of-word marker ``</w>``
    (for example ``"is"`` becomes ``"i s </w>"``).

    Returns:
        A ``defaultdict(int)`` mapping every spaced-out word to the number of
        times that word occurs in ``data``.
    """
    counts = defaultdict(int)
    for text in data:
        for token in text.split():
            # A str is already iterable, so join() spaces out its characters.
            spaced = ' '.join(token) + ' </w>'
            counts[spaced] += 1
    return counts

def get_stats(vocab):
    """Count adjacent symbol pairs across the vocabulary, weighted by frequency.

    Args:
        vocab: Mapping from a space-separated symbol sequence to its
            frequency count.

    Returns:
        A ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples. Each
        value is the sum of the frequencies of all entries in which ``left``
        is immediately followed by ``right``, counted once per occurrence.
    """
    pair_counts = defaultdict(int)
    for entry, count in vocab.items():
        tokens = entry.split()
        # Pair every symbol with its right-hand neighbour.
        for left, right in zip(tokens, tokens[1:]):
            pair_counts[left, right] += count
    return pair_counts

def merge_vocab(pair, v_in):
    """Merge every standalone occurrence of ``pair`` in the vocabulary.

    Args:
        pair: Tuple ``(left, right)`` of symbols to fuse into ``left + right``.
        v_in: Mapping from a space-separated symbol sequence to its frequency.

    Returns:
        A new dict in which each key of ``v_in`` has had every occurrence of
        ``"left right"`` (as whole, whitespace-delimited symbols) replaced by
        the concatenated symbol. If several input entries collapse to the
        same merged key, their frequencies are added together.
    """
    merged = ''.join(pair)
    bigram = re.escape(' '.join(pair))
    # The lookarounds require whitespace (or the string edge) on both sides so
    # that only complete symbols match, never a suffix/prefix of a longer one.
    p = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
    v_out = {}
    for word, freq in v_in.items():
        # A callable replacement is inserted verbatim. A plain string would be
        # treated as a template, so a symbol containing a backslash would be
        # mangled (e.g. '\\n' -> newline) or rejected as a bad group reference.
        w_out = p.sub(lambda _match: merged, word)
        if w_out in v_out:
            # Two entries collapsed to the same key: keep both counts.
            v_out[w_out] += freq
        else:
            v_out[w_out] = freq
    return v_out

def byte_pair_encoding(data, num_merges):
    """Learn up to ``num_merges`` BPE merges over ``data``.

    Starting from the character-level vocabulary produced by ``get_vocab``,
    each round counts adjacent symbol pairs with ``get_stats`` and fuses the
    highest-count pair with ``merge_vocab``. When several pairs share the top
    count, ``max`` returns the one encountered first.

    Returns:
        The vocabulary (symbol sequence -> frequency) after the last merge.
        Fewer than ``num_merges`` merges are applied if no pairs remain.
    """
    vocab = get_vocab(data)
    for _ in range(num_merges):
        pair_counts = get_stats(vocab)
        if pair_counts:
            top_pair = max(pair_counts, key=pair_counts.get)
            vocab = merge_vocab(top_pair, vocab)
        else:
            # Every word is already a single symbol; nothing left to merge.
            break
    return vocab

# Example usage: learn up to 10 merges on a three-sentence toy corpus.
data = [
    "this is a test",
    "this is another test",
    "why is this a test",
]
num_merges = 10
bpe_vocab = byte_pair_encoding(data, num_merges)

# Pair statistics of the unmerged vocabulary; later cells inspect `pairs`.
pairs = get_stats(get_vocab(data))

# Show the merged vocabulary, then the initial vocabulary and its pair counts.
print(bpe_vocab)
print(get_vocab(data))
print(pairs)

In [None]:
# Whitespace-delimited pattern for the symbol pair ('e', 's'), assembled the
# same way merge_vocab assembles its pattern.
pair = ('e', 's')
bigram = re.escape(' '.join(pair))  # the space is escaped too, giving r'e\ s'
p = re.compile(rf'(?<!\S){bigram}(?!\S)')

In [None]:
# Display the pair-frequency table (built in the first cell from the unmerged toy vocabulary).
pairs

In [None]:
# Iterating a dict yields its keys, so this prints each (left, right) symbol
# pair on its own line, without the associated count.
for i in pairs:
    print(i)

In [None]:
sentence = "roses are red violets are blue"

# `p` is not defined in this cell; it is expected to be the pattern compiled
# in the earlier ('e', 's') cell. That pattern only matches the two symbols
# when each stands alone and they are separated by a space, so ordinary
# unsegmented text such as this sentence yields no matches and the loop
# below prints nothing.
matches = p.finditer(sentence)

for match in matches:
    print(match.group())

In [None]:
# Most frequent pair: max() iterates the keys of `pairs` and ranks them by their count via pairs.get.
max(pairs, key=pairs.get)

In [None]:
# Display the table again; the max() call in the previous cell only reads `pairs`.
pairs

In [None]:
import matplotlib.pyplot as plt

# Sweep configuration. Each entry of `num_merges_options` is a merge budget
# handed to `byte_pair_encoding`; the two lists collect one result per budget
# (the vocabulary size and the score returned by `validate_model`).
num_merges_options = [1000, 5000, 10000, 15000, 20000]
vocab_sizes, validation_scores = [], []

from sklearn.metrics import accuracy_score

def validate_model(model, X_val, y_val):
    """Score ``model`` on a held-out validation set.

    Args:
        model: Fitted estimator exposing a ``predict(X)`` method.
        X_val: Validation inputs, passed unchanged to ``model.predict``.
        y_val: Ground-truth labels corresponding to ``X_val``.

    Returns:
        The result of ``accuracy_score(y_val, model.predict(X_val))``.
    """
    return accuracy_score(y_val, model.predict(X_val))


# NOTE: `model` and `validation_data` are not defined in this cell. They must
# already exist in the kernel, with `validation_data` an (X_val, y_val) pair.
for num_merges in num_merges_options:
    vocab = byte_pair_encoding(data, num_merges)
    vocab_sizes.append(len(vocab))

    # Train your model here with the vocab and evaluate on the validation set.
    # validate_model takes (model, X_val, y_val), so the pair is unpacked.
    score = validate_model(model, *validation_data)
    validation_scores.append(score)

# Plot both trends side by side. Explicit Figure/Axes objects are used so each
# panel is addressed directly instead of through pyplot's "current axes" state.
fig, (ax_vocab, ax_score) = plt.subplots(1, 2, figsize=(14, 7))

# Left panel: vocabulary size per merge budget.
ax_vocab.plot(num_merges_options, vocab_sizes, marker='o')
ax_vocab.set_title('Vocabulary Size vs. Number of Merges')
ax_vocab.set_xlabel('Number of Merges')
ax_vocab.set_ylabel('Vocabulary Size')

# Right panel: validation score per merge budget.
ax_score.plot(num_merges_options, validation_scores, marker='o', color='r')
ax_score.set_title('Validation Score vs. Number of Merges')
ax_score.set_xlabel('Number of Merges')
ax_score.set_ylabel('Validation Score')

fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Stand-in for byte_pair_encoding, for demonstration only. Running this cell
# rebinds the name, replacing the real implementation defined in the first cell.
def byte_pair_encoding(data, num_merges):
    """Return a fake vocabulary summary without performing any BPE.

    ``data`` is ignored. The result is a one-entry dict whose ``'vocab_size'``
    value is ``1000 + num_merges``.
    """
    fake_size = 1000 + num_merges
    return {'vocab_size': fake_size}

# Accuracy of a fitted model on a validation split (same contract as the
# validate_model defined in the previous cell, which this definition replaces).
def validate_model(model, X_val, y_val):
    """Return ``accuracy_score`` of ``model.predict(X_val)`` against ``y_val``."""
    return accuracy_score(y_val, model.predict(X_val))

# Load the Iris dataset. It only serves as a runnable stand-in here: the
# classifier below is trained on these features, not on BPE-tokenized text.
iris = load_iris()
X, y = iris.data, iris.target

# Hold out 20% of the samples for validation; the fixed random_state makes the
# split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the logistic regression model
model = LogisticRegression(max_iter=200)

# Train the model once; it is not retrained anywhere later in this cell.
model.fit(X_train, y_train)

# Bundle the validation split so it can be unpacked into
# validate_model(model, X_val, y_val) with *validation_data.
validation_data = (X_val, y_val)

# Merge budgets to sweep, plus one result list per plotted quantity.
num_merges_options = [1000, 5000, 10000, 15000, 20000]
vocab_sizes = []
validation_scores = []

for num_merges in num_merges_options:
    vocab = byte_pair_encoding(None, num_merges)  # Data parameter is not used in this dummy function
    vocab_sizes.append(vocab['vocab_size'])
    
    # The same fitted model is scored on the same validation split on every
    # iteration, so num_merges has no influence on `score` here.
    score = validate_model(model, *validation_data)
    validation_scores.append(score)

# Plot both trends side by side. Explicit Figure/Axes objects are used so each
# panel is addressed directly instead of through pyplot's "current axes" state.
fig, (ax_vocab, ax_score) = plt.subplots(1, 2, figsize=(14, 7))

# Left panel: vocabulary size per merge budget.
ax_vocab.plot(num_merges_options, vocab_sizes, marker='o')
ax_vocab.set_title('Vocabulary Size vs. Number of Merges')
ax_vocab.set_xlabel('Number of Merges')
ax_vocab.set_ylabel('Vocabulary Size')

# Right panel: validation score per merge budget.
ax_score.plot(num_merges_options, validation_scores, marker='o', color='r')
ax_score.set_title('Validation Score vs. Number of Merges')
ax_score.set_xlabel('Number of Merges')
ax_score.set_ylabel('Validation Score')

fig.tight_layout()
plt.show()


In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
# Calling the tokenizer on a string encodes it; print the class of the object
# it returns rather than its contents.
encoding = tokenizer(example)
print(type(encoding))

In [None]:
# Display the tokenizer's `is_fast` flag (whether a "fast" tokenizer implementation was loaded).
tokenizer.is_fast

In [None]:
import pandas as pd

# Assuming you have a DataFrame with columns 'words' and 'entities'
# NOTE: this rebinds `data`, which the BPE cells above use for the list of
# training sentences. Re-running those cells after this one would pick up
# this dict instead.
data = {
    'words': ['Barack', 'Obama', 'was', 'the', '44th', 'president', 'of', 'the', 'USA'],
    'entities': ['B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC']
}

# One row per token; the two lists above have the same length (9 entries).
df = pd.DataFrame(data)

# Convert DataFrame to IOB format
def convert_to_iob(df):
    """Pair each word with its entity tag.

    Args:
        df: DataFrame with a ``'words'`` column and an ``'entities'`` column,
            one row per token.

    Returns:
        A list of ``(word, entity)`` tuples in row order; an empty list for an
        empty frame.

    Raises:
        KeyError: If either column is missing.
    """
    # Zip the two columns directly instead of building a Series per row with
    # iterrows(), which is slow and can change value dtypes across a row.
    return list(zip(df['words'], df['entities']))

# Build the (word, entity) list from the example frame defined above.
iob_formatted_data = convert_to_iob(df)

In [None]:
# Display the list of (word, entity) tuples.
iob_formatted_data

In [None]:
# Install transformers from source - only needed for versions <= v4.34
# pip install git+https://github.com/huggingface/transformers.git
# pip install accelerate

import torch
from transformers import pipeline

# Build a text-generation pipeline for the Zephyr 7B chat model, requesting
# bfloat16 weights and automatic device placement.
pipe = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta", torch_dtype=torch.bfloat16, device_map="auto")

# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a pirate",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]
# tokenize=False asks for the formatted prompt as text (it is passed to the
# pipeline as a string below); add_generation_prompt=True ends the prompt with
# the assistant header, as in the `<|assistant|>` line of the sample output.
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Sampling is enabled (do_sample=True), so the generated text differs between
# runs; the transcript in the comments below is one example, not a fixed result.
outputs = pipe(prompt, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
print(outputs[0]["generated_text"])
# <|system|>
# You are a friendly chatbot who always responds in the style of a pirate.</s>
# <|user|>
# How many helicopters can a human eat in one sitting?</s>
# <|assistant|>
# Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food!


In [None]:
# from mlx_lm import load, generate

# model, tokenizer = load("mistralai/Mistral-7B-v0.1")

# response = generate(model, tokenizer, prompt="hello", verbose=True)

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# NOTE: this cell rebinds `tokenizer` and `model`, replacing the BERT
# tokenizer and the scikit-learn classifier that earlier cells stored under
# the same names.
model_id = "mistralai/Mistral-7B-v0.1"  # or "amazon/MistralLite" for the optimized version
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Large download: the recorded output shows two weight shards, the first of
# them 9.94 GB.
model = AutoModelForCausalLM.from_pretrained(model_id)


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]