In [2]:
from tinygrad.tensor import Tensor
from tinygrad.nn import Embedding
from datasets import load_dataset
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt


In [3]:
# Load the dataset
dataset = load_dataset("MathLLMs/MathVision")

# Example: Focusing on specific fields in the dataset (e.g., text, labels)
df = pd.DataFrame(dataset['train'])  # Assuming 'train' is the split
text_data = df['text'].tolist()

# Tokenization and processing
def tokenize_text(text_list):
    return [list(map(ord, text)) for text in text_list]  # Tokenizing text as character ordinals

tokens = tokenize_text(text_data)
max_length = max(len(seq) for seq in tokens)


KeyError: 'train'

In [None]:
def pad_sequence(seq, max_len):
    return seq + [0] * (max_len - len(seq))

padded_tokens = [pad_sequence(seq, max_length) for seq in tokens]
token_tensor = Tensor(np.array(padded_tokens, dtype=np.float32))


In [None]:
class SimpleModel:
    def __init__(self, vocab_size, embed_dim):
        self.embedding = Embedding(vocab_size, embed_dim)

    def forward(self, x):
        return self.embedding(x)

# Assume the vocabulary size based on tokenization
vocab_size = 128  # Example for ASCII character set
embed_dim = 50  # Embedding dimensionality
model = SimpleModel(vocab_size, embed_dim)

# Pass token tensor through the embedding layer
embedded_data = model.forward(token_tensor)


In [None]:
# Extracting embeddings as feature vectors
features = embedded_data.data  # raw embedding vectors from TinyGrad Tensor

# Reducing the dimensionality for visualization using PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
reduced_features = pca.fit_transform(features)


In [None]:
# Applying KMeans clustering
kmeans = KMeans(n_clusters=5)  # Choose number of clusters based on the problem domain
kmeans.fit(reduced_features)

# Add cluster labels to the data
df['cluster'] = kmeans.labels_


In [None]:
plt.scatter(reduced_features[:, 0], reduced_features[:, 1], c=kmeans.labels_, cmap='viridis')
plt.title('Clustered Representation of Embedded Data')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()


In [None]:
def train_step(model, data, target, optimizer):
    out = model.forward(data)
    loss = ((out - target) ** 2).mean()  # Simple MSE loss for this example
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss

# Simple gradient descent optimizer for demonstration
from tinygrad.nn.optim import SGD

optimizer = SGD([model.embedding.weight], lr=0.001)

# Training loop
epochs = 10
for epoch in range(epochs):
    loss = train_step(model, token_tensor, target=token_tensor, optimizer=optimizer)
    print(f"Epoch {epoch+1}, Loss: {loss}")
