In [1]:
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install datasets
%pip install kagglehub
%pip install transformers
%pip install accelerate
%pip install latex2sympy
%pip install --upgrade torch torchvision torchaudio
%pip install --upgrade torchtext

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (1

In [2]:
def print_entries(dataset, start=0, end=10, split=""):
  if split == "":
    for split in dataset:
      print(f"Entries {start+1} - {end} of the {split} data:")
      for i in range(start, end):
        print(dataset[split][i])
      print("-" * 20)
  else:
    print(f"Entries {start+1} - {end} of the {split} data:")
    for i in range(start, end):
      print(dataset[split][i])
    print("-" * 20)

In [3]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt

In [4]:
from datasets import load_dataset

cot_ds = load_dataset("AI-MO/NuminaMath-CoT")

print("Before preprocessing")
print(cot_ds)
print()

# Preprocess COT dataset
cot_ds['train'] = cot_ds['train'].remove_columns(['messages'])
cot_ds['test'] = cot_ds['test'].remove_columns(['messages'])
cot_ds['train'] = cot_ds['train'].remove_columns(['source'])
cot_ds['test'] = cot_ds['test'].remove_columns(['source'])
print("After preprocessing")
print(cot_ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/2.68k [00:00<?, ?B/s]

train-00000-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00001-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00002-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00003-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

train-00004-of-00005.parquet:   0%|          | 0.00/247M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/166k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/859494 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Before preprocessing
DatasetDict({
    train: Dataset({
        features: ['source', 'problem', 'solution', 'messages'],
        num_rows: 859494
    })
    test: Dataset({
        features: ['source', 'problem', 'solution', 'messages'],
        num_rows: 100
    })
})

After preprocessing
DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 859494
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 100
    })
})


In [5]:
# Remove chinese characters from COT dataset
import re

def contains_chinese(text):
    # match Chinese characters
    pattern = re.compile(r'[\u4e00-\u9fff\u2e80-\u2eff\u31c0-\u31ef\uff00-\uffef]')
    return bool(pattern.search(text))

def filter_entries(dataset, fields):
    # Filter out entries that contain Chinese characters
    filtered_dataset = dataset.filter(lambda example: not any(contains_chinese(example[field]) for field in fields))
    return filtered_dataset

# remove entries with Chinese characters
fields_to_check = ['problem', 'solution']
cot_ds['train'] = filter_entries(cot_ds['train'], fields_to_check)
cot_ds['test'] = filter_entries(cot_ds['test'], fields_to_check)
print(cot_ds)

Filter:   0%|          | 0/859494 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 850151
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 100
    })
})


In [6]:
# print first 10 entries for COT dataset

print(cot_ds)
print_entries(cot_ds)

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 850151
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 100
    })
})
Entries 1 - 10 of the train data:
{'problem': 'Consider the terms of an arithmetic sequence: $-\\frac{1}{3}, y+2, 4y, \\ldots$. Solve for $y$.', 'solution': 'For an arithmetic sequence, the difference between consecutive terms must be equal. Therefore, we can set up the following equations based on the sequence given:\n\\[ (y + 2) - \\left(-\\frac{1}{3}\\right) = 4y - (y+2) \\]\n\nSimplify and solve these equations:\n\\[ y + 2 + \\frac{1}{3} = 4y - y - 2 \\]\n\\[ y + \\frac{7}{3} = 3y - 2 \\]\n\\[ \\frac{7}{3} + 2 = 3y - y \\]\n\\[ \\frac{13}{3} = 2y \\]\n\\[ y = \\frac{13}{6} \\]\n\nThus, the value of $y$ that satisfies the given arithmetic sequence is $\\boxed{\\frac{13}{6}}$.'}
{'problem': 'Suppose that $g(x) = 5x - 3$. What is $g^{-1}(g^{-1}(14))$?', 'solution': 'First, we need to

In [7]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mathurinache/math-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mathurinache/math-dataset?dataset_version_number=1...


100%|██████████| 7.07M/7.07M [00:00<00:00, 52.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mathurinache/math-dataset/versions/1


In [8]:
# Preprocess MATH dataset (load all json files into into Dataset object)

import os
from datasets import Dataset, DatasetDict

def load_json_files(data_dir):
    """Loads JSON files from a directory into a Dataset."""
    all_data = []
    problems = 0
    for subdir in os.listdir(data_dir):
      subdir_path = os.path.join(data_dir, subdir)
      for filename in os.listdir(subdir_path):
        if filename.endswith(".json"):
          problems += 1
          filepath = os.path.join(subdir_path, filename)
          with open(filepath, "r") as f:
            all_data.append(json.load(f))
    # Create a Pandas DataFrame to easily convert into a Dataset\
    print(f"Loaded {problems} problems.")
    return all_data

# Assuming 'path' is from kagglehub.dataset_download
math_dir = os.path.join(path, "MATH")
train_dir = os.path.join(math_dir, "train")
test_dir = os.path.join(math_dir, "test")

train_data = load_json_files(train_dir)
test_data = load_json_files(test_dir)

# Convert the train and test data into Dataset objects
train_dataset = Dataset.from_dict({
    "problem": [item["problem"] for item in train_data],
    # "level": [item["level"] for item in train_data],
    # "type": [item["type"] for item in train_data],
    "solution": [item["solution"] for item in train_data]
})

test_dataset = Dataset.from_dict({
    "problem": [item["problem"] for item in test_data],
    # "level": [item["level"] for item in test_data],
    # "type": [item["type"] for item in test_data],
    "solution": [item["solution"] for item in test_data]
})

math_ds = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})



Loaded 7500 problems.
Loaded 5000 problems.


In [9]:
# print first 10 entries for COT dataset

print(math_ds)
print_entries(math_ds)

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 7500
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 5000
    })
})
Entries 1 - 10 of the train data:
{'problem': 'Two lines are perpendicular.  One line has a direction vector of $\\begin{pmatrix} 3 \\\\ -7 \\end{pmatrix}.$  The other line has a direction vector of $\\begin{pmatrix} a \\\\ 2 \\end{pmatrix}.$  Find $a.$', 'solution': 'Since the two lines are perpendicular, their direction vectors are orthogonal.  This means that the dot product of the direction vectors is 0:\n\\[\\begin{pmatrix} 3 \\\\ -7 \\end{pmatrix} \\cdot \\begin{pmatrix} a \\\\ 2 \\end{pmatrix} = 0.\\]Then $3a - 14 = 0,$ so $a = \\boxed{\\frac{14}{3}}.$'}
{'problem': 'Let $\\mathbf{a}$ and $\\mathbf{b}$ be vectors, and let $\\mathbf{m}$ be the midpoint of $\\mathbf{a}$ and $\\mathbf{b}.$  Given $\\mathbf{m} = \\begin{pmatrix} 3 \\\\ 7 \\end{pmatrix}$ and $\\mathbf{a} \\cdot \\mat

In [10]:
from datasets import concatenate_datasets

# Make validation dataset
train_valid_split = cot_ds['train'].train_test_split(test_size=0.1)
cot_ds['train'] = train_valid_split['train']
cot_ds['test'] = train_valid_split['test']

train_valid_split = cot_ds['train'].train_test_split(test_size=0.12)
cot_ds['train'] = train_valid_split['train']
cot_ds['validation'] = train_valid_split['test']

# Add MATH dataset as test dataset
merged_math = concatenate_datasets([math_ds['train'], math_ds['test']])
cot_ds['test'] = concatenate_datasets([cot_ds['test'], merged_math])

ds = cot_ds
print(ds)

print()
print("Split")
print("train:", len(ds['train']) / ( len(ds['train']) + len(ds['validation']) + len(ds['test']) ))
print("test:", len(ds['test']) / ( len(ds['train']) + len(ds['validation']) + len(ds['test']) ))
print("validation:", len(ds['validation']) / ( len(ds['train']) + len(ds['validation']) + len(ds['test']) ))

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 673318
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 97516
    })
    validation: Dataset({
        features: ['problem', 'solution'],
        num_rows: 91817
    })
})

Split
train: 0.7805219028320839
test: 0.11304223840232029
validation: 0.10643585876559582


In [11]:
print(ds)
print("train:", len(ds['train']) / ( len(ds['train']) + len(ds['validation']) + len(ds['test']) ))
print("test:", len(ds['test']) / ( len(ds['train']) + len(ds['validation']) + len(ds['test']) ))
print("validation:", len(ds['validation']) / ( len(ds['train']) + len(ds['validation']) + len(ds['test']) ))

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 673318
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 97516
    })
    validation: Dataset({
        features: ['problem', 'solution'],
        num_rows: 91817
    })
})
train: 0.7805219028320839
test: 0.11304223840232029
validation: 0.10643585876559582


In [12]:
# # Tokenize Data

# from transformers import AutoTokenizer
# model_name = "tbs17/MathBERT"
# tokenizer = AutoTokenizer.from_pretrained(model_name)

# ds = ds.map(lambda entries: tokenizer(entries['problem'], entries['solution']), batched=True)

# print(ds)

In [13]:
print(ds)
print_entries(ds)

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 673318
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 97516
    })
    validation: Dataset({
        features: ['problem', 'solution'],
        num_rows: 91817
    })
})
Entries 1 - 10 of the train data:
{'problem': 'In the following image, there are equilateral triangles $ABC$, $DBE$, $IEF$, and $HIG$. The areas of triangles $DBE$, $IEF$, and $HIG$ are in the ratio $9:16:4$. In what ratio are\n\n1. the lengths of the segments $HI$ and $IE$,\n2. the areas of triangles $ABC$ and $HEC$?\n\n(K. Pazourek)', 'solution': "\nLet's analyze the given problem using geometric principles and the properties of equilateral triangles.\n\nGiven that the areas of triangles $\\triangle DBE$, $\\triangle IEF$, and $\\triangle HIG$ are in the ratio 9:16:4, we need to determine two things:\n1. The ratio of the lengths of segments $HI$ and $IE$.\n2. The ratio of the areas 

In [None]:
# Generate embeddings
import torch
from transformers import AutoModel, AutoTokenizer
model_name = "tbs17/MathBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)  # Tokenizer
model = AutoModel.from_pretrained(model_name)  # Model for embeddings


def generate_embeddings(text):
  inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
  with torch.no_grad():  # Disable gradient calcuim tokenizing first, then lation during inference
    outputs = model(**inputs)
  embeddings = outputs.last_hidden_state[:, 0, :]  # Get the [CLS] token embedding
  return embeddings

ds = ds.map(lambda entries: {
    'problem_embeddings': generate_embeddings(entries['problem']),
    'solution_embeddings': generate_embeddings(entries['solution'])
}, batched=True)

torch.save(model.state_dict(), 'mathbert_weights.pth')
# model.load_state_dict(torch.load('mathbert_weights.pth'))

print(ds)

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/441M [00:00<?, ?B/s]

Map:   0%|          | 0/673318 [00:00<?, ? examples/s]

In [None]:
print(ds)

In [None]:
from latex2sympy import latex2sympy
from sympy import symbols, Eq, solve
import re

problem = cot_ds['train'][0]['problem']
print(problem)

# Convert LaTeX to SymPy expression
parts = re.split(r'(?<!\\)\$(.*?)(?<!\\)\$', problem)
sympy_parts = []

for part in parts:
    if re.match(r'(?<!\\)\$(.*?)(?<!\\)\$', part):  # Check if LaTeX
        try:
            sympy_expr = latex2sympy(part[1:-1])  # Remove $ signs
            sympy_parts.append(sympy_expr)
        except Exception as e:
            print(f"Error converting LaTeX to SymPy: {e}")
            sympy_parts.append(part)  # Keep original if conversion fails
    else:
        sympy_parts.append(part)


# Assuming the equation is the second SymPy expression in sympy_parts
equation = sympy_parts[1]

# Define the variable 'y'
y = symbols('y')

# # Solve the equation for 'y'
# solutions = solve(equation, y)

# # Print the solutions
# print("Solutions for y:", solutions)

In [None]:
# import torch
# from torch.utils.data import DataLoader
# from torch.nn.utils.rnn import pad_sequence
# import torch.optim as optim
# import torch.nn as nn
# import torchtext
# from torchtext.data.utils import get_tokenizer
# from torchtext.vocab import Vocab, build_vocab_from_iterator