In [None]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install datasets
!pip install kagglehub

In [None]:
!pip install transformers
!pip install accelerate
!pip install latex2sympy

In [None]:
!pip install --upgrade torch torchvision torchaudio
!pip install --upgrade torchtext

In [4]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt

In [49]:
from datasets import load_dataset

cot_ds = load_dataset("AI-MO/NuminaMath-CoT")

print("Before preprocessing")
print(cot_ds)

# Preprocess COT dataset
cot_ds['train'] = cot_ds['train'].remove_columns(['messages'])
cot_ds['test'] = cot_ds['test'].remove_columns(['messages'])
cot_ds['train'] = cot_ds['train'].remove_columns(['source'])
cot_ds['test'] = cot_ds['test'].remove_columns(['source'])
print("After preprocessing")
print(cot_ds)

In [52]:
# Remove chinese characters from COT dataset
import re

def contains_chinese(text):
    # match Chinese characters
    pattern = re.compile(r'[\u4e00-\u9fff\u2e80-\u2eff\u31c0-\u31ef\uff00-\uffef]')
    return bool(pattern.search(text))

def filter_entries(dataset, fields):
    # Filter out entries that contain Chinese characters
    filtered_dataset = dataset.filter(lambda example: not any(contains_chinese(example[field]) for field in fields))
    return filtered_dataset

# remove entries with Chinese characters
fields_to_check = ['problem', 'solution']
cot_ds['train'] = filter_entries(cot_ds['train'], fields_to_check)
cot_ds['test'] = filter_entries(cot_ds['test'], fields_to_check)
print(cot_ds)

Filter:   0%|          | 0/859494 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 850151
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 100
    })
})


In [72]:
def print_entries(dataset, num_entries=10):
  for split in dataset:
    print(f"First {num_entries} entries of the {split} split:")
    for i in range(num_entries):
      print(dataset[split][i])
    print("-" * 20)

In [73]:
# print first 10 entries for COT dataset

print(cot_ds)
print_entries(cot_ds)

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 850151
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 25100
    })
})
First 10 entries of the train split:
{'problem': 'Consider the terms of an arithmetic sequence: $-\\frac{1}{3}, y+2, 4y, \\ldots$. Solve for $y$.', 'solution': 'For an arithmetic sequence, the difference between consecutive terms must be equal. Therefore, we can set up the following equations based on the sequence given:\n\\[ (y + 2) - \\left(-\\frac{1}{3}\\right) = 4y - (y+2) \\]\n\nSimplify and solve these equations:\n\\[ y + 2 + \\frac{1}{3} = 4y - y - 2 \\]\n\\[ y + \\frac{7}{3} = 3y - 2 \\]\n\\[ \\frac{7}{3} + 2 = 3y - y \\]\n\\[ \\frac{13}{3} = 2y \\]\n\\[ y = \\frac{13}{6} \\]\n\nThus, the value of $y$ that satisfies the given arithmetic sequence is $\\boxed{\\frac{13}{6}}$.'}
{'problem': 'Suppose that $g(x) = 5x - 3$. What is $g^{-1}(g^{-1}(14))$?', 'solution': 'First, we ne

In [54]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mathurinache/math-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/mathurinache/math-dataset/versions/1


In [55]:
# Preprocess MATH dataset (load all json files into into Dataset object)

import os
from datasets import Dataset, DatasetDict

def load_json_files(data_dir):
    """Loads JSON files from a directory into a Dataset."""
    all_data = []
    problems = 0
    for subdir in os.listdir(data_dir):
      subdir_path = os.path.join(data_dir, subdir)
      for filename in os.listdir(subdir_path):
        if filename.endswith(".json"):
          problems += 1
          filepath = os.path.join(subdir_path, filename)
          with open(filepath, "r") as f:
            all_data.append(json.load(f))
    # Create a Pandas DataFrame to easily convert into a Dataset\
    print(f"Loaded {problems} problems.")
    return all_data

# Assuming 'path' is from kagglehub.dataset_download
math_dir = os.path.join(path, "MATH")
train_dir = os.path.join(math_dir, "train")
test_dir = os.path.join(math_dir, "test")

train_data = load_json_files(train_dir)
test_data = load_json_files(test_dir)

# Convert the train and test data into Dataset objects
train_dataset = Dataset.from_dict({
    "problem": [item["problem"] for item in train_data],
    # "level": [item["level"] for item in train_data],
    # "type": [item["type"] for item in train_data],
    "solution": [item["solution"] for item in train_data]
})

test_dataset = Dataset.from_dict({
    "problem": [item["problem"] for item in test_data],
    # "level": [item["level"] for item in test_data],
    # "type": [item["type"] for item in test_data],
    "solution": [item["solution"] for item in test_data]
})

math_ds = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})



Loaded 7500 problems.
Loaded 5000 problems.


In [65]:
# print first 10 entries for COT dataset

print(math_ds)
print_entries(math_ds, 10)

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 7500
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 5000
    })
})


In [62]:
# Add MATH dataset as testing data

from datasets import concatenate_datasets

# Concatenate the datasets
merged_math = concatenate_datasets([math_ds['train'], math_ds['test']])
cot_ds['test'] = concatenate_datasets([cot_ds['test'], merged_math])

ds = cot_ds
print(ds)

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution'],
        num_rows: 850151
    })
    test: Dataset({
        features: ['problem', 'solution'],
        num_rows: 25100
    })
})


In [77]:
# Tokenize Data

from transformers import AutoTokenizer
model_name = "tbs17/MathBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ds_tokenized = ds.map(lambda entries: tokenizer(entries["problem"]), batched=True)
ds_tokenized = ds.map(lambda entries: tokenizer(entries['problem'], entries['solution']), batched=True)

print(ds_tokenized)

Map:   0%|          | 0/850151 [00:00<?, ? examples/s]

Map:   0%|          | 0/25100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['problem', 'solution', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 850151
    })
    test: Dataset({
        features: ['problem', 'solution', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25100
    })
})


In [78]:
print_entries(ds_tokenized)

First 10 entries of the train split:
{'problem': 'Consider the terms of an arithmetic sequence: $-\\frac{1}{3}, y+2, 4y, \\ldots$. Solve for $y$.', 'solution': 'For an arithmetic sequence, the difference between consecutive terms must be equal. Therefore, we can set up the following equations based on the sequence given:\n\\[ (y + 2) - \\left(-\\frac{1}{3}\\right) = 4y - (y+2) \\]\n\nSimplify and solve these equations:\n\\[ y + 2 + \\frac{1}{3} = 4y - y - 2 \\]\n\\[ y + \\frac{7}{3} = 3y - 2 \\]\n\\[ \\frac{7}{3} + 2 = 3y - y \\]\n\\[ \\frac{13}{3} = 2y \\]\n\\[ y = \\frac{13}{6} \\]\n\nThus, the value of $y$ that satisfies the given arithmetic sequence is $\\boxed{\\frac{13}{6}}$.', 'input_ids': [101, 5136, 1996, 3408, 1997, 2019, 20204, 5537, 1024, 1002, 1011, 1032, 25312, 2278, 1063, 1015, 1065, 1063, 1017, 1065, 1010, 1061, 1009, 1016, 1010, 1018, 2100, 1010, 1032, 25510, 12868, 1002, 1012, 9611, 2005, 1002, 1061, 1002, 1012, 102, 2005, 2019, 20204, 5537, 1010, 1996, 4489, 2090, 54

In [40]:
from latex2sympy import latex2sympy
from sympy import symbols, Eq, solve
import re

problem = cot_ds['train'][0]['problem']
print(problem)

# Convert LaTeX to SymPy expression
parts = re.split(r'(?<!\\)\$(.*?)(?<!\\)\$', problem)
sympy_parts = []

for part in parts:
    if re.match(r'(?<!\\)\$(.*?)(?<!\\)\$', part):  # Check if LaTeX
        try:
            sympy_expr = latex2sympy(part[1:-1])  # Remove $ signs
            sympy_parts.append(sympy_expr)
        except Exception as e:
            print(f"Error converting LaTeX to SymPy: {e}")
            sympy_parts.append(part)  # Keep original if conversion fails
    else:
        sympy_parts.append(part)


# Assuming the equation is the second SymPy expression in sympy_parts
equation = sympy_parts[1]

# Define the variable 'y'
y = symbols('y')

# # Solve the equation for 'y'
# solutions = solve(equation, y)

# # Print the solutions
# print("Solutions for y:", solutions)

Consider the terms of an arithmetic sequence: $-\frac{1}{3}, y+2, 4y, \ldots$. Solve for $y$.
['Consider the terms of an arithmetic sequence: ', '-\\frac{1}{3}, y+2, 4y, \\ldots', '. Solve for ', 'y', '.']


In [None]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
import torch.nn as nn
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab, build_vocab_from_iterator