In [3]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install datasets
!pip install kagglehub

Collecting numpy
  Using cached numpy-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
Installing collected packages: numpy
Successfully installed numpy-2.1.3
Collecting pandas
  Using cached pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Collecting python-dateutil>=2.8.2
  Using cached python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
Collecting tzdata>=2022.7
  Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Collecting pytz>=2020.1
  Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Collecting six>=1.5
  Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: pytz, tzdata, six, python-dateutil, pandas
Successfully installed pandas-2.2.3 python-dateutil-2.9.0.post0 pytz-2024.2 six-1.16.0 tzdata-2024.2
Collecting matplotlib
  Using cached matplotlib-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.3 MB)
Collecting fonttools>=4.22.0
  Using cached fonttools-4

In [None]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt

In [None]:
from datasets import load_dataset

cot_ds = load_dataset("AI-MO/NuminaMath-CoT")

In [39]:
# Preprocess COT dataset

cot_ds['train'] = cot_ds['train'].remove_columns(['messages'])
cot_ds['test'] = cot_ds['test'].remove_columns(['messages'])
print(cot_ds)

DatasetDict({
    train: Dataset({
        features: ['source', 'problem', 'solution'],
        num_rows: 859494
    })
    test: Dataset({
        features: ['source', 'problem', 'solution'],
        num_rows: 100
    })
})


In [None]:
# Remove chinese characters from COT dataset
import re

def contains_chinese(text):
    # match Chinese characters
    pattern = re.compile(r'[\u4e00-\u9fff\u2e80-\u2eff\u31c0-\u31ef\uff00-\uffef]')
    return bool(pattern.search(text))

def filter_entries(dataset, fields):
    # Filter out entries that contain Chinese characters
    filtered_dataset = dataset.filter(lambda example: not any(contains_chinese(example[field]) for field in fields))
    return filtered_dataset

# remove entries with Chinese characters
fields_to_check = ['problem', 'solution']
cot_ds['train'] = filter_entries(cot_ds['train'], fields_to_check)
cot_ds['test'] = filter_entries(cot_ds['test'], fields_to_check)
print(cot_ds)

Filter:   0%|          | 0/850151 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['source', 'problem', 'solution'],
        num_rows: 850151
    })
    test: Dataset({
        features: ['source', 'problem', 'solution'],
        num_rows: 100
    })
})


In [42]:
# print first 10 entries for COT dataset

print(cot_ds)
for split in cot_ds:
  print(f"First 10 entries of the {split} split:")
  for i in range(10):
    print(cot_ds[split][i])
  print("-" * 20)

DatasetDict({
    train: Dataset({
        features: ['source', 'problem', 'solution'],
        num_rows: 859494
    })
    test: Dataset({
        features: ['source', 'problem', 'solution'],
        num_rows: 100
    })
})
First 10 entries of the train split:
{'source': 'synthetic_math', 'problem': 'Consider the terms of an arithmetic sequence: $-\\frac{1}{3}, y+2, 4y, \\ldots$. Solve for $y$.', 'solution': 'For an arithmetic sequence, the difference between consecutive terms must be equal. Therefore, we can set up the following equations based on the sequence given:\n\\[ (y + 2) - \\left(-\\frac{1}{3}\\right) = 4y - (y+2) \\]\n\nSimplify and solve these equations:\n\\[ y + 2 + \\frac{1}{3} = 4y - y - 2 \\]\n\\[ y + \\frac{7}{3} = 3y - 2 \\]\n\\[ \\frac{7}{3} + 2 = 3y - y \\]\n\\[ \\frac{13}{3} = 2y \\]\n\\[ y = \\frac{13}{6} \\]\n\nThus, the value of $y$ that satisfies the given arithmetic sequence is $\\boxed{\\frac{13}{6}}$.'}
{'source': 'synthetic_math', 'problem': 'Suppose that 

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mathurinache/math-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mathurinache/math-dataset?dataset_version_number=1...


100%|██████████| 7.07M/7.07M [00:00<00:00, 67.5MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mathurinache/math-dataset/versions/1


In [None]:
# Preprocess MATH dataset
import os
from datasets import Dataset, DatasetDict

def load_json_files(data_dir):
    """Loads JSON files from a directory into a Dataset."""
    all_data = []
    problems = 0
    for subdir in os.listdir(data_dir):
      subdir_path = os.path.join(data_dir, subdir)
      for filename in os.listdir(subdir_path):
        if filename.endswith(".json"):
          problems += 1
          filepath = os.path.join(subdir_path, filename)
          with open(filepath, "r") as f:
            all_data.append(json.load(f))
    # Create a Pandas DataFrame to easily convert into a Dataset\
    print(f"Loaded {problems} problems.")
    return all_data

# Assuming 'path' is from kagglehub.dataset_download
math_dir = os.path.join(path, "MATH")
train_dir = os.path.join(math_dir, "train")
test_dir = os.path.join(math_dir, "test")

train_data = load_json_files(train_dir)
test_data = load_json_files(test_dir)

# Convert the train and test data into Dataset objects
train_dataset = Dataset.from_dict({
    "problem": [item["problem"] for item in train_data],
    "level": [item["level"] for item in train_data],
    "type": [item["type"] for item in train_data],
    "solution": [item["solution"] for item in train_data]
})

test_dataset = Dataset.from_dict({
    "problem": [item["problem"] for item in test_data],
    "level": [item["level"] for item in test_data],
    "type": [item["type"] for item in test_data],
    "solution": [item["solution"] for item in test_data]
})

math_ds = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})



Loaded 7500 problems.
Loaded 5000 problems.


In [25]:
# print first 10 entries for MATH dataset

print(math_ds)
for split in math_ds:
  print(f"First 10 entries of the {split} split:")
  for i in range(10):
    print(math_ds[split][i])
  print("-" * 20)

DatasetDict({
    train: Dataset({
        features: ['problem', 'level', 'type', 'solution'],
        num_rows: 7500
    })
    test: Dataset({
        features: ['problem', 'level', 'type', 'solution'],
        num_rows: 5000
    })
})
First 10 entries of the train split:
{'problem': 'In the diagram, $\\triangle ABE$, $\\triangle BCE$ and $\\triangle CDE$ are right-angled, with $\\angle AEB=\\angle BEC = \\angle CED = 60^\\circ$, and $AE=24$. [asy]\npair A, B, C, D, E;\nA=(0,20.785);\nB=(0,0);\nC=(9,-5.196);\nD=(13.5,-2.598);\nE=(12,0);\ndraw(A--B--C--D--E--A);\ndraw(B--E);\ndraw(C--E);\nlabel("A", A, N);\nlabel("B", B, W);\nlabel("C", C, SW);\nlabel("D", D, dir(0));\nlabel("E", E, NE);\n[/asy] Find the perimeter of quadrilateral $ABCD.$', 'level': 'Level 3', 'type': 'Geometry', 'solution': 'Recognizing that all our triangles in the diagram are 30-60-90 triangles, we recall that the ratio of the longer leg to the hypotenuse in such a triangle is $\\frac{\\sqrt{3}}{2}$. Therefore, we 

In [2]:
!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install --upgrade torchtext==0.12

Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Using cached https://download.pytorch.org/whl/cu118/torch-2.5.1%2Bcu118-cp310-cp310-linux_x86_64.whl (838.3 MB)
Collecting torchvision
  Using cached https://download.pytorch.org/whl/cu118/torchvision-0.20.1%2Bcu118-cp310-cp310-linux_x86_64.whl (6.5 MB)
Collecting torchaudio
  Using cached https://download.pytorch.org/whl/cu118/torchaudio-2.5.1%2Bcu118-cp310-cp310-linux_x86_64.whl (3.3 MB)
Collecting nvidia-cusparse-cu11==11.7.5.86
  Using cached https://download.pytorch.org/whl/cu118/nvidia_cusparse_cu11-11.7.5.86-py3-none-manylinux1_x86_64.whl (204.1 MB)
Collecting nvidia-cuda-runtime-cu11==11.8.89
  Using cached https://download.pytorch.org/whl/cu118/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)
Collecting nvidia-cusolver-cu11==11.4.1.48
  Using cached https://download.pytorch.org/whl/cu118/nvidia_cusolver_cu11-11.4.1.48-py3-none-manylinux1_x86_64.whl (128.2 MB)
Collecting nvidia

In [None]:
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [None]:
import torch.optim as optim
import torch.nn as nn

In [None]:
import torchtext

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab, build_vocab_from_iterator