# Data: modular addition

We will start with a problem that consists of adding or subtracting two numbers modulo another number.

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
def addition(x, y, z):
    """Return the sum of `x` and `y` reduced modulo `z`."""
    total = x + y
    return total % z

def substraction(x, y, z):
    """Return the difference `x - y` reduced modulo `z`."""
    difference = x - y
    return difference % z

We will consider two scenarios:
- z is fixed to be 60.
- z could be 2, 3, 4, 5 or 6.

Note that because 60 is a multiple of every other modulus (2, 3, 4, 5 and 6 all divide 60), the second task can be solved by solving the first task and then learning the modulo operation `x % z`.

## Tokenization
We are going to use language modeling to solve this problem.
To do so, we need to define a vocabulary.
We will work in base 60 and consider each number as a token.
We will also include the tokens `+` and `-`.

In [3]:
# Vocabulary: the integers 0..59 followed by the two operator symbols.
tokens_list = list(range(60)) + ['+', '-']
vocab_size = len(tokens_list)

# Map every token to its position in `tokens_list`.
tokens_id = {token: index for index, token in enumerate(tokens_list)}

## Dataset generation

We will use a simple data-generation scheme that enumerates all possible pairwise operations.

For the case where `z=60` is fixed, we only consider addition for the moment.

In [4]:
import csv
import os
from llmtuto.config import DATA_DIR


# Enumerate every ordered pair (x, y) with 0 <= x, y < 60 and store one row of
# token ids per pair: [x, y, (x + y) % 60].
data = []
for x in range(60):
    for y in range(60):
        out = addition(x, y, 60)
        data.append([tokens_id[x], tokens_id[y], tokens_id[out]])

        # Alternative encodings with an explicit operator token (currently disabled):
        # out = addition(x, y, 60)
        # data.append([tokens_id[x], tokens_id['+'], tokens_id[y], tokens_id[out]])

        # out = substraction(x, y, 60)
        # data.append([tokens_id[x], tokens_id['-'], tokens_id[y], tokens_id[out]])


save_dir = DATA_DIR / 'single_base'
os.makedirs(save_dir, exist_ok=True)

# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (Windows) emit '\r\r\n' and the file ends up with
# a blank line after every row.
with open(save_dir / 'data.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(data)

In [5]:
# For every ordered pair (x, y) with 0 <= x, y < 60 and every modulus z in `zs`,
# store two rows of token ids:
#   [x, '+', y, z, (x + y) % z]  and  [x, '-', y, z, (x - y) % z].
data = []
zs = [2, 3, 4, 5, 6]

for x in range(60):
    for y in range(60):
        for z in zs:
            out = addition(x, y, z)
            data.append([tokens_id[x], tokens_id['+'], tokens_id[y], tokens_id[z], tokens_id[out]])

            out = substraction(x, y, z)
            data.append([tokens_id[x], tokens_id['-'], tokens_id[y], tokens_id[z], tokens_id[out]])


save_dir = DATA_DIR / 'multi_base'
os.makedirs(save_dir, exist_ok=True)

# newline='' hands line-ending control to the csv module; without it, platforms
# that translate '\n' on write (Windows) emit '\r\r\n' and the file ends up with
# a blank line after every row.
with open(save_dir / 'data.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(data)

## Train/test split

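We keep roughly 80 percent of the data for training (each example is assigned to the training set independently with probability 0.8), randomly permute the training examples, and save the result as torch tensors.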

In [6]:
import numpy as np


def count_file_size(file_path: str) -> (int, int):
    """Count the lines and columns of a csv file.

    Parameters
    ----------
    file_path: str or path-like
        Path of the csv file to inspect.

    Returns
    -------
    line_count: int
        Number of lines in the file (0 for an empty file).
    column_count: int
        Number of comma-separated fields on the first line (0 for an empty
        file). Fields are split on raw commas; csv quoting is not interpreted.
    """
    line_count = column_count = 0
    with open(file_path) as f:
        for line in f:
            # The column count is taken once, from the first line, without
            # consuming any extra line from the file.
            if line_count == 0:
                column_count = len(line.split(','))
            line_count += 1
    return line_count, column_count


def get_datasplit(problem, train_percentage, rng, dtype=np.int32):
    """ Split dataset into train and test set.

    Each row of ``DATA_DIR / problem / 'data.csv'`` is assigned to the training
    set independently with probability `train_percentage`, so the realised
    train fraction is only approximately `train_percentage`. The training rows
    are then randomly permuted; the test rows keep their order in the file.
    The last column of every row is the target, the other columns the input.

    Parameters
    ----------
    problem: str
        Name of the problem, either 'single_base' or 'multi_base'.
    train_percentage: float
        Probability, between 0 and 1, that a row is used for training.
    rng: numpy.random.Generator
        Random number generator.
    dtype: numpy dtype, optional
        Data type of the returned arrays. Defaults to ``numpy.int32``.

    Returns
    -------
    x_train: numpy.ndarray
        Training input data.
    y_train: numpy.ndarray
        Training output data.
    x_test: numpy.ndarray
        Test input data.
    y_test: numpy.ndarray
        Test output data.

    Raises
    ------
    FileNotFoundError
        If the raw csv file of the problem does not exist.
    """

    save_dir = DATA_DIR / problem
    try:
        n_data, seq_dim = count_file_size(save_dir / 'data.csv')
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Raw data file {save_dir}/data.csv not found.") from err

    # One Bernoulli draw per row decides whether it goes to the training set.
    n_train_bool_idx = rng.random(n_data) < train_percentage
    # Keep the counts as plain Python ints: they are array shapes and must not
    # be cast to `dtype`, which may be a non-integer type.
    n_train = int(np.count_nonzero(n_train_bool_idx))
    n_test = n_data - n_train

    train = np.empty((n_train, seq_dim), dtype=dtype)
    test = np.empty((n_test, seq_dim), dtype=dtype)
    # newline='' is the mode recommended by the csv module for files passed to csv.reader.
    with open(save_dir / 'data.csv', newline='') as f:
        csv_reader = csv.reader(f)
        train_idx = test_idx = 0
        for i, row in enumerate(csv_reader):
            # `row` is a list of strings; numpy converts it to `dtype` on assignment.
            if n_train_bool_idx[i]:
                train[train_idx] = row
                train_idx += 1
            else:
                test[test_idx] = row
                test_idx += 1

    # Only the training rows are permuted.
    train = rng.permutation(train, axis=0)
    x_train, y_train = train[:, :-1], train[:, -1]
    x_test, y_test = test[:, :-1], test[:, -1]

    return x_train, y_train, x_test, y_test


#------------------------------#
# Save data as pytorch tensors #
#------------------------------#

import torch

train_percentage = 0.8
rng = np.random.default_rng(12345)

# The same generator is shared by both problems, so the second split depends
# on the random draws consumed by the first one.
for problem in ['single_base', 'multi_base']:
    splits = get_datasplit(problem, train_percentage, rng)

    # Convert every numpy split to a torch tensor, keeping the usual names bound.
    x_train, y_train, x_test, y_test = (torch.tensor(split) for split in splits)

    out_dir = DATA_DIR / problem
    named_tensors = zip(
        ('x_train', 'y_train', 'x_test', 'y_test'),
        (x_train, y_train, x_test, y_test),
    )
    for name, tensor in named_tensors:
        torch.save(tensor, out_dir / f'{name}.pt')