# Convert PGN to extended LAN (xLAN)

### Config conversion

In [None]:
from src.data_preprocessing.pgn_to_xlan import pgn_to_xlan


# Raw strings keep the Windows backslashes literal. In a normal string literal
# "\L", "\D" and "\l" are invalid escape sequences, which Python warns about.
pgn_path = r"D:\LEON Safe\Datasets\lichess_db_standard_rated_2014-09.pgn"
lan_path = r"D:\LEON Safe\Datasets\lichess_db_standard_rated_2014-09.xlan"

min_number_of_moves_per_game = 0
number_of_games_to_write = -1  # -1 for all games

# Configure the converter with the input PGN path and the output xLAN path.
pgn_to_lan = pgn_to_xlan(
    pgn_path,
    lan_path,
    min_number_of_moves_per_game=min_number_of_moves_per_game,
    number_of_games_to_write=number_of_games_to_write,
    generate_all_moves=False,
    log=False,
    xLanPlus=False,
)

# Run the conversion.
pgn_to_lan.convert_pgn_parallel()

# Check Common and Duplicate Lines


In [None]:
"""
Use check_duplicates_and_common_lines to check if there are duplicates or common lines in two files.
"""

from src.data_preprocessing.check_duplicates_and_common_lines import (
    check_duplicates_and_common_lines,
)

# Raw strings keep the Windows backslashes literal. In a normal string literal
# "\L", "\D" and "\l" are invalid escape sequences, which Python warns about.
training_file = r"D:\LEON Safe\Datasets\lichess_db_standard_rated_2023-09.xlanplus"
validation_file = (
    r"D:\LEON Safe\Datasets\lichess_db_standard_rated_2014-09_1000Lines.xlan"
)

# NOTE(review): the "dubplicates" spelling is kept on purpose; presumably it
# matches the parameter names of the imported function - confirm before renaming.
check_duplicates_and_common_lines(
    training_file,
    validation_file,
    delete_common=False,
    delete_dubplicates_from_file_1=True,
    delete_dubplicates_from_file_2=False,
)

# Tokenize Data


In [None]:
from src.tokenizer.tokenizer import tokenize_file

# Raw strings keep the Windows backslashes literal. In a normal string literal
# "\L", "\D" and "\l" are invalid escape sequences, which Python warns about.
xLAN_path = r"D:\LEON Safe\Datasets\lichess_db_standard_rated_2023-09.xlanplus"
token_path = "./src/tokenizer/xlanplus_tokens.json"
tokenized_path = r"D:\LEON Safe\Datasets\lichess_db_standard_rated_2023-09.tok"

# Tokenize the xLAN+ file using the token definitions in token_path.
tokenize_file(token_path, xLAN_path, tokenized_path, batch_size=20000)

# Detokenize Data

In [None]:
from src.tokenizer.detokenizer import detokenize_data

# Example input: one sequence of space-separated integer token ids.
tokens = "6 32 34 76 6 37 35 76 6 24 26 76 6 29 28 76 5 55 49 76 5 62 52 76 5 15 25 76 6 45 44 76 4 23 59 76 4 54 45 76 6 40 41 76 1 46 62 76 2 31 24 76 5 22 37 76 3 7 31 76 6 69 68 76 4 59 66 76 6 35 26 81 4 47 26 81 5 52 35 76 4 66 45 81 2 38 45 81 4 26 40 76 5 35 25 81 2 24 25 81 5 37 52 76 5 49 43 76 5 52 42 76 2 25 17 76 2 45 52 76 1 39 55 76 3 54 38 76 4 40 33 76 5 42 36 76 2 17 24 76 5 36 46 76 3 31 23 76 5 46 29 76 6 56 57 76 4 30 37 76 2 24 27 76 4 37 46 76 2 27 9 76 6 13 12 76 6 48 50 76 6 21 19 76 5 43 58 76 6 19 18 76 5 58 52 79 6 61 52 81 2 9 18 81 3 14 22 76 2 18 32 76 5 29 35 76 6 41 42 76 5 35 18 76 4 33 26 76 6 28 27 76 6 50 51 76 6 27 34 81 6 51 44 81 4 46 19 76 6 44 53 79 1 62 54 76 2 32 68 79 1 54 45 76 2 68 52 79 1 45 54 76 4 26 19 81 3 22 19 81 2 52 68 77 1 54 45 76 3 23 29 77 3 38 37 76 2 53 54 78 71 74"
# Token definition file handed to the detokenizer together with the ids.
token_path = "./src/tokenizer/xlanplus_tokens.json"

# Print whatever detokenize_data returns for the example sequence.
print(detokenize_data(tokens, token_path))

# Remove lines with more than x tokens


In [None]:
def remove_lines_with_too_many_tokens(
    input_file_path, output_file_path, token_limit=500
):
    """Copy a file, dropping every line that has more than ``token_limit`` tokens.

    A token is any whitespace-separated item on a line. The input is read
    completely before the output is opened, so both paths may point to the
    same file. The line counts of the input and of the output are printed.

    Args:
        input_file_path: Path of the text file to filter.
        output_file_path: Path the kept lines are written to (overwritten).
        token_limit: Maximum number of tokens a line may have to be kept.

    Returns:
        The number of lines that were removed.
    """
    # Explicit UTF-8 instead of the platform default, so the result does not
    # depend on the machine's locale.
    with open(input_file_path, "r", encoding="utf-8") as file:
        lines = file.readlines()

    print(f"Number of lines in {input_file_path}: {len(lines)}")

    lines_to_keep = [line for line in lines if len(line.split()) <= token_limit]
    removed_count = len(lines) - len(lines_to_keep)

    print(f"Number of lines in {output_file_path}: {len(lines_to_keep)}")
    with open(output_file_path, "w", encoding="utf-8") as file:
        file.writelines(lines_to_keep)

    return removed_count

In [None]:
# Raw strings keep the Windows backslashes literal. In a normal string literal
# "\L", "\D" and "\l" are invalid escape sequences, which Python warns about.
input_file_path = r"D:\LEON Safe\Datasets\lichess_db_standard_rated_2023-09.tok"
output_file_path = r"D:\LEON Safe\Datasets\lichess_db_standard_rated_2023-09_max_500.tok"
# Uses the function's default token limit.
removed_lines = remove_lines_with_too_many_tokens(input_file_path, output_file_path)
print(f"Number of removed lines: {removed_lines}")

# Create files with only one line for each starting sequence

### Break down a big dataset into smaller datasets with more variety

In [None]:
"""
    Remove Duplicate Lines by Start Sequence
    -------------------

    Removes duplicate lines from a file by comparing the first n tokens of each line.
    The first n tokens are called the start sequence.

    Parameters:

    lines: The lines to remove duplicates from.
    start_sequences: A list of integers. Each integer is the length of the start sequence.

    Returns:

    A dictionary with the start sequence length as key and the list of lines as value.

"""


def remove_duplicates_by_start_sequence(lines, start_sequences, debug=True):
    """Keep only the first line seen for each distinct start sequence.

    The start sequence of a line is its first ``n`` whitespace-separated
    tokens. For every ``n`` in ``start_sequences`` the lines are filtered
    independently, and a line is kept only if no earlier line had the same
    first ``n`` tokens. Kept lines are returned unmodified and in input order.

    Args:
        lines: The lines to filter.
        start_sequences: Start-sequence lengths (token counts) to filter by.
        debug: When true, print the original line count and the number of
            lines kept per start-sequence length.

    Returns:
        A dict mapping each start-sequence length to its list of kept lines.
    """
    kept_by_length = {n: [] for n in start_sequences}
    seen_by_length = {n: set() for n in start_sequences}

    for raw_line in lines:
        parts = raw_line.strip().split()
        for n in start_sequences:
            prefix = " ".join(parts[:n])
            seen = seen_by_length[n]
            if prefix in seen:
                # An earlier line already claimed this start sequence.
                continue
            seen.add(prefix)
            kept_by_length[n].append(raw_line)

    if debug:
        print("Original:", len(lines))
        for n in start_sequences:
            kept_count = len(kept_by_length[n])
            print(f"Stripped duplicates for first {n} tokens:", kept_count)

    return kept_by_length

In [None]:
# Raw strings keep the Windows backslashes literal. In a normal string literal
# "\L", "\D" and "\l" are invalid escape sequences, which Python warns about.
file_path = r"D:\LEON Safe\Datasets\lichess_db_standard_rated_2023-09_max_500.tok"
with open(file_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

# Start-sequence lengths (in tokens); one output file is written per length.
start_sequences = [13, 16, 20, 24]
results = remove_duplicates_by_start_sequence(lines, start_sequences, debug=True)

for length, saved_lines in results.items():
    # Raw f-string: backslashes stay literal while {length} is still interpolated.
    out_path = rf"D:\LEON Safe\Datasets\lichess_db_standard_rated_2023-09_max_500_{length}tokens.tok"
    with open(out_path, "w", encoding="utf-8") as file:
        file.writelines(saved_lines)

# Convert file from xLan to xLan+

In [None]:
from src.notation_converter import xlan_sequence_to_xlanplus


def convert_xlan_to_xlanplus(xlan_file, xlanplus_file):
    """Write a copy of ``xlan_file`` to ``xlanplus_file`` with converted lines.

    Blank lines and lines starting with ``#`` are copied unchanged. Every
    other line is passed (including its line ending) to
    ``xlan_sequence_to_xlanplus`` and the result is written followed by a
    newline. The input is read completely before the output is opened.

    Args:
        xlan_file: Path of the file to read.
        xlanplus_file: Path of the file to write (overwritten).
    """
    with open(xlan_file, "r") as source:
        xlan_lines = source.readlines()

    with open(xlanplus_file, "w") as target:
        for raw_line in xlan_lines:
            passthrough = not raw_line.strip() or raw_line.startswith("#")
            if passthrough:
                # Blank lines and comment lines are kept verbatim.
                target.write(raw_line)
            else:
                target.write(xlan_sequence_to_xlanplus(raw_line) + "\n")

In [None]:
# Input file and the output file the converted lines are written to.
xlan_file = "./data/validation/board_state/board_state_positions.lan"
xlanplus_file = "./data/validation/board_state/board_state_positions.xlanplus"
convert_xlan_to_xlanplus(xlan_file=xlan_file, xlanplus_file=xlanplus_file)