# **Transformer-Based Chess Engine**

### **References**
**Noever, D., Ciolino, M., & Kalin, J. (2020).**  
*The Chess Transformer: Mastering Play using Generative Language Models.*  
[https://arxiv.org/abs/2008.04057](https://arxiv.org/abs/2008.04057)

In [9]:
import re
from pathlib import Path

def _clean_game(game):
	"""Turn one chunk of PGN movetext into a single ``[Result R] moves`` line.

	Returns ``None`` when the chunk does not end in a result token
	(``1-0``, ``0-1``, ``1/2-1/2`` or ``*``) preceded by at least one
	other character, so empty chunks and chunks without a result are dropped.
	"""
	# Compact multiple spaces and newlines
	clean = re.sub(r'\s+', ' ', game).strip()

	# Remove move numbers. `\.+` covers both "12." and the black-to-move
	# form "12..." (seen after a stripped comment or variation); matching a
	# single dot would leave a stray ".." token in the output.
	clean = re.sub(r'\b\d+\.+\s*', '', clean)

	# Detect the result at the end and move it to the beginning
	clean = re.sub(r'^(.+?)\s(1-0|0-1|1/2-1/2|\*)$', r'[Result \2] \1', clean)

	# Only keep games that have moves and a result
	return clean if clean.startswith("[Result") else None


def preprocess_pgn_file(input_path, output_path):
	"""Convert a PGN file into a text file with one game per line.

	Tag pairs, ``{...}`` / ``(...)`` / ``;`` comments and move numbers are
	stripped, and each game is written as ``[Result R] move move ...``.
	Chunks without a trailing result are skipped.

	Args:
		input_path: Path (``str`` or ``Path``) of the PGN file to read.
		output_path: Path (``str`` or ``Path``) of the text file to create.
			If it already exists, nothing is read or written.

	Returns:
		None. A one-line status message is printed.
	"""
	# Accept plain strings as well as Path objects.
	input_path = Path(input_path)
	output_path = Path(output_path)

	if output_path.exists():
		print(f"‚ö†Ô∏è {output_path.name} already exists, skipping...")
		return

	# The whole file is held in memory while the regexes below run.
	with open(input_path, encoding="utf-8", errors="ignore") as f:
		text = f.read()

	# Remove PGN tags ([Event], [Date], etc.)
	text = re.sub(r'\[.*?\]', '', text)

	# Remove comments {...}, (...) and ;
	text = re.sub(r'\{[^}]*\}', '', text)
	text = re.sub(r'\([^)]*\)', '', text)
	text = re.sub(r';[^\n]*', '', text)

	# Split each game (games are separated by blank lines)
	games = re.split(r'\n\s*\n', text)

	# Write to a temporary sibling and rename on success, so an interrupted
	# run never leaves a truncated file that the exists() guard would skip.
	tmp_path = output_path.with_name(output_path.name + ".tmp")
	saved = 0

	try:
		with open(tmp_path, "w", encoding="utf-8") as out:
			for game in games:
				line = _clean_game(game)
				if line is not None:
					out.write(line + "\n")
					saved += 1
		tmp_path.replace(output_path)
	finally:
		# Only still present when the write or the rename failed.
		if tmp_path.exists():
			tmp_path.unlink()

	print(f"‚úÖ {input_path.name} ‚Üí {output_path.name} ({saved} games saved)")

def process_all_pgn_files(input_directory, output_directory):
	"""Preprocess every ``*.pgn`` file in a directory.

	Files are handled in sorted name order; each one is passed to
	``preprocess_pgn_file`` together with a same-named ``.txt`` target
	inside ``output_directory``.

	Args:
		input_directory: Directory searched (non-recursively) for ``*.pgn``.
		output_directory: Directory for the ``.txt`` files; created with
			any missing parents if it does not exist.
	"""
	input_path = Path(input_directory)
	output_path = Path(output_directory)

	output_path.mkdir(parents=True, exist_ok=True)

	for pgn_file in sorted(input_path.glob("*.pgn")):
		# with_suffix swaps only the final extension; str.replace would also
		# rewrite any ".pgn" appearing earlier in the file name.
		output_file = output_path / pgn_file.with_suffix(".txt").name
		preprocess_pgn_file(pgn_file, output_file)

In [10]:
# Convert every raw PGN in the database folder into a one-game-per-line .txt file.
process_all_pgn_files(
	input_directory="data/chess/lichess-elite-database",
	output_directory="data/chess/lichess-elite-dataset",
)

‚ö†Ô∏è lichess_elite_2014-01.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2014-02.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2014-03.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2014-04.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2014-05.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2014-06.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2014-07.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2014-08.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2014-09.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2014-10.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2014-11.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2014-12.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2015-01.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2015-02.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2015-03.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_2015-04.txt already exists, skipping...
‚ö†Ô∏è lichess_elite_201

In [None]:
def count_games_in_file(file_path):
	"""Return how many lines of ``file_path`` begin with ``[Result``.

	The file is read as UTF-8 with undecodable bytes ignored.
	"""
	with open(file_path, encoding="utf-8", errors="ignore") as handle:
		return sum(1 for row in handle if row.startswith("[Result"))

def total_games_in_directory(directory_path):
	"""Sum ``count_games_in_file`` over every ``*.txt`` file in a directory.

	Only the directory itself is searched (no recursion); returns 0 when
	no ``.txt`` file is found.
	"""
	txt_files = sorted(Path(directory_path).glob("*.txt"))
	return sum(count_games_in_file(txt_file) for txt_file in txt_files)

In [8]:
# Add up the games across every preprocessed .txt file in the dataset folder.
total = total_games_in_directory(directory_path="data/chess/lichess-elite-dataset")
print(f"üéâ Total games processed: {total}")

üéâ Total games processed: 27014886
