In [32]:
def count_games_in_pgn(pgn_file):
    game_count = 0
    with open(pgn_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.startswith("[Event "):  # Each game starts with the "[Event ..." tag
                game_count += 1
    return game_count

# Example usage:
pgn_file = "../combined_games.pgn"  # The combined PGN file
num_games = count_games_in_pgn(pgn_file)

print(f"The combined PGN file contains {num_games} games.")


The combined PGN file contains 2817287 games.


In [3]:
pip install chardet

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
Installing collected packages: chardet
Successfully installed chardet-5.2.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
# detect_encoding.py

import chardet

def detect_file_encoding(file_path, num_bytes=100000):
    """
    Detects the encoding of a file using chardet.

    Parameters:
    - file_path (str): Path to the file.
    - num_bytes (int): Number of bytes to read for detection.

    Returns:
    - str: Detected encoding or 'utf-8' as default.
    """
    try:
        with open(file_path, 'rb') as f:
            rawdata = f.read(num_bytes)
        result = chardet.detect(rawdata)
        encoding = result['encoding']
        confidence = result['confidence']
        print(f"Detected encoding for {file_path}: {encoding} with confidence {confidence}")
        return encoding if encoding else 'utf-8'
    except Exception as e:
        print(f"Failed to detect encoding for {file_path}: {e}")
        return 'utf-8'

detect_file_encoding("../huge1pgn.pgn")

Detected encoding for ../huge1pgn.pgn: ascii with confidence 1.0


'ascii'

In [18]:
def reencode_file_to_utf8(input_path, output_path, original_encoding='latin-1'):
    try:
        with open(input_path, 'r', encoding=original_encoding, errors='replace') as infile:
            content = infile.read()
        with open(output_path, 'w', encoding='utf-8', errors='replace') as outfile:
            outfile.write(content)
        print(f"Re-encoded {input_path} from {original_encoding} to UTF-8 and saved as {output_path}")
    except Exception as e:
        print(f"Failed to re-encode {input_path}: {e}")

In [22]:
reencode_file_to_utf8('../Games/twic1503.pgn', '../utf8_games/twic1503.pgn', original_encoding='latin-1')

Re-encoded ../Games/twic1503.pgn from latin-1 to UTF-8 and saved as ../Games/twic1503_utf8.pgn


In [27]:
import os
import logging
from pathlib import Path
def setup_logging():
    """
    Sets up logging to file and console.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # Create handlers
    c_handler = logging.StreamHandler()
    f_handler = logging.FileHandler('reencode.log')

    c_handler.setLevel(logging.INFO)
    f_handler.setLevel(logging.INFO)

    # Create formatters and add them to handlers
    c_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    f_format = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

    c_handler.setFormatter(c_format)
    f_handler.setFormatter(f_format)

    # Add handlers to the logger
    logger.addHandler(c_handler)
    logger.addHandler(f_handler)
    
    
def main():
    # Set up logging
    setup_logging()

    # Define the input and output directories
    input_dir = Path('../huge_games')
    output_dir = Path('../utf8_huge_games')

    # Ensure the output directory exists
    if not output_dir.exists():
        try:
            output_dir.mkdir(parents=True, exist_ok=True)
            logging.info(f"Created output directory: '{output_dir}'")
        except Exception as e:
            logging.error(f"Failed to create output directory '{output_dir}': {e}")
            return

    # Iterate through all .pgn files in the input directory
    pgn_files = list(input_dir.glob('*.pgn'))
    if not pgn_files:
        logging.warning(f"No PGN files found in '{input_dir}'. Exiting.")
        return

    logging.info(f"Found {len(pgn_files)} PGN file(s) in '{input_dir}'. Starting re-encoding...")

    for pgn_file in pgn_files:
        # Define the output file path
        output_file = output_dir / pgn_file.name

        # Check if the output file already exists to prevent overwriting
        if output_file.exists():
            logging.warning(f"Output file '{output_file}' already exists. Skipping re-encoding for this file.")
            continue

        # Re-encode the file
        reencode_file_to_utf8(pgn_file, output_file, original_encoding='latin-1')

    logging.info("Re-encoding process completed.")

if __name__ == "__main__":
    main()

2024-10-07 15:16:11,882 - INFO - Created output directory: '..\utf8_huge_games'
2024-10-07 15:16:11,882 - INFO - Created output directory: '..\utf8_huge_games'
2024-10-07 15:16:11,884 - INFO - Found 4 PGN file(s) in '..\huge_games'. Starting re-encoding...
2024-10-07 15:16:11,884 - INFO - Found 4 PGN file(s) in '..\huge_games'. Starting re-encoding...


Re-encoded ..\huge_games\combined_games.pgn from latin-1 to UTF-8 and saved as ..\utf8_huge_games\combined_games.pgn
Re-encoded ..\huge_games\corr2011pgn.pgn from latin-1 to UTF-8 and saved as ..\utf8_huge_games\corr2011pgn.pgn
Re-encoded ..\huge_games\huge1pgn.pgn from latin-1 to UTF-8 and saved as ..\utf8_huge_games\huge1pgn.pgn


2024-10-07 15:16:29,314 - INFO - Re-encoding process completed.
2024-10-07 15:16:29,314 - INFO - Re-encoding process completed.


Re-encoded ..\huge_games\huge2pgn.pgn from latin-1 to UTF-8 and saved as ..\utf8_huge_games\huge2pgn.pgn
