In [None]:
import pandas as pd
import glob
import os

### Concatenate Game Files

Concatenates all analyzed and cleaned game files into one big game file, modifying GameIDs so that they stay unique. 

As an example and to limit CPU usage, only files with depth 20 are used here. 

In [None]:
def combine_csv_files_with_unique_gameid(input_directory, output_file):
    """
    Combines all analyzed CSV files in the specified directory into a single CSV file,
    adjusting the GameID to ensure uniqueness across all files.

    The first file keeps its GameIDs. Every later file is shifted so that its
    smallest GameID lands directly after the largest GameID assigned so far.
    For files whose IDs are contiguous (1..n or 0..n-1) this yields the same
    numbering as offsetting by the running game count, but it also stays
    collision-free when a file's IDs have gaps (e.g. games dropped during
    cleaning).

    Files without a 'GameID' column, and files that raise while being read or
    re-numbered, are reported on stdout and skipped.

    Parameters:
    - input_directory (str): The directory containing the CSV files to combine.
    - output_file (str): The path to the output CSV file.

    Returns:
    - None. The combined table is written to ``output_file``; nothing is
      written when no file could be combined.
    """
    # Define patterns to match your file names
    patterns = [
        # Broader pattern, currently disabled:
        #os.path.join(input_directory, "twic*_analyzed.csv"),
        os.path.join(input_directory, "twic*_20_processed.csv"),
    ]

    # Collect all matching files once each, ordered by name so the resulting
    # GameID numbering is reproducible between runs.
    csv_files = sorted(
        {path for pattern in patterns for path in glob.glob(pattern)}
    )

    df_list = []
    total_games_processed = 0  # Number of unique games across accepted files
    max_assigned_id = None     # Largest GameID already handed out, if any

    for file in csv_files:
        # Per-file boundary: any failure is reported and the file is skipped
        # so one bad file does not abort the whole merge.
        try:
            df = pd.read_csv(file)

            # Check if 'GameID' column exists
            if 'GameID' not in df.columns:
                print(f"'GameID' column not found in {file}. Skipping file.")
                continue

            game_ids = df['GameID']
            unique_game_ids = game_ids.nunique()

            if unique_game_ids == 0 or max_assigned_id is None:
                # Nothing to shift against yet (or no usable IDs in this file).
                offset = 0
            else:
                # Move this file's smallest ID to just past the largest ID
                # used so far; counting games alone is not enough when the
                # IDs inside a file are not contiguous.
                offset = max_assigned_id + 1 - game_ids.min()

            # The addition also rejects non-numeric GameIDs (raises, and the
            # file is skipped by the handler below).
            df['GameID'] = game_ids + offset

            if unique_game_ids:
                max_assigned_id = df['GameID'].max()
            total_games_processed += unique_game_ids

            df_list.append(df)
            print(f"Processed {file}: {unique_game_ids} games.")
        except Exception as e:
            print(f"Failed to read {file}: {e}")

    # Concatenate all dataframes
    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)
        combined_df.to_csv(output_file, index=False)
        print(f"Combined {len(df_list)} files into {output_file} with {total_games_processed} unique games.")
    else:
        print("No files were combined.")

In [None]:
# Directory searched for the "twic*_20_processed.csv" input files.
input_directory = "../Cleaned_Analyzed_Games" 
# Destination of the merged CSV. NOTE: the function does not create the
# parent directory, so "../huge_analyzed_games" has to exist beforehand.
output_file = "../huge_analyzed_games/combined_analyzed_games_20.csv"
# Merge every matching CSV into output_file, shifting GameIDs per file.
combine_csv_files_with_unique_gameid(input_directory, output_file)

In [None]:
def combine_pgn_files(input_directory, output_file, start_num=920, end_num=1560):
    """
    Combines multiple PGN files into a single PGN file.

    Files are looked up as ``twic<num>.pgn`` inside ``input_directory`` for
    every ``num`` from ``start_num`` to ``end_num`` (inclusive) and appended
    in that order, each followed by a blank-line separator. Missing or
    unreadable input files are reported on stdout and skipped.

    Parameters:
    - input_directory (str): Directory where the PGN files are located.
    - output_file (str): Path to the combined PGN output file.
    - start_num (int): Starting number in the file names.
    - end_num (int): Ending number in the file names.

    Raises:
    - OSError: If the output file cannot be opened or written. Only problems
      with the *input* files are treated as skippable.
    """
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for num in range(start_num, end_num + 1):
            filename = os.path.join(input_directory, f"twic{num}.pgn")
            try:
                # 'utf-8-sig' drops a leading byte-order mark when present and
                # is otherwise identical to 'utf-8'. Without it, a BOM would
                # end up in the middle of the combined file, right in front of
                # that file's first "[Event " tag.
                with open(filename, 'r', encoding='utf-8-sig') as infile:
                    # Read the whole file before writing anything so a decode
                    # error never leaves a partial game in the output.
                    contents = infile.read()
            except FileNotFoundError:
                print(f"File {filename} does not exist. Skipping.")
                continue
            except (OSError, UnicodeError) as e:
                print(f"Failed to read {filename}: {e}")
                continue

            outfile.write(contents)
            # Ensure separation between games from different files
            outfile.write('\n\n')
            print(f"Added {filename}")
    print(f"Combined PGN files into {output_file}")

# Example usage:
# NOTE: these assignments rebind the module-level input_directory/output_file
# names that the CSV-combining cell earlier in the file also sets.
input_directory = "../utf8_games"  # Replace with the path to your PGN files
output_file = "../combined_games.pgn"          # The output file where combined PGNs will be saved
# Uses the default range twic920.pgn .. twic1560.pgn; missing files are skipped.
combine_pgn_files(input_directory, output_file)


In [None]:
def count_games_in_pgn_file(pgn_file):
    """
    Counts the games in a PGN file and prints the total.

    A game is counted for every line that begins with the ``[Event `` tag.
    A byte-order mark (U+FEFF) directly in front of the tag is ignored, so a
    game is still counted when the file starts with a BOM or when a BOM was
    carried into the middle of the file by concatenating several files.

    Parameters:
    - pgn_file (str): Path to the UTF-8 encoded PGN file.

    Returns:
    - int: The number of ``[Event `` lines found.
    """
    with open(pgn_file, 'r', encoding='utf-8') as f:
        # Strip any leading BOM characters before matching; otherwise the
        # affected "[Event " line would silently be missed.
        count = sum(
            1 for line in f if line.lstrip('\ufeff').startswith('[Event ')
        )
    print(f"Total number of games in {pgn_file}: {count}")
    return count

# Example usage:
# Counts the games in "../combined_games.pgn", the same path the PGN-combining
# example above writes to.
pgn_file = "../combined_games.pgn"
count_games_in_pgn_file(pgn_file)
