In [2]:
import pandas as pd
import glob
import os

def combine_csv_files_with_unique_gameid(input_directory, output_file):
    """
    Combine every ``twic*_analyzed.csv`` file in a directory into one CSV,
    shifting each file's ``GameID`` values so they never collide across files.

    Files are processed in sorted filename order. The first file keeps its
    GameIDs unchanged; each later file is shifted by a constant so that its
    smallest GameID lands on the first ID not yet used by earlier files.
    For files whose IDs are a dense run starting at the same base (e.g. 1..n)
    this yields exactly the same numbering as offsetting by the running count
    of games, but it stays collision-free when a file has gaps in its IDs.

    Files without a ``GameID`` column are skipped with a message. Any exception
    raised while reading or adjusting a file is reported and that file is
    skipped (best-effort). If nothing could be combined, no output is written.

    Parameters:
    - input_directory (str): The directory containing the CSV files to combine.
    - output_file (str): The path to the output CSV file. Its parent directory
      is created if it does not exist yet.
    """
    # "twic*_analyzed.csv" also matches names such as "twic1450_15_analyzed.csv"
    # because "*" matches underscores too, so a single pattern covers both
    # naming schemes. Sorting keeps the processing order deterministic.
    csv_files = sorted(glob.glob(os.path.join(input_directory, "twic*_analyzed.csv")))

    df_list = []
    total_games_processed = 0  # Running count of unique games, used for reporting
    next_free_id = None        # First GameID not used so far; None until a file with IDs is seen

    for file in csv_files:
        try:
            df = pd.read_csv(file)

            # Check if 'GameID' column exists
            if 'GameID' not in df.columns:
                print(f"'GameID' column not found in {file}. Skipping file.")
                continue

            # Get the number of unique games in this file
            unique_game_ids = df['GameID'].nunique()

            if unique_game_ids:
                # Shift the whole file by one constant so relative IDs inside
                # the file are preserved. Offsetting by the *count* of earlier
                # games would collide whenever a file's IDs have gaps.
                if next_free_id is None:
                    offset = 0
                else:
                    offset = next_free_id - df['GameID'].min()
                df['GameID'] += offset
                next_free_id = df['GameID'].max() + 1

            total_games_processed += unique_game_ids

            df_list.append(df)
            print(f"Processed {file}: {unique_game_ids} games.")
        except Exception as e:
            # Deliberately best-effort: report the problem and carry on.
            print(f"Failed to read {file}: {e}")

    # Concatenate all dataframes
    if df_list:
        combined_df = pd.concat(df_list, ignore_index=True)

        # Make sure the destination folder exists before writing.
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        combined_df.to_csv(output_file, index=False)
        print(f"Combined {len(df_list)} files into {output_file} with {total_games_processed} unique games.")
    else:
        print("No files were combined.")

# Example usage: merge the analyzed CSVs into a single file.
output_file = "../huge_analyzed_games/combined_analyzed_games.csv"  # Destination of the merged CSV
input_directory = "../Analyzed_Games"  # Replace with your directory containing the CSV files
combine_csv_files_with_unique_gameid(
    input_directory=input_directory,
    output_file=output_file,
)


Processed ../Analyzed_Games\twic1450_15_analyzed.csv: 6322 games.
Processed ../Analyzed_Games\twic1451_15_analyzed.csv: 6290 games.
Processed ../Analyzed_Games\twic1452_15_analyzed.csv: 5653 games.
Processed ../Analyzed_Games\twic1453_15_analyzed.csv: 6706 games.
Processed ../Analyzed_Games\twic1454_15_analyzed.csv: 6405 games.
Processed ../Analyzed_Games\twic1455_15_analyzed.csv: 5747 games.
Processed ../Analyzed_Games\twic1456_15_analyzed.csv: 5287 games.
Processed ../Analyzed_Games\twic1500_analyzed.csv: 7964 games.
Processed ../Analyzed_Games\twic1501_analyzed.csv: 8468 games.
Processed ../Analyzed_Games\twic1502_analyzed.csv: 10464 games.
Processed ../Analyzed_Games\twic1503_analyzed.csv: 10529 games.
Processed ../Analyzed_Games\twic1504_analyzed.csv: 7530 games.
Processed ../Analyzed_Games\twic1505_analyzed.csv: 8640 games.
Processed ../Analyzed_Games\twic1506_analyzed.csv: 7721 games.
Processed ../Analyzed_Games\twic1507_analyzed.csv: 7381 games.
Processed ../Analyzed_Games\twic

  df = pd.read_csv(file)


Processed ../Analyzed_Games\twic1549_15_analyzed.csv: 11234 games.
Processed ../Analyzed_Games\twic1550_14_analyzed.csv: 12430 games.
Processed ../Analyzed_Games\twic1551_14_analyzed.csv: 10532 games.
Processed ../Analyzed_Games\twic1552_14_analyzed.csv: 10544 games.
Processed ../Analyzed_Games\twic1553_14_analyzed.csv: 10947 games.
Processed ../Analyzed_Games\twic1554_14_analyzed.csv: 12380 games.
Processed ../Analyzed_Games\twic1555_14_analyzed.csv: 11363 games.
Processed ../Analyzed_Games\twic1556_15_analyzed.csv: 10133 games.
Processed ../Analyzed_Games\twic1557_15_analyzed.csv: 8501 games.
Processed ../Analyzed_Games\twic1558_15_analyzed.csv: 10086 games.
Processed ../Analyzed_Games\twic1559_15_analyzed.csv: 10090 games.
Processed ../Analyzed_Games\twic1560_15_analyzed.csv: 7721 games.
Combined 68 files into ../huge_analyzed_games/combined_analyzed_games.csv with 565611 unique games.


In [1]:
import os

def combine_pgn_files(input_directory, output_file, start_num=920, end_num=1560):
    """
    Combines multiple PGN files into a single PGN file.

    For every number in ``start_num..end_num`` (inclusive) the file
    ``twic{num}.pgn`` inside ``input_directory`` is appended to ``output_file``
    if it exists, followed by a blank-line separator. Missing files are
    reported and skipped; a file that cannot be read is reported and skipped
    without aborting the run.

    Inputs are decoded with ``utf-8-sig`` so that a UTF-8 byte-order mark at
    the start of an input file is dropped instead of being copied into the
    middle of the combined file, where it would sit in front of an ``[Event``
    tag and hide that game from line-prefix based parsers.

    Parameters:
    - input_directory (str): Directory where the PGN files are located.
    - output_file (str): Path to the combined PGN output file. Its parent
      directory is created if it does not exist yet.
    - start_num (int): Starting number in the file names.
    - end_num (int): Ending number in the file names.
    """
    # Make sure the destination folder exists before opening the output.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for num in range(start_num, end_num + 1):
            filename = os.path.join(input_directory, f"twic{num}.pgn")
            if not os.path.exists(filename):
                print(f"File {filename} does not exist. Skipping.")
                continue
            try:
                # Read the whole file first: a decoding error then surfaces
                # before anything from this file has been written.
                with open(filename, 'r', encoding='utf-8-sig') as infile:
                    contents = infile.read()
                outfile.write(contents)
                # Ensure separation between games from different files
                outfile.write('\n\n')
                print(f"Added {filename}")
            except Exception as e:
                print(f"Failed to read {filename}: {e}")
    print(f"Combined PGN files into {output_file}")

# Example usage: concatenate twic920.pgn .. twic1560.pgn (the function defaults).
output_file = "../combined_games.pgn"  # The output file where combined PGNs will be saved
input_directory = "../utf8_games"  # Replace with the path to your PGN files
combine_pgn_files(
    input_directory=input_directory,
    output_file=output_file,
)


Added ../utf8_games\twic920.pgn
Added ../utf8_games\twic921.pgn
Added ../utf8_games\twic922.pgn
Added ../utf8_games\twic923.pgn
Added ../utf8_games\twic924.pgn
Added ../utf8_games\twic925.pgn
Added ../utf8_games\twic926.pgn
Added ../utf8_games\twic927.pgn
Added ../utf8_games\twic928.pgn
Added ../utf8_games\twic929.pgn
Added ../utf8_games\twic930.pgn
Added ../utf8_games\twic931.pgn
Added ../utf8_games\twic932.pgn
Added ../utf8_games\twic933.pgn
Added ../utf8_games\twic934.pgn
Added ../utf8_games\twic935.pgn
Added ../utf8_games\twic936.pgn
Added ../utf8_games\twic937.pgn
Added ../utf8_games\twic938.pgn
Added ../utf8_games\twic939.pgn
Added ../utf8_games\twic940.pgn
Added ../utf8_games\twic941.pgn
Added ../utf8_games\twic942.pgn
Added ../utf8_games\twic943.pgn
Added ../utf8_games\twic944.pgn
Added ../utf8_games\twic945.pgn
Added ../utf8_games\twic946.pgn
Added ../utf8_games\twic947.pgn
Added ../utf8_games\twic948.pgn
Added ../utf8_games\twic949.pgn
Added ../utf8_games\twic950.pgn
Added ..

In [3]:
def count_games_in_pgn_file(pgn_file):
    """
    Count the games in a PGN file by counting lines that begin with ``[Event ``.

    The file is decoded with ``utf-8-sig`` so a byte-order mark at the very
    start does not hide the first game. An ``[Event `` tag directly preceded
    by a stray BOM inside the file (as left behind when BOM-prefixed files are
    concatenated) is counted as well.

    Parameters:
    - pgn_file (str): Path to the PGN file to scan.

    Returns:
    - int: Number of lines recognised as the start of a game. The total is
      also printed.
    """
    # Accepted line prefixes: the plain tag, and the tag preceded by a BOM.
    game_starts = ('[Event ', '\ufeff[Event ')
    count = 0
    with open(pgn_file, 'r', encoding='utf-8-sig') as f:
        for line in f:
            if line.startswith(game_starts):
                count += 1
    print(f"Total number of games in {pgn_file}: {count}")
    return count

# Example usage: count the games in the combined PGN; the bare call
# leaves the returned count as the cell's displayed value.
pgn_file = "../combined_games.pgn"
count_games_in_pgn_file(pgn_file=pgn_file)


Total number of games in ../combined_games.pgn: 2806708


2806708