### Cleanup all games

This code reads all analyzed games and removes those that are not acceptable. A game is rejected if any of the following criteria applies:
- If either player does not have a FIDE ID, that player is probably a bot and the game is rejected
- Games with fewer than 15 moves that end in a draw are most likely arranged draws and are thus rejected
- Games with no moves are rejected
- Games with no recorded result are rejected

The remaining games are saved in new files with the suffix `_processed.csv`


In [3]:
import glob
import os
import re
import anal_games
import functions_anal

# Folder that receives the cleaned CSV files. It is created up front, before
# any output is written; exist_ok makes this a no-op when it already exists.
output_dir = "../Cleaned_Analyzed_Games"
os.makedirs(name=output_dir, exist_ok=True)

# Discover the analyzed TWIC files and pick exactly one file per issue.
#
# Each issue may exist in up to three variants, distinguished by the numeric
# suffix in ``twic<issue>_<variant>_analyzed.csv`` (15, 16 or 20). When several
# variants of the same issue exist, '_20' wins over '_16', which wins over '_15'.


def index_by_issue(paths, variant):
    """Map TWIC issue number to file path for one variant suffix.

    Args:
        paths: Iterable of file paths (typically the result of ``glob.glob``).
        variant: The numeric suffix to accept (e.g. ``15``, ``16`` or ``20``).

    Returns:
        A dict whose keys are the issue numbers as strings (exactly as they
        appear in the filename) and whose values are the matching paths.
        Paths whose basename does not start with
        ``twic<digits>_<variant>_analyzed.csv`` are ignored.
    """
    pattern = re.compile(rf'twic(\d+)_{variant}_analyzed\.csv')
    indexed = {}
    for path in paths:
        found = pattern.match(os.path.basename(path))
        if found:
            indexed[found.group(1)] = path
    return indexed


def pick_preferred(*indexes):
    """Return one path per issue, taken from the first index that contains it.

    Args:
        *indexes: Dicts as produced by ``index_by_issue``, listed from the
            most preferred variant to the least preferred one.

    Returns:
        A list of paths ordered by issue number. The ordering is numeric, so
        issue "999" comes before issue "1200"; a plain string sort would put
        it after. The string itself is the tie-breaker, which keeps the order
        deterministic for keys such as "0999" and "999".
    """
    issues = set().union(*indexes)
    ordered = sorted(issues, key=lambda issue: (int(issue), issue))
    return [
        next(index[issue] for index in indexes if issue in index)
        for issue in ordered
    ]


# Get the list of filenames matching the patterns
filenames_15 = glob.glob("../Analyzed_Games/twic*_15_analyzed.csv")
filenames_16 = glob.glob("../Analyzed_Games/twic*_16_analyzed.csv")
filenames_20 = glob.glob("../Analyzed_Games/twic*_20_analyzed.csv")

# Create dictionaries to map issue numbers to filenames
filenames_15_dict = index_by_issue(filenames_15, 15)
filenames_16_dict = index_by_issue(filenames_16, 16)
filenames_20_dict = index_by_issue(filenames_20, 20)

# Every issue number that exists in at least one variant
all_numbers = set(filenames_15_dict) | set(filenames_16_dict) | set(filenames_20_dict)

# One file per issue: prefer '_20', then '_16', then '_15'
filenames_to_process = pick_preferred(
    filenames_20_dict,
    filenames_16_dict,
    filenames_15_dict,
)

# Clean every selected file and write the result into the output directory.
for analyzed_path in filenames_to_process:
    # The output keeps the input's name, with the "_analyzed.csv" ending
    # swapped for "_processed.csv".
    analyzed_name = os.path.basename(analyzed_path)
    processed_name = analyzed_name.replace("_analyzed.csv", "_processed.csv")
    output_file = os.path.join(output_dir, processed_name)

    # Optional resume support (currently disabled): uncomment to skip inputs
    # whose output file has already been written.
    # if os.path.isfile(output_file):
    #     print(f"Output file {output_file} already exists. Skipping {analyzed_path}.")
    #     continue

    print(f"Processing {analyzed_path}...")

    # Run the cleanup over the whole file. Per the original notes, Cleanup
    # returns None for a game that is not acceptable, and game_wise=False
    # retains the move data (semantics live in anal_games / functions_anal).
    df = anal_games.process_one_file(
        filename=analyzed_path,
        functions=[functions_anal.Cleanup],
        game_wise=False,
    )

    # Persist the cleaned data without the index column.
    df.to_csv(output_file, index=False)
    print(f"Saved processed data to {output_file}")


Output file ../Cleaned_Analyzed_Games\twic1200_16_processed.csv already exists. Skipping ../Analyzed_Games\twic1200_16_analyzed.csv.
Output file ../Cleaned_Analyzed_Games\twic1201_16_processed.csv already exists. Skipping ../Analyzed_Games\twic1201_16_analyzed.csv.
Output file ../Cleaned_Analyzed_Games\twic1202_16_processed.csv already exists. Skipping ../Analyzed_Games\twic1202_16_analyzed.csv.
Output file ../Cleaned_Analyzed_Games\twic1203_16_processed.csv already exists. Skipping ../Analyzed_Games\twic1203_16_analyzed.csv.
Output file ../Cleaned_Analyzed_Games\twic1204_16_processed.csv already exists. Skipping ../Analyzed_Games\twic1204_16_analyzed.csv.
Output file ../Cleaned_Analyzed_Games\twic1205_16_processed.csv already exists. Skipping ../Analyzed_Games\twic1205_16_analyzed.csv.
Output file ../Cleaned_Analyzed_Games\twic1206_16_processed.csv already exists. Skipping ../Analyzed_Games\twic1206_16_analyzed.csv.
Output file ../Cleaned_Analyzed_Games\twic1207_16_processed.csv alrea