In [3]:
import pandas as pd
import os

def analyze_csv_column_differences(reference_file_path, folder_to_check_path):
    """
    Compares CSV files within a given folder against a reference file
    for differences in column count and column names.

    Args:
        reference_file_path (str): The path to the CSV file serving as the pattern.
        folder_to_check_path (str): The path to the folder containing CSV files to compare.

    Returns:
        tuple: A tuple containing:
            - int: The count of files with a different number of columns.
            - int: The count of files with different column names (regardless of column count).
            - list: A list of paths to files with a different number of columns.
            - list: A list of paths to files with different column names.
    """
    try:
        reference_dataframe = pd.read_csv(reference_file_path)
        reference_columns = set(reference_dataframe.columns)
        reference_column_count = len(reference_columns)
    except FileNotFoundError:
        print(f"Error: Reference file '{reference_file_path}' not found.")
        return 0, 0, [], []
    except Exception as e:
        print(f"Error loading reference file '{reference_file_path}': {e}")
        return 0, 0, [], []

    diff_column_count = 0
    diff_column_names = 0
    files_with_diff_column_count = []
    files_with_diff_column_names = []

    for file_name in os.listdir(folder_to_check_path):
        if file_name.endswith(".csv"):
            current_file_path = os.path.join(folder_to_check_path, file_name)
            try:
                current_dataframe = pd.read_csv(current_file_path)
                current_columns = set(current_dataframe.columns)
                current_column_count = len(current_columns)

                if current_column_count != reference_column_count:
                    diff_column_count += 1
                    files_with_diff_column_count.append(current_file_path)

                if current_columns != reference_columns:
                    diff_column_names += 1
                    files_with_diff_column_names.append(current_file_path)

            except Exception as e:
                print(f"Error loading file '{current_file_path}': {e}")
                continue

    return diff_column_count, diff_column_names, files_with_diff_column_count, files_with_diff_column_names


# Example usage:
reference_csv = 'data_ps/player_stats_Arsenal_2024.csv'  # Change to the path of your reference CSV file
target_folder = 'data_ps' # Change to the path of your folder containing CSVs to check

column_count_diffs, column_name_diffs, files_with_count_diff, files_with_name_diff = analyze_csv_column_differences(reference_csv, target_folder)

print(f"\nNumber of files in '{target_folder}' with a different column count than '{reference_csv}': {column_count_diffs}")
print(f"Number of files in '{target_folder}' with different column names than '{reference_csv}': {column_name_diffs}")

if files_with_count_diff:
    print("\nFiles with different column counts:")
    for file in files_with_count_diff:
        # remove this file
        os.remove(file)
        print(f"- {file}")

if files_with_name_diff:
    print("\nFiles with different column names:")
    for file in files_with_name_diff:
        print(f"- {file}")


Number of files in 'data_ps' with a different column count than 'data_ps/player_stats_Arsenal_2024.csv': 94
Number of files in 'data_ps' with different column names than 'data_ps/player_stats_Arsenal_2024.csv': 678

Files with different column counts:
- data_ps/player_stats_Mallorca_2020.csv
- data_ps/player_stats_Lens_2017.csv
- data_ps/player_stats_Heidenheim_2017.csv
- data_ps/player_stats_Le Havre_2022.csv
- data_ps/player_stats_Reims_2017.csv
- data_ps/player_stats_Parma_2017.csv
- data_ps/player_stats_Le Havre_2020.csv
- data_ps/player_stats_Lecce_2017.csv
- data_ps/player_stats_Le Havre_2021.csv
- data_ps/player_stats_Le Havre_2019.csv
- data_ps/player_stats_Brentford_2017.csv
- data_ps/player_stats_Monza_2018.csv
- data_ps/player_stats_Ipswich Town_2017.csv
- data_ps/player_stats_Brest_2018.csv
- data_ps/player_stats_Monza_2019.csv
- data_ps/player_stats_Le Havre_2018.csv
- data_ps/player_stats_Mallorca_2018.csv
- data_ps/player_stats_St. Pauli_2021.csv
- data_ps/player_stats_