In [None]:
import pandas as pd
import os

# Folder containing the monthly STMT CSV exports, and the combined file written below.
folder_path = r'data/1- Raw Data/STMT'  # Replace with the actual path to your folder
output_file = os.path.join(folder_path, 'full_stmt_dataset.csv')  # Final output file

# Files this notebook writes into the very folder it scans. They must never be
# picked up as monthly inputs on a re-run: the first is this cell's own output,
# the second is written by the follow-up postal-code cell of this notebook.
generated_files = {os.path.basename(output_file), 'full_stmt_dataset_cleaned.csv'}


def list_monthly_csv_files(folder, exclude=()):
    """Return the sorted names of the ``.csv`` files in *folder*.

    Names listed in *exclude* are skipped. Sorting makes the row order of the
    combined dataset reproducible (``os.listdir`` returns an arbitrary order).
    """
    skipped = set(exclude)
    return sorted(
        name for name in os.listdir(folder)
        if name.endswith('.csv') and name not in skipped
    )


def month_from_filename(file_name):
    """Return the first word of *file_name*, ignoring the file extension.

    Filenames are assumed to look like "Month ...". Dropping the extension
    first keeps a space-less name such as "Mars.csv" from yielding "Mars.csv".
    """
    stem = os.path.splitext(file_name)[0]
    return stem.split(" ")[0]


def load_monthly_csv(folder, file_name):
    """Read one monthly export and tag every row with its month.

    The first three lines of the file are skipped before the header row
    (presumably a report preamble - confirm against a raw export). The month
    taken from the filename is inserted as the first column, 'Mois'.

    Raises whatever ``pd.read_csv`` raises, and ``ValueError`` if the file
    already has a 'Mois' column.
    """
    frame = pd.read_csv(
        os.path.join(folder, file_name),
        skiprows=3,
        delimiter=',',
        encoding='utf-8',
        skip_blank_lines=True,
    )
    frame.insert(0, 'Mois', month_from_filename(file_name))  # 'Mois' = month (French)
    return frame


def clean_combined_frame(frame):
    """Return a cleaned copy of the concatenated dataset.

    Steps, in order:
    1. Strip column names and collapse inner runs of whitespace to one space.
    2. If a fused 'Mois,Commune de plus de 5000 hab.' column exists, split it
       at the first comma into 'Mois' and 'Commune de plus de 5000 hab'.
    3. In 'Total', drop trailing commas and then surrounding whitespace from
       string cells. Values are NOT converted to numbers.

    The input frame is left unmodified.
    """
    cleaned = frame.copy()
    cleaned.columns = cleaned.columns.str.strip().str.replace(r'\s+', ' ', regex=True)

    # Rename the fused column to a simpler working name.
    if 'Mois,Commune de plus de 5000 hab.' in cleaned.columns:
        cleaned = cleaned.rename(
            columns={'Mois,Commune de plus de 5000 hab.': 'Mois_Commune'}
        )

    if 'Mois_Commune' in cleaned.columns:
        # reindex guarantees two columns even when no value contains a comma;
        # otherwise the two-column assignment below raises on a length mismatch.
        parts = (
            cleaned['Mois_Commune']
            .str.split(',', n=1, expand=True)
            .reindex(columns=[0, 1])
        )
        cleaned[['Mois', 'Commune de plus de 5000 hab']] = parts
        cleaned = cleaned.drop(columns=['Mois_Commune'])

    if 'Total' in cleaned.columns:
        # Element-wise so that numeric cells (or a fully numeric column) pass
        # through untouched: the .str accessor raises on numeric columns and
        # silently turns non-string cells of a mixed column into NaN.
        cleaned['Total'] = cleaned['Total'].map(
            lambda value: value.rstrip(',').strip() if isinstance(value, str) else value
        )

    return cleaned


# __name__ is "__main__" both in a notebook kernel and when run as a script, so
# the pipeline runs exactly as before; the guard only keeps the helpers above
# importable without touching the filesystem.
if __name__ == "__main__":
    csv_files = list_monthly_csv_files(folder_path, exclude=generated_files)

    dataframes = []
    for file in csv_files:
        try:
            df = load_monthly_csv(folder_path, file)
        except Exception as e:
            # Deliberately best-effort: report the bad file and keep going.
            print(f"Error reading {file}: {e}")
        else:
            dataframes.append(df)

    if not dataframes:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No CSV file could be loaded from {folder_path!r}")

    # Combine all monthly frames, clean them, and save the result.
    combined_df = pd.concat(dataframes, ignore_index=True)
    combined_df = clean_combined_frame(combined_df)
    combined_df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Final cleaned dataset saved at: {output_file}")


In [None]:
import pandas as pd

# Input: the combined dataset; output: the same data with the postal code split out.
input_file_path = "data/1- Raw Data/STMT/full_stmt_dataset.csv"
output_file_path = "data/1- Raw Data/STMT/full_stmt_dataset_cleaned.csv"


def split_commune_postal_code(frame, column='Commune de plus de 5000 hab'):
    """Return a copy of *frame* with the last word of *column* moved to 'Postal Code'.

    Each value is stripped, then split at its last space: the left part stays
    in *column*, the right part goes into a new 'Postal Code' column. A value
    without any space keeps its text and gets a missing postal code.

    NOTE(review): the split is purely positional - a multi-word name with no
    trailing code (e.g. "Le Havre") would lose its last word to 'Postal Code'.
    Confirm every value really ends with a code.

    Raises ``KeyError`` if *column* is absent.
    """
    result = frame.copy()
    parts = (
        result[column]
        .str.strip()  # a trailing space would otherwise yield an empty code
        .str.rsplit(' ', n=1, expand=True)
        # Guarantee both columns even when no value contains a space;
        # otherwise only one column comes back and the assignment fails.
        .reindex(columns=[0, 1])
    )
    result[column] = parts[0]
    result['Postal Code'] = parts[1]
    return result


# __name__ is "__main__" both in a notebook kernel and when run as a script, so
# this runs exactly as before; the guard only keeps the helper importable
# without touching the filesystem.
if __name__ == "__main__":
    # Load the combined dataset, split the column, and save to a new file.
    df = pd.read_csv(input_file_path)
    df = split_commune_postal_code(df)
    df.to_csv(output_file_path, index=False)

    print(f"File has been updated and saved to: {output_file_path}")
