In [22]:
import os
import pandas as pd
import shutil

# Input folder plus the two destination folders used for sorting
folder_A = "Folder_A"
bad_data_folder = "Folder_B"
good_data_folder = "Folder_C"

# Make sure both destinations exist (no error if they already do)
for output_folder in (good_data_folder, bad_data_folder):
    os.makedirs(output_folder, exist_ok=True)

def check_errors(df, file):
    """
    Check if the dataset contains any introduced errors.

    Every check is evaluated in turn (there is no short-circuiting), and each
    check that matches at least one row prints an ``[ERROR]`` line that names
    ``file``. The DataFrame is only read; it is never modified.

    Args:
        df: DataFrame with the columns 'airline', 'price', 'days_left',
            'source_city', 'destination_city', 'duration', 'class',
            'flight' and 'stops'.
        file: Label used in the printed messages.

    Returns:
        True if errors are found, otherwise False.

    Raises:
        KeyError: If one of the columns listed above is missing.
    """
    error_found = False

    # Check for blank price values in SpiceJet
    if ((df['airline'] == 'SpiceJet') & df['price'].isna()).any():
        error_found = True
        print(f"[ERROR] {file}: Blank price values in SpiceJet.")

    # Check for negative days_left values
    if (df['days_left'] < 0).any():
        error_found = True
        print(f"[ERROR] {file}: Negative 'days_left' values.")

    # Check for same source and destination cities
    if (df['source_city'] == df['destination_city']).any():
        error_found = True
        print(f"[ERROR] {file}: Same source and destination cities.")

    # Check for Yes/No in duration
    if df['duration'].astype(str).isin(['Yes', 'No']).any():
        error_found = True
        print(f"[ERROR] {file}: Invalid 'duration' values (Yes/No).")

    # Check for Premium class
    if (df['class'] == 'Premium').any():
        error_found = True
        print(f"[ERROR] {file}: Premium class found.")

    # Check for Air India flights with Vistara flight numbers.
    # The string conversion goes into a local Series so that .str works on
    # any dtype without overwriting the caller's 'flight' column.
    flight_numbers = df['flight'].astype(str)
    if ((df['airline'] == 'Air_India') & flight_numbers.str.startswith('UK-')).any():
        error_found = True
        print(f"[ERROR] {file}: Air India flights with Vistara flight numbers.")

    # Check for zero stops with duration > 20 hours.
    # Non-numeric durations become NaN, which never compares greater than 20.
    numeric_duration = pd.to_numeric(df['duration'], errors='coerce')
    if ((df['stops'] == 'zero') & (numeric_duration > 20)).any():
        error_found = True
        print(f"[ERROR] {file}: Zero stops with duration > 20 hours.")

    return error_found

# Collect the .csv files in Folder_A, leaving out hidden (dot-prefixed) entries
files = [
    name
    for name in os.listdir(folder_A)
    if not name.startswith(".") and name.endswith(".csv")
]

# Running tallies for the final summary
good_data_count = 0
bad_data_count = 0
skipped_files = []

# Validate every file and copy it to the matching folder
for file in files:
    file_path = os.path.join(folder_A, file)

    try:
        df = pd.read_csv(file_path)
        has_errors = check_errors(df, file)

        # Route the copy by the validation result
        destination = bad_data_folder if has_errors else good_data_folder
        shutil.copy(file_path, os.path.join(destination, file))

        if has_errors:
            bad_data_count += 1
            print(f"{file} is copied to bad_data.")
        else:
            good_data_count += 1
            print(f"{file} is copied to good_data.")

    except Exception as e:
        # A failure while reading, validating or copying skips this file only
        skipped_files.append(file)
        print(f"[ERROR] Could not process {file}: {e}")

# Report the totals; the first count includes files that were skipped
print(f"\nTotal files processed: {len(files)}")
print(f"Files copied to bad_data: {bad_data_count}")
print(f"Files copied to good_data: {good_data_count}")
print(f"Files skipped due to errors: {len(skipped_files)}")

[ERROR] split_28768.csv: Invalid 'duration' values (Yes/No).
split_28768.csv is copied to bad_data.
split_8289.csv is copied to good_data.
split_20334.csv is copied to good_data.
split_17598.csv is copied to good_data.
split_16686.csv is copied to good_data.
split_29476.csv is copied to good_data.
split_26745.csv is copied to good_data.
split_18873.csv is copied to good_data.
split_9197.csv is copied to good_data.
split_13926.csv is copied to good_data.
split_22523.csv is copied to good_data.
split_14091.csv is copied to good_data.
split_5975.csv is copied to good_data.
[ERROR] split_24152.csv: Premium class found.
split_24152.csv is copied to bad_data.
split_24634.csv is copied to good_data.
split_13098.csv is copied to good_data.
split_22245.csv is copied to good_data.
split_12386.csv is copied to good_data.
split_29310.csv is copied to good_data.
[ERROR] split_26023.csv: Negative 'days_left' values.
split_26023.csv is copied to bad_data.
split_7804.csv is copied to good_data.
[ERROR