In [3]:
import pandas as pd

def remove_rows_with_N_size(input_file, output_file=None):
    """
    Loads a CSV, drops rows where 'Size' is exactly 'N', and prints how many rows were removed.

    The comparison is an exact string match: values such as 'n' or ' N', and
    missing values, are kept.

    Parameters:
    - input_file (str): The file path for the input CSV file (anything accepted
      by pd.read_csv).
    - output_file (str, optional): Not used by this function; nothing is written
      to disk here. The parameter is kept only so existing two-argument calls
      keep working. The caller is responsible for saving the returned DataFrame.

    Returns:
    - df (pd.DataFrame): The DataFrame with rows removed where 'Size' is 'N'.
      The original row index labels are preserved (the index is not reset).

    Raises:
    - KeyError: If the CSV has no 'Size' column.
    """
    # Load the CSV
    df = pd.read_csv(input_file)

    # Keep track of the initial number of rows
    initial_row_count = len(df)

    # Remove rows where 'Size' is 'N'
    df = df[df['Size'] != 'N']

    # Calculate how many rows were removed
    rows_removed = initial_row_count - len(df)

    # Print how many rows were removed
    print(f"Number of rows removed due to 'N' size: {rows_removed}")

    return df

def remove_duplicate_rows(df):
    """
    Drops rows that repeat an earlier row in every column other than
    'Listing ID', 'Date Posted', 'Title', and 'URL', and reports how many
    rows were dropped. The first occurrence of each duplicate group is kept.

    Parameters:
    - df (pd.DataFrame): The DataFrame to deduplicate. It is not modified.

    Returns:
    - df (pd.DataFrame): A new DataFrame with the duplicate rows removed.
    - rows_removed (int): The number of rows removed.
    """
    # Per-listing columns that differ even between otherwise identical rows,
    # so they are left out of the duplicate comparison
    ignored_columns = ['Listing ID', 'Date Posted', 'Title', 'URL']

    # Every remaining column takes part in the comparison
    comparison_columns = df.columns.difference(ignored_columns)

    deduplicated = df.drop_duplicates(subset=comparison_columns)

    # Row-count difference between input and result is the number removed
    return deduplicated, len(df) - len(deduplicated)

# Example usage: clean one day's scraped listings and save the result.
# Both paths are hard-coded absolute Windows paths; adjust them before running elsewhere.
input_file = r'C:\Users\17809\Projects\data\scraped_data_2024-10-22.csv'
output_file = r'C:\Users\17809\Projects\data\cleaned_scraped_data_2024-10-22.csv'

# Step 1: Load the input CSV and remove rows where 'Size' is 'N'.
# NOTE: output_file is passed here, but remove_rows_with_N_size does not write to it.
cleaned_df = remove_rows_with_N_size(input_file, output_file)

# Step 2: Remove duplicate rows (compared on all columns except
# 'Listing ID', 'Date Posted', 'Title', and 'URL')
deduped_df, rows_removed = remove_duplicate_rows(cleaned_df)

# Save the cleaned and deduplicated data to a new CSV.
# This is the only write to output_file; index=False leaves the row index out of the file.
deduped_df.to_csv(output_file, index=False)

# Output the number of rows removed during deduplication
print(f"Number of duplicate rows removed: {rows_removed}")

Number of rows removed due to 'N' size: 65
Number of duplicate rows removed: 25
