# EDA_4 - Removing unintentionally generated duplicates

In [None]:
# The preceding operation led to an unrealistic increase in the size of the final appended table. A subsequent
# root cause analysis revealed a significant amount of duplication. Records whose values were all identical to
# their duplicate therefore had to be removed, while rows where a user performed multiple operations were retained.

In [None]:
import os
import pandas as pd
from pathlib import Path

# 1. Define paths
source_dir = r"C:\Users\user\Desktop\.....\PY_10_PYClean_Source_V7"
output_dir = r"C:\Users\user\Desktop\.....\PY_10_PYClean_Source_V10"


def drop_consecutive_duplicates(df):
    """Return *df* without rows that exactly repeat the row directly above.

    Two cells count as equal when they hold the same value or when both are
    missing (NaN/None). A plain ``==`` comparison treats NaN as unequal to
    itself, so repeated rows containing an empty cell would otherwise never
    be recognised as duplicates.

    Only adjacent repeats are removed; identical rows separated by at least
    one different row are all kept. The original index labels are preserved.
    """
    if df.empty:
        return df

    previous = df.shift(1)
    same_cell = df.eq(previous) | (df.isna() & previous.isna())
    is_repeat = same_cell.all(axis=1)

    # The first row has no predecessor: shift() fills it with missing values,
    # which would wrongly match a first row made up entirely of empty cells.
    is_repeat.iloc[0] = False

    return df.loc[~is_repeat]


def main():
    """Clean every ``.csv`` file in ``source_dir`` and write it to ``output_dir``."""
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # 2. Process the files
    for filename in os.listdir(source_dir):
        if not filename.endswith(".csv"):
            continue

        file_path = os.path.join(source_dir, filename)
        # No header row; every column is read as string so values are compared as text
        df = pd.read_csv(file_path, header=None, dtype=str)

        # 3. Remove consecutive duplicate rows
        df_cleaned = drop_consecutive_duplicates(df)

        # 4. Save to new folder
        output_path = os.path.join(output_dir, filename)
        df_cleaned.to_csv(output_path, index=False, header=False, encoding='utf-8-sig')

        print(f"{filename} processed and saved.")

    print("✅ All files processed and saved in V10 folder.")


# Runs when executed as a notebook cell or as a script (both set __name__ to "__main__").
if __name__ == "__main__":
    main()
