# In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Tuple, List, Dict

class DataFileComparator:
    """Compare two tabular data files (CSV or Excel) and summarise the differences.

    Typical use is ``DataFileComparator(path1, path2).generate_report()``.
    The individual ``compare_*`` methods read ``self.df1`` and ``self.df2``,
    which are ``None`` until ``load_files()`` succeeds (or until the caller
    assigns DataFrames to them directly).
    """

    def __init__(self, file1_path: str, file2_path: str):
        """Initialize the comparator with paths to two files."""
        self.file1_path = Path(file1_path)
        self.file2_path = Path(file2_path)
        self.df1 = None
        self.df2 = None
        # Filled in by generate_report() with the raw comparison dictionaries.
        self.comparison_results = {}

    @staticmethod
    def _read_table(path: Path):
        """Read ``path`` into a DataFrame, or return None if the suffix is unsupported."""
        suffix = path.suffix.lower()
        if suffix == '.csv':
            return pd.read_csv(path)
        if suffix in ('.xlsx', '.xls'):
            return pd.read_excel(path)
        return None

    @staticmethod
    def _is_numeric(dtype) -> bool:
        """Return True for numeric dtypes, excluding booleans.

        ``np.issubdtype`` raises TypeError for pandas extension dtypes
        (categorical, nullable integers, string dtype), so the pandas
        type-checking helpers are used instead.
        """
        return (pd.api.types.is_numeric_dtype(dtype)
                and not pd.api.types.is_bool_dtype(dtype))

    @staticmethod
    def _is_text_like(dtype) -> bool:
        """Return True for object columns and for pandas string-dtype columns."""
        return (pd.api.types.is_object_dtype(dtype)
                or pd.api.types.is_string_dtype(dtype))

    @staticmethod
    def _fmt_number(value) -> str:
        """Format ``value`` with two decimals, falling back to ``str`` (e.g. for pd.NA)."""
        try:
            return f"{value:.2f}"
        except (TypeError, ValueError):
            return str(value)

    def load_files(self) -> Tuple[bool, str]:
        """Load the data files into pandas DataFrames.

        Returns:
            A ``(success, message)`` tuple. ``success`` is False when a file
            has an unsupported extension or when reading raises an exception;
            ``message`` then describes the problem.
        """
        try:
            for attr, path in (('df1', self.file1_path), ('df2', self.file2_path)):
                frame = self._read_table(path)
                if frame is None:
                    return False, f"Unsupported file format for {path}"
                setattr(self, attr, frame)

            return True, "Files loaded successfully"
        except Exception as e:
            # Deliberate catch-all: any read failure is reported to the caller
            # as a message instead of propagating.
            return False, f"Error loading files: {str(e)}"

    def compare_basic_info(self) -> Dict:
        """Compare basic information about the datasets.

        Returns:
            Shapes, column counts and absolute row/column count differences.
        """
        rows1, ncols1 = self.df1.shape
        rows2, ncols2 = self.df2.shape
        return {
            'file1_shape': self.df1.shape,
            'file2_shape': self.df2.shape,
            'file1_columns': ncols1,
            'file2_columns': ncols2,
            'row_difference': abs(rows1 - rows2),
            'column_difference': abs(ncols1 - ncols2)
        }

    def compare_columns(self) -> Dict:
        """Compare column names and data types between the two files.

        The column lists keep the order in which the columns appear in their
        file, so the result is deterministic between runs.
        """
        # dict.fromkeys keeps first-seen order while dropping duplicate labels.
        ordered1 = list(dict.fromkeys(self.df1.columns))
        ordered2 = list(dict.fromkeys(self.df2.columns))
        cols1 = set(ordered1)
        cols2 = set(ordered2)

        return {
            'common_columns': [col for col in ordered1 if col in cols2],
            'unique_to_file1': [col for col in ordered1 if col not in cols2],
            'unique_to_file2': [col for col in ordered2 if col not in cols1],
            'file1_dtypes': self.df1.dtypes.to_dict(),
            'file2_dtypes': self.df2.dtypes.to_dict()
        }

    def compare_data_stats(self) -> Dict:
        """Compare basic statistics for numeric columns in common.

        Returns:
            A dict keyed by column name with the mean, standard deviation and
            null count of that column in each file. Only columns that are
            numeric (and not boolean) in both files are included.
        """
        cols2 = set(self.df2.columns)
        common_numeric_cols = [
            col for col in self.df1.columns
            if col in cols2
            and self._is_numeric(self.df1[col].dtype)
            and self._is_numeric(self.df2[col].dtype)
        ]

        stats = {}
        for col in common_numeric_cols:
            series1 = self.df1[col]
            series2 = self.df2[col]
            stats[col] = {
                'file1_mean': series1.mean(),
                'file2_mean': series2.mean(),
                'file1_std': series1.std(),
                'file2_std': series2.std(),
                'file1_null': int(series1.isnull().sum()),
                'file2_null': int(series2.isnull().sum())
            }
        return stats

    def compare_categorical_cols(self) -> Dict:
        """Compare unique values in categorical columns.

        A common column is considered when it is text-like (object or string
        dtype) in at least one of the two files. Null values are ignored.
        Value lists keep first-appearance order.
        """
        cols2 = set(self.df2.columns)
        common_cat_cols = [
            col for col in self.df1.columns
            if col in cols2
            and (self._is_text_like(self.df1[col].dtype)
                 or self._is_text_like(self.df2[col].dtype))
        ]

        cat_comparisons = {}
        for col in common_cat_cols:
            values1 = list(self.df1[col].dropna().unique())
            values2 = list(self.df2[col].dropna().unique())
            seen1 = set(values1)
            seen2 = set(values2)
            cat_comparisons[col] = {
                'unique_to_file1': [v for v in values1 if v not in seen2],
                'unique_to_file2': [v for v in values2 if v not in seen1],
                'common_values': [v for v in values1 if v in seen2]
            }
        return cat_comparisons

    def generate_report(self) -> str:
        """Generate a formatted report of all comparisons.

        Loads both files, runs every comparison, stores the raw results in
        ``self.comparison_results`` and returns the report text. If loading
        fails, returns ``"Error: <message>"`` instead.
        """
        success, message = self.load_files()
        if not success:
            return f"Error: {message}"

        basic_info = self.compare_basic_info()
        column_info = self.compare_columns()
        data_stats = self.compare_data_stats()
        cat_stats = self.compare_categorical_cols()

        self.comparison_results = {
            'basic_info': basic_info,
            'columns': column_info,
            'numeric_stats': data_stats,
            'categorical': cat_stats,
        }

        report = []
        report.append("=== Data File Comparison Report ===\n")

        # Basic Information
        report.append("Basic Information:")
        report.append(f"File 1: {self.file1_path}")
        report.append(f"File 2: {self.file2_path}")
        report.append(f"Rows: {basic_info['file1_shape'][0]} vs {basic_info['file2_shape'][0]}")
        report.append(f"Columns: {basic_info['file1_shape'][1]} vs {basic_info['file2_shape'][1]}\n")

        # Column Comparison (labels may be non-strings, e.g. integer headers)
        only_in_1 = ', '.join(map(str, column_info['unique_to_file1']))
        only_in_2 = ', '.join(map(str, column_info['unique_to_file2']))
        report.append("Column Analysis:")
        report.append(f"Columns unique to file 1: {only_in_1}")
        report.append(f"Columns unique to file 2: {only_in_2}\n")

        # Data Type Differences
        report.append("Data Type Differences:")
        dtypes1 = column_info['file1_dtypes']
        dtypes2 = column_info['file2_dtypes']
        for col in column_info['common_columns']:
            if dtypes1[col] != dtypes2[col]:
                report.append(f"{col}: {dtypes1[col]} vs {dtypes2[col]}")
        report.append("")

        # Numeric Column Statistics
        report.append("Numeric Column Statistics:")
        for col, stats in data_stats.items():
            report.append(f"\n{col}:")
            report.append(
                f"  Mean: {self._fmt_number(stats['file1_mean'])}"
                f" vs {self._fmt_number(stats['file2_mean'])}"
            )
            report.append(
                f"  Std Dev: {self._fmt_number(stats['file1_std'])}"
                f" vs {self._fmt_number(stats['file2_std'])}"
            )
            report.append(f"  Null Count: {stats['file1_null']} vs {stats['file2_null']}")
        report.append("")

        # Categorical Column Differences: counts only, because the value lists
        # can be very long; the full lists are kept in self.comparison_results.
        report.append("Categorical Column Differences:")
        for col, diff in cat_stats.items():
            report.append(f"\n{col}:")
            report.append(f"  Values only in file 1: {len(diff['unique_to_file1'])}")
            report.append(f"  Values only in file 2: {len(diff['unique_to_file2'])}")
            report.append(f"  Values in common: {len(diff['common_values'])}")

        return "\n".join(report)

# Example usage in Jupyter notebook:
# Replace these paths with your actual file paths.
# Forward slashes are used on purpose: in a normal string literal "\R" and
# "\A" are invalid escape sequences (Python warns about them), and forward
# slashes are accepted as path separators on Windows as well as on POSIX.
file1_path = "Data/RoadSafetyData_2015/Accidents_2015.csv"
file2_path = "Data/Accidents/RoadSafetyData_Accidents_2015/Accidents_2015.csv"

comparator = DataFileComparator(file1_path, file2_path)
print(comparator.generate_report())

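# Recorded cell output (it refers to a placeholder path rather than the paths
# assigned in the cell, so it appears to come from an earlier run):
# Error: Error loading files: [Errno 2] No such file or directory: 'path\\to\\your\\first\\file.csv'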
