# In [16]:
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Tuple, List, Dict
import json
from datetime import datetime

class VisualDataFileComparator:
    """Compare two tabular data files and render the result as an HTML report.

    The two files (CSV or Excel, chosen by file suffix) are loaded lazily into
    pandas DataFrames the first time a comparison method needs them. Rows are
    compared positionally (row ``i`` of file 1 against row ``i`` of file 2)
    over the columns both files share.
    """

    def __init__(self, file1_path: str, file2_path: str):
        """Initialize the comparator with paths to two files."""
        self.file1_path = Path(file1_path)
        self.file2_path = Path(file2_path)
        # DataFrames stay None until load_files() succeeds for that file.
        self.df1 = None
        self.df2 = None
        self.comparison_results = {}

    @staticmethod
    def _read_table(path: Path):
        """Read one file by suffix; return None when the suffix is unsupported."""
        suffix = path.suffix.lower()
        if suffix == '.csv':
            return pd.read_csv(path, low_memory=False)
        if suffix in ('.xlsx', '.xls'):
            return pd.read_excel(path)
        return None

    def load_files(self) -> Tuple[bool, str]:
        """Load the data files into pandas DataFrames.

        Returns:
            ``(True, message)`` when both files were read, otherwise
            ``(False, message)`` describing the unsupported format or the
            exception raised while reading. No exception escapes this method.
        """
        try:
            for attr, path in (('df1', self.file1_path), ('df2', self.file2_path)):
                frame = self._read_table(path)
                if frame is None:
                    return False, f"Unsupported file format for {path}"
                setattr(self, attr, frame)

            return True, "Files loaded successfully"
        except Exception as e:
            return False, f"Error loading files: {str(e)}"

    def _ensure_loaded(self) -> Tuple[bool, str]:
        """Load the files if either DataFrame is still missing."""
        if self.df1 is None or self.df2 is None:
            return self.load_files()
        return True, "Files loaded successfully"

    def _common_columns(self) -> List:
        """Return the columns present in both files, in file 1's column order."""
        cols2 = set(self.df2.columns)
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # report is stable from run to run (plain set iteration is not).
        return [col for col in dict.fromkeys(self.df1.columns) if col in cols2]

    @staticmethod
    def _is_plain_numeric(values) -> bool:
        """Return True for numeric, non-boolean data."""
        dtype = values.dtype
        return (pd.api.types.is_numeric_dtype(dtype)
                and not pd.api.types.is_bool_dtype(dtype))

    def compare_basic_info(self) -> Dict:
        """Compare basic information about the datasets.

        Returns:
            A dict with both shapes, both column counts and the absolute row
            and column count differences, or ``{}`` when loading fails.
        """
        loaded, _ = self._ensure_loaded()
        if not loaded:
            return {}

        return {
            'file1_shape': self.df1.shape,
            'file2_shape': self.df2.shape,
            'file1_columns': len(self.df1.columns),
            'file2_columns': len(self.df2.columns),
            'row_difference': abs(len(self.df1) - len(self.df2)),
            'column_difference': abs(len(self.df1.columns) - len(self.df2.columns))
        }

    def compare_columns(self) -> Dict:
        """Compare column names and data types between the two files.

        Returns:
            A dict with the common columns, the columns unique to each file
            (each list in its file's column order) and both dtype mappings,
            or ``{}`` when loading fails.
        """
        loaded, _ = self._ensure_loaded()
        if not loaded:
            return {}

        cols1 = set(self.df1.columns)
        cols2 = set(self.df2.columns)

        return {
            'common_columns': self._common_columns(),
            'unique_to_file1': [c for c in dict.fromkeys(self.df1.columns) if c not in cols2],
            'unique_to_file2': [c for c in dict.fromkeys(self.df2.columns) if c not in cols1],
            'file1_dtypes': self.df1.dtypes.to_dict(),
            'file2_dtypes': self.df2.dtypes.to_dict()
        }

    def calculate_data_similarity(self) -> Dict:
        """Calculate what percentage of data is identical between the two files.

        Only columns present in both files are compared, and only the first
        ``min(len(df1), len(df2))`` rows, position by position. Two missing
        values in the same cell count as a match.

        Returns:
            ``{'overall_similarity': pct, 'column_similarities': {col: {...}}}``.
            When loading fails the similarity is 0 and the mapping is empty.
        """
        loaded, _ = self._ensure_loaded()
        if not loaded:
            return {'overall_similarity': 0, 'column_similarities': {}}

        similarity_stats = {}
        min_length = min(len(self.df1), len(self.df2))
        total_cells = 0
        matching_cells = 0

        for col in self._common_columns():
            # Drop the index so the comparison is strictly positional, even
            # if the two frames carry different index labels.
            vals1 = self.df1[col].iloc[:min_length].reset_index(drop=True)
            vals2 = self.df2[col].iloc[:min_length].reset_index(drop=True)

            if vals1.dtype != vals2.dtype:
                # int vs float (e.g. one file has a NaN in the column) must be
                # compared numerically: as strings "1" and "1.0" would differ.
                # Any other dtype mix falls back to a string comparison.
                if not (self._is_plain_numeric(vals1) and self._is_plain_numeric(vals2)):
                    vals1 = vals1.astype(str)
                    vals2 = vals2.astype(str)

            matching_mask = (vals1 == vals2) | (pd.isna(vals1) & pd.isna(vals2))
            matches = int(matching_mask.sum())
            total = len(vals1)

            matching_cells += matches
            total_cells += total

            match_percentage = (matches / total) * 100 if total > 0 else 0
            similarity_stats[col] = {
                'match_percentage': match_percentage,
                'matching_values': matches,
                'total_values': total
            }

        overall_similarity = (matching_cells / total_cells * 100) if total_cells > 0 else 0

        return {
            'overall_similarity': overall_similarity,
            'column_similarities': similarity_stats
        }

    def generate_visualizations(self) -> dict:
        """Generate JSON data for visualizations.

        Returns:
            A JSON-serializable dict with the bar chart, gauge, row comparison
            and column comparison data plus a generation timestamp.

        Raises:
            RuntimeError: if the input files cannot be loaded; the message is
                the one reported by ``load_files``.
        """
        # Fail with the real load error instead of a KeyError on the empty
        # dict that compare_basic_info() returns after a failed load.
        loaded, message = self._ensure_loaded()
        if not loaded:
            raise RuntimeError(message)

        similarity_stats = self.calculate_data_similarity()
        basic_info = self.compare_basic_info()

        # Prepare data for column similarity chart
        column_similarities = similarity_stats['column_similarities']
        # Column labels are not guaranteed to be strings (e.g. Excel headers),
        # so stringify them to keep the payload JSON-serializable.
        columns = [str(col) for col in column_similarities]
        percentages = [stats['match_percentage'] for stats in column_similarities.values()]

        # Create bar chart data
        bar_chart_data = {
            'columns': columns,
            'percentages': percentages
        }

        # Create gauge chart data
        gauge_data = {
            'overall_similarity': similarity_stats['overall_similarity']
        }

        # Create row count comparison data
        row_comparison_data = {
            'labels': ['File 1', 'File 2'],
            'values': [basic_info['file1_shape'][0], basic_info['file2_shape'][0]]
        }

        # Create column count comparison data
        cols1 = set(self.df1.columns)
        cols2 = set(self.df2.columns)
        column_comparison_data = {
            'labels': ['Common Columns', 'Unique to File 1', 'Unique to File 2'],
            'values': [
                len(cols1 & cols2),
                len(cols1 - cols2),
                len(cols2 - cols1)
            ]
        }

        return {
            'bar_chart': bar_chart_data,
            'gauge': gauge_data,
            'row_comparison': row_comparison_data,
            'column_comparison': column_comparison_data,
            'timestamp': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

    def generate_html_report(self, output_path: str = "comparison_report.html"):
        """Generate an HTML report with visualizations.

        Args:
            output_path: Where the HTML file is written (UTF-8).

        Returns:
            ``output_path``, unchanged.

        Raises:
            RuntimeError: if the input files cannot be loaded.
        """
        viz_data = self.generate_visualizations()
        basic_info = self.compare_basic_info()
        json_data = json.dumps(viz_data)
        html_content = self._get_html_template(json_data, basic_info)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(html_content)

        return output_path

    def _get_html_template(self, json_data: str, basic_info: dict) -> str:
        """Return the HTML template with embedded data.

        Args:
            json_data: JSON text (as produced by ``json.dumps``) that becomes
                the ``data`` object in the page's script.
            basic_info: Result of ``compare_basic_info``; only the two
                ``*_shape`` entries are read.
        """
        from html import escape

        # File paths are shown as text, so HTML-escape them.
        file1_label = escape(str(self.file1_path))
        file2_label = escape(str(self.file2_path))
        # The JSON carries column names taken from the input files. In JSON
        # text '<', '>' and '&' can only occur inside string literals, so
        # replacing them with \uXXXX escapes keeps the data identical for
        # JavaScript while making it impossible to close the <script> element.
        safe_json = (json_data.replace("<", "\\u003c")
                              .replace(">", "\\u003e")
                              .replace("&", "\\u0026"))

        return f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Data File Comparison Report</title>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/plotly.js/2.24.2/plotly.min.js"></script>
    <script src="https://cdn.tailwindcss.com"></script>
    <style>
        .chart-container {{
            width: 100%;
            min-height: 300px;
            margin: 1rem 0;
        }}
        .card {{
            background: white;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            margin: 16px 0;
            padding: 16px;
        }}
        @media (max-width: 768px) {{
            .chart-container {{
                height: 300px;
            }}
        }}
    </style>
</head>
<body class="bg-gray-100">
    <div class="container mx-auto px-4 py-8 max-w-7xl">
        <div class="text-center mb-8">
            <h1 class="text-3xl font-bold text-gray-800 mb-2">Data File Comparison Report</h1>
            <p class="text-gray-600">Generated on <span id="timestamp"></span></p>
        </div>

        <div class="grid grid-cols-1 lg:grid-cols-2 gap-6">
            <div class="card">
                <h2 class="text-xl font-semibold mb-4">File Information</h2>
                <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
                    <div class="p-4 bg-gray-50 rounded">
                        <h3 class="font-medium text-blue-600">File 1:</h3>
                        <p class="text-sm text-gray-600 break-all">{file1_label}</p>
                        <p class="mt-2">Rows: {basic_info['file1_shape'][0]}</p>
                        <p>Columns: {basic_info['file1_shape'][1]}</p>
                    </div>
                    <div class="p-4 bg-gray-50 rounded">
                        <h3 class="font-medium text-blue-600">File 2:</h3>
                        <p class="text-sm text-gray-600 break-all">{file2_label}</p>
                        <p class="mt-2">Rows: {basic_info['file2_shape'][0]}</p>
                        <p>Columns: {basic_info['file2_shape'][1]}</p>
                    </div>
                </div>
            </div>

            <div class="card">
                <h2 class="text-xl font-semibold mb-4">Overall Similarity</h2>
                <div id="gauge-chart" class="chart-container"></div>
            </div>
        </div>

        <div class="grid grid-cols-1 lg:grid-cols-2 gap-6 mt-6">
            <div class="card">
                <h2 class="text-xl font-semibold mb-4">Row Comparison</h2>
                <div id="row-comparison-chart" class="chart-container"></div>
            </div>

            <div class="card">
                <h2 class="text-xl font-semibold mb-4">Column Distribution</h2>
                <div id="column-comparison-chart" class="chart-container"></div>
            </div>
        </div>

        <div class="card mt-6">
            <h2 class="text-xl font-semibold mb-4">Column-wise Similarity</h2>
            <div id="bar-chart" class="chart-container h-[500px]"></div>
        </div>
    </div>

    <script>
        // Parse the data passed from Python
        const data = {safe_json};
        document.getElementById('timestamp').textContent = data.timestamp;

        // Responsive layout helper
        const getResponsiveLayout = () => {{
            return {{
                paper_bgcolor: 'rgba(0,0,0,0)',
                plot_bgcolor: 'rgba(0,0,0,0)',
                margin: {{ t: 40, r: 30, l: 30, b: 40 }},
                font: {{ size: window.innerWidth < 768 ? 10 : 12 }}
            }};
        }};

        // Create gauge chart
        const gauge = {{
            type: "indicator",
            mode: "gauge+number",
            value: data.gauge.overall_similarity,
            title: {{ text: "Overall Similarity %" }},
            gauge: {{
                axis: {{ range: [0, 100] }},
                bar: {{ color: "darkblue" }},
                bgcolor: "white",
                borderwidth: 2,
                bordercolor: "gray",
                steps: [
                    {{ range: [0, 25], color: "#ff9999" }},
                    {{ range: [25, 50], color: "#ffcc99" }},
                    {{ range: [50, 75], color: "#99ff99" }},
                    {{ range: [75, 100], color: "#99ccff" }}
                ]
            }}
        }};

        // Create row comparison chart
        const rowComparison = {{
            type: 'bar',
            x: data.row_comparison.labels,
            y: data.row_comparison.values,
            marker: {{
                color: ['#4299e1', '#48bb78']
            }},
            text: data.row_comparison.values.map(String),
            textposition: 'auto'
        }};

        // Create column comparison pie chart
        const columnComparison = {{
            type: 'pie',
            labels: data.column_comparison.labels,
            values: data.column_comparison.values,
            marker: {{
                colors: ['#4299e1', '#48bb78', '#ed8936']
            }}
        }};

        // Create column similarity bar chart
        const barChart = {{
            type: 'bar',
            x: data.bar_chart.columns,
            y: data.bar_chart.percentages,
            marker: {{
                color: data.bar_chart.percentages.map(p => 
                    `rgb(${{Math.round(255 * (100 - p) / 100)}}, ${{Math.round(255 * (100 - p) / 100)}}, 255)`
                )
            }}
        }};

        // Plot all charts
        Plotly.newPlot('gauge-chart', [gauge], getResponsiveLayout());
        Plotly.newPlot('row-comparison-chart', [rowComparison], {{
            ...getResponsiveLayout(),
            title: 'Number of Rows per File',
            yaxis: {{ title: 'Row Count' }}
        }});
        Plotly.newPlot('column-comparison-chart', [columnComparison], {{
            ...getResponsiveLayout(),
            title: 'Column Distribution'
        }});
        Plotly.newPlot('bar-chart', [barChart], {{
            ...getResponsiveLayout(),
            title: 'Column Similarity Percentages',
            xaxis: {{ 
                title: 'Columns',
                tickangle: 45,
                automargin: true
            }},
            yaxis: {{ 
                title: 'Similarity %',
                range: [0, 100]
            }}
        }});

        // Make charts responsive
        window.addEventListener('resize', () => {{
            const layout = getResponsiveLayout();
            Plotly.relayout('gauge-chart', layout);
            Plotly.relayout('row-comparison-chart', layout);
            Plotly.relayout('column-comparison-chart', layout);
            Plotly.relayout('bar-chart', layout);
        }});
    </script>
</body>
</html>
        """

def main():
    """Build the comparison report for the two configured files.

    Prints the path of the generated report on success. Any exception raised
    while generating it is caught and reported on stdout instead of
    propagating.
    """
    # Replace these with your actual file paths
    source_paths = (
        r"Data\Vehicles\RoadSafetyData_Vehicles_2015\Vehicles_2015.csv",
        r"Data\RoadSafetyData_2015\Vehicles_2015.csv",
    )
    try:
        generated = VisualDataFileComparator(*source_paths).generate_html_report(
            "comparison_report.html"
        )
        print(f"Report successfully generated at: {generated}")
    except Exception as exc:
        print(f"Error generating report: {exc}")

# Script entry point. Everything below runs only when the file is executed
# directly (or as a notebook cell), never on import: the second run reads
# file1_path/file2_path, which exist only inside this guard, so leaving it at
# module level raised NameError whenever the module was imported.
if __name__ == "__main__":
    main()

    # Replace with your actual file paths
    file1_path = r"Data\Vehicles\RoadSafetyData_Vehicles_2015\Vehicles_2015.csv"
    file2_path = r"Data\RoadSafetyData_2015\Vehicles_2015.csv"

    # NOTE: this repeats the work main() just did, but without main()'s
    # try/except, so a failure here propagates with a full traceback.
    comparator = VisualDataFileComparator(file1_path, file2_path)
    report_path = comparator.generate_html_report("comparison_report.html")
    print(f"Report generated at: {report_path}")


# Output:
# Report successfully generated at: comparison_report.html
# Report generated at: comparison_report.html
