In [10]:
import pandas as pd
from scipy.stats import zscore

# Map each dataset label to the CSV file it was saved under.
# Every file follows the "<label>_data.csv" naming pattern, so the
# mapping is derived from the labels instead of being spelled out twice.
file_paths = {
    label: f'{label}_data.csv'
    for label in ('benin-malanville', 'sierraleone-bumbuna', 'togo-dapaong_qc')
}

class DataQualityChecker:
    """Load several CSV datasets and print summary / data-quality reports.

    Each dataset is identified by the label used as its key in the
    ``file_paths`` mapping given to the constructor. Both report methods
    write to stdout and return ``None``.
    """

    def __init__(self, file_paths):
        """Read every CSV listed in ``file_paths`` into memory.

        Args:
            file_paths: Mapping of dataset label -> path accepted by
                ``pd.read_csv``.
        """
        # Dataset label -> DataFrame loaded from the matching CSV.
        self.data = {country: pd.read_csv(path) for country, path in file_paths.items()}

    def calculate_summary_statistics(self):
        """Print descriptive statistics of the numeric columns of each dataset.

        The printed table is ``DataFrame.describe()`` transposed (one row per
        column) with two extra columns, ``median`` and ``std_dev``.
        """
        for country, df in self.data.items():
            print(f"Summary statistics for {country}:")

            # Select only numeric columns
            numeric_cols = df.select_dtypes(include='number')

            # Calculate summary statistics
            summary_stats = numeric_cols.describe().T
            # NOTE: these repeat describe()'s '50%' and 'std' columns under
            # explicit names; kept so the printed table layout is unchanged.
            summary_stats['median'] = numeric_cols.median()
            summary_stats['std_dev'] = numeric_cols.std()

            print(summary_stats)
            print("\n")

    def check_data_quality(self, critical_columns=('GHI', 'DNI', 'DHI'), z_threshold=3):
        """Print missing-value counts, z-score outliers and negative entries.

        For every loaded dataset three sections are printed:

        * ``Missing values`` - per-column null counts, only columns with at
          least one null.
        * ``Outliers`` - z-scores of ``critical_columns`` for every row in
          which at least one of those columns has ``|z| > z_threshold``.
          Cells that are not themselves outliers are shown as NaN.
        * ``Incorrect entries`` - full rows in which at least one of
          ``critical_columns`` is negative.

        Args:
            critical_columns: Column names to screen. Defaults to
                ``('GHI', 'DNI', 'DHI')``.
            z_threshold: Absolute z-score above which a value counts as an
                outlier. Defaults to 3.

        Raises:
            KeyError: If a dataset lacks one of ``critical_columns``.
        """
        columns = list(critical_columns)

        for country, df in self.data.items():
            print(f"Data quality check for {country}:")
            missing_values = df.isnull().sum()
            print(f"Missing values:\n{missing_values[missing_values > 0]}")
            print("\n")

            # nan_policy='omit' computes mean/std from the non-null values;
            # the default ('propagate') would turn a whole column's z-scores
            # into NaN as soon as it contains a single missing value.
            z_scores = df[columns].apply(zscore, nan_policy='omit')

            # Blank out non-outlier cells, then keep rows that still have at
            # least one value. how='all' is essential: the default how='any'
            # would keep only rows where *every* column is an outlier.
            outliers = z_scores.where(z_scores.abs() > z_threshold).dropna(how='all')
            print(f"Outliers:\n{outliers}")
            print("\n")

            # Rows with a negative value in any screened column (NaN compares
            # as False, so missing values are not reported here).
            incorrect_entries = df[(df[columns] < 0).any(axis=1)]
            print(f"Incorrect entries:\n{incorrect_entries}")
            print("\n")

# Build the checker; its constructor reads every CSV named in file_paths
# into memory right away, so all files must be readable at this point.
checker = DataQualityChecker(file_paths)

# Print the per-dataset summary statistics first, then the data quality
# report (missing values, z-score outliers, negative GHI/DNI/DHI values).
checker.calculate_summary_statistics()
checker.check_data_quality()


Summary statistics for benin-malanville:
                  count        mean         std    min    25%    50%    75%  \
GHI            525600.0  240.559452  331.131327  -12.9   -2.0    1.8  483.4   
DNI            525600.0  167.187516  261.710501   -7.8   -0.5   -0.1  314.2   
DHI            525600.0  115.358961  158.691074  -12.6   -2.1    1.6  216.3   
ModA           525600.0  236.589496  326.894859    0.0    0.0    4.5  463.7   
ModB           525600.0  228.883576  316.536515    0.0    0.0    4.3  447.9   
Tamb           525600.0   28.179683    5.924297   11.0   24.2   28.0   32.3   
RH             525600.0   54.487969   28.073069    2.1   28.8   55.1   80.1   
WS             525600.0    2.121113    1.603466    0.0    1.0    1.9    3.1   
WSgust         525600.0    2.809195    2.029120    0.0    1.3    2.6    4.1   
WSstdev        525600.0    0.473390    0.273395    0.0    0.4    0.5    0.6   
WD             525600.0  153.435172  102.332842    0.0   59.0  181.0  235.1   
WDstdev    