In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import f_oneway, kruskal
import os

In [42]:
# compare_countries.ipynb

def load_cleaned_data(data_dir='../data/', countries=None):
    """
    Loads cleaned per-country CSV files, tags each with a 'Country' column,
    and stacks them into one DataFrame.

    Parameters:
        data_dir (str): Directory containing the cleaned CSV files.
            Defaults to '../data/'.
        countries (dict | None): Mapping of country name -> CSV file name inside
            `data_dir`. Defaults to the Benin, Sierra Leone, and Togo files.

    Returns:
        combined_df (DataFrame): Combined DataFrame with all countries' data
        individual_dfs (dict): Dictionary of DataFrames for each country
            (each already carries the added 'Country' column)

    Raises:
        ValueError: If `countries` is an empty mapping.
        FileNotFoundError: If any of the expected CSV files is missing.
    """
    if countries is None:
        countries = {
            'Benin': 'benin_clean.csv',
            'Sierra Leone': 'sierra_leone_clean.csv',
            'Togo': 'togo_clean.csv'
        }
    if not countries:
        # pd.concat([]) would fail later with a much less helpful message.
        raise ValueError("No countries given: 'countries' must map at least one country to a CSV file.")

    # Resolve and verify every path before reading anything, so a missing file
    # is reported immediately instead of after the other CSVs have been parsed.
    file_paths = {country: os.path.join(data_dir, file) for country, file in countries.items()}
    missing = [path for path in file_paths.values() if not os.path.isfile(path)]
    if missing:
        raise FileNotFoundError(
            f"File {', '.join(missing)} not found. Ensure CSV files are in the '{data_dir}' directory."
        )

    # Load each CSV and add 'Country' column
    individual_dfs = {}
    for country, file in countries.items():
        df = pd.read_csv(file_paths[country])
        df['Country'] = country
        individual_dfs[country] = df
        print(f"Loaded {file} with {len(df)} rows and {len(df.columns)} columns.")

    # Combine all DataFrames (dict preserves the order of `countries`)
    combined_df = pd.concat(list(individual_dfs.values()), ignore_index=True)
    print(f"Combined DataFrame created with {len(combined_df)} rows and {len(combined_df.columns)} columns.")

    return combined_df, individual_dfs

# Main execution
if __name__ == "__main__":
    print("Loading cleaned data...")
    combined_df, individual_dfs = load_cleaned_data()

    # Quick sanity check: preview the head of the stacked frame
    print("\nFirst 5 rows of combined DataFrame:")
    print(combined_df.head(5))

    # Quick sanity check: list every column that survived the merge
    print("\nColumns in combined DataFrame:")
    print(combined_df.columns.tolist())

Loading cleaned data...
Loaded benin_clean.csv with 525600 rows and 20 columns.
Loaded sierra_leone_clean.csv with 517548 rows and 20 columns.
Loaded togo_clean.csv with 525600 rows and 20 columns.
Combined DataFrame created with 1568748 rows and 20 columns.

First 5 rows of combined DataFrame:
          Timestamp  GHI  DNI  DHI  ModA  ModB  Tamb    RH   WS  WSgust  \
0  2021-08-09 00:01 -1.2 -0.2 -1.1   0.0   0.0  26.2  93.4  0.0     0.4   
1  2021-08-09 00:02 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.6  0.0     0.0   
2  2021-08-09 00:03 -1.1 -0.2 -1.1   0.0   0.0  26.2  93.7  0.3     1.1   
3  2021-08-09 00:04 -1.1 -0.1 -1.0   0.0   0.0  26.2  93.3  0.2     0.7   
4  2021-08-09 00:05 -1.0 -0.1 -1.0   0.0   0.0  26.2  93.3  0.1     0.7   

   WSstdev     WD  WDstdev   BP  Cleaning  Precipitation  TModA  TModB  \
0      0.1  122.1      0.0  998         0            0.0   26.3   26.2   
1      0.0    0.0      0.0  998         0            0.0   26.3   26.2   
2      0.5  124.6      1.5  997

In [44]:


def plot_boxplots_by_country(df, metrics, save_path=None, unit="W/m²"):
    """
    Plot one boxplot per metric, grouped by country, and optionally save them.

    Parameters:
    - df (pd.DataFrame): Combined DataFrame that includes a 'Country' column.
    - metrics (list): List of column names (metrics) to plot.
    - save_path (str): Directory path where plots will be saved (optional).
      The directory is created if needed. When omitted, each figure is shown
      instead of saved.
    - unit (str): Unit shown in parentheses on the y-axis label. Defaults to
      "W/m²"; pass an empty string or None to label the axis with the metric
      name only.
    """
    # Define a fixed color mapping
    country_colors = {
        "Benin": "red",
        "Togo": "green",
        "Sierra Leone": "yellow"
    }

    # The output directory does not depend on the metric, so create it once.
    if save_path:
        os.makedirs(save_path, exist_ok=True)

    for metric in metrics:
        # Explicit figure/axes so each plot is drawn on, saved from, and closed
        # as its own object rather than whatever pyplot considers "current".
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.boxplot(
            data=df, x="Country", y=metric, hue="Country",
            palette=country_colors, dodge=False, ax=ax,
        )
        ax.set_title(f"{metric} Comparison by Country", fontsize=14)
        ax.set_xlabel("Country", fontsize=12)
        ax.set_ylabel(f"{metric} ({unit})" if unit else f"{metric}", fontsize=12)
        ax.grid(True, linestyle='--', alpha=0.5)
        ax.legend([], [], frameon=False)  # hide redundant legend (hue duplicates the x axis)
        fig.tight_layout()

        if save_path:
            full_path = os.path.join(save_path, f"{metric}_boxplot.png")
            fig.savefig(full_path)
            print(f"Saved plot: {full_path}")
            plt.close(fig)  # release the figure; many metrics would otherwise pile up in memory
        else:
            plt.show()
# Use `combined_df` (created by the load_cleaned_data() cell); the previously
# referenced `df_all` is not defined anywhere in this notebook, so the cell
# failed with NameError on a fresh kernel.
metrics = ["GHI", "DNI", "DHI"]
plot_boxplots_by_country(combined_df, metrics, save_path="figures/compare-countries")


Saved plot: figures/compare-countries\GHI_boxplot.png
Saved plot: figures/compare-countries\DNI_boxplot.png
Saved plot: figures/compare-countries\DHI_boxplot.png


In [51]:
import pandas as pd

# Read the three cleaned per-country CSVs (Benin, Sierra Leone, Togo — in that order)
benin_df, sierra_leone_df, togo_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/benin_clean.csv',
        '../data/sierra_leone_clean.csv',
        '../data/togo_clean.csv',
    )
]

# Function to compute summary stats
def get_summary(df, country_name):
    """
    Build one summary row for a country.

    Parameters:
    - df (pd.DataFrame): Frame with 'GHI', 'DNI' and 'DHI' columns.
    - country_name (str): Value stored under the 'Country' key.

    Returns:
    - dict: 'Country' plus '<metric> Mean', '<metric> Median' and
      '<metric> Std' for each of GHI, DNI and DHI, in that order.
    """
    # (label used in the output key, Series method that computes it)
    statistics = (("Mean", "mean"), ("Median", "median"), ("Std", "std"))

    row = {'Country': country_name}
    for metric in ("GHI", "DNI", "DHI"):
        values = df[metric]
        for label, method_name in statistics:
            row[f"{metric} {label}"] = getattr(values, method_name)()
    return row

# Assemble one summary row per country, in display order
summary_data = [
    get_summary(frame, label)
    for label, frame in (
        ('Benin', benin_df),
        ('Sierra Leone', sierra_leone_df),
        ('Togo', togo_df),
    )
]

summary_table = pd.DataFrame(summary_data)

# Show the resulting table
print(summary_table)


        Country    GHI Mean  GHI Median     GHI Std    DNI Mean  DNI Median  \
0         Benin  239.984991         1.8  329.682868  166.933602        -0.1   
1  Sierra Leone  189.992949        -0.3  284.058730  105.465592        -0.1   
2          Togo  229.854439         2.1  320.681468  150.903375         0.0   

      DNI Std    DHI Mean  DHI Median     DHI Std  
0  261.019689  114.963522         1.6  157.429755  
1  201.874948  111.373794        -0.6  157.502952  
2  249.925137  116.006967         2.5  155.169413  


In [52]:


def run_statistical_tests(df, column="GHI"):
    """
    Compare one metric across countries with one-way ANOVA and Kruskal–Wallis.

    Parameters:
    - df (pd.DataFrame): Input DataFrame with 'Country' and target metric column.
    - column (str): The column to test.

    Returns:
    - dict: p-values keyed by "ANOVA p-value" and "Kruskal-Wallis p-value".
    """
    # One sample per country, with missing values removed
    samples = []
    for _, country_rows in df.groupby("Country"):
        samples.append(country_rows[column].dropna().values)

    # Run both tests on the same samples
    anova_result = f_oneway(*samples)
    kruskal_result = kruskal(*samples)

    p_values = {}
    p_values["ANOVA p-value"] = anova_result.pvalue
    p_values["Kruskal-Wallis p-value"] = kruskal_result.pvalue
    return p_values

# Run test on GHI.
# Uses `combined_df` (created by the load_cleaned_data() cell); the previously
# referenced `df_all` is not defined anywhere in this notebook, so the cell
# failed with NameError on a fresh kernel.
p_values = run_statistical_tests(combined_df, column="GHI")
print("Statistical Test Results:")
for test, p in p_values.items():
    print(f"{test}: {p:.5f}")


Statistical Test Results:
ANOVA p-value: 0.00000
Kruskal-Wallis p-value: 0.00000


In [53]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_avg_ghi_bar_chart(df, save_path=None):
    """
    Plot a bar chart ranking countries by average GHI (highest first).

    Parameters:
    - df (pd.DataFrame): DataFrame that includes 'Country' and 'GHI' columns.
    - save_path (str): Optional path to save the plot as a PNG. Missing parent
      directories are created. When omitted, the figure is shown instead.
    """
    avg_ghi = df.groupby("Country")["GHI"].mean().sort_values(ascending=False)

    # Define custom colors matching the countries
    color_mapping = {
        "Benin": "red",
        "Togo": "green",
        "Sierra Leone": "yellow"
    }
    # Countries outside the fixed mapping get a neutral colour instead of
    # aborting the whole plot with a KeyError.
    colors = [color_mapping.get(country, "gray") for country in avg_ghi.index]

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(avg_ghi.index, avg_ghi.values, color=colors)
    ax.set_title("Average GHI by Country", fontsize=14)
    ax.set_ylabel("Average GHI (W/m²)", fontsize=12)
    ax.set_xlabel("Country", fontsize=12)
    ax.grid(True, linestyle='--', alpha=0.5)
    fig.tight_layout()

    if save_path:
        # savefig does not create directories; without this the call only
        # succeeds if some earlier cell happened to create the folder.
        out_dir = os.path.dirname(save_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        fig.savefig(save_path)
        print(f"Saved plot: {save_path}")
        plt.close(fig)
    else:
        plt.show()

# Example usage:
# Uses `combined_df` (created by the load_cleaned_data() cell); the previously
# referenced `df_all` is not defined anywhere in this notebook, so the cell
# failed with NameError on a fresh kernel.
plot_avg_ghi_bar_chart(combined_df, save_path="figures/compare-countries/avg_ghi_bar_chart.png")


Saved plot: figures/compare-countries/avg_ghi_bar_chart.png
