# Compare Countries — Benin, Sierra Leone, Togo
This notebook loads cleaned CSVs, computes summary statistics (mean, median, std) for GHI/DNI/DHI,
plots boxplots side-by-side, runs statistical tests (ANOVA / Kruskal-Wallis) and produces a ranking bar chart.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats

sns.set(style="whitegrid")
DATA_DIR = Path("../data")  # adjust if notebook path differs

# Country label -> path of that country's cleaned CSV.
files = {
    'Benin': DATA_DIR / "benin_clean.csv",
    'SierraLeone': DATA_DIR / "sierraleone_clean.csv",
    'Togo': DATA_DIR / "togo_clean.csv"
}

# Load each CSV into a dict of DataFrames keyed by country label.
# Fails fast with a clear message if any expected file is missing.
dfs = {}
for name, path in files.items():
    if not path.exists():
        raise FileNotFoundError(f"Expected {path} to exist. Place cleaned CSVs in data/ directory.")
    # `infer_datetime_format` was dropped here: it is deprecated since pandas 2.0
    # (it only triggers a warning there); `parse_dates` alone parses the column.
    df = pd.read_csv(path, parse_dates=['Timestamp'])
    df['country'] = name  # tag rows so the combined frame can be grouped by country
    dfs[name] = df

# Combined frame (all countries stacked, fresh 0..n-1 index); show its shape.
df_all = pd.concat(dfs.values(), ignore_index=True)
df_all.shape


In [None]:
# Per-country summary statistics for each irradiance column.
# Keys are built as "<column>_<stat>" in column-major order
# (GHI_mean, GHI_median, GHI_std, DNI_mean, ...). A column that is
# absent from a country's frame yields NaN for all of its statistics.
metrics = {
    name: {
        f'{column}_{stat}': getattr(df[column], stat)() if column in df.columns else np.nan
        for column in ('GHI', 'DNI', 'DHI')
        for stat in ('mean', 'median', 'std')
    }
    for name, df in dfs.items()
}

# One row per country, one column per statistic; display rounded to 2 decimals.
summary_df = pd.DataFrame(metrics).T
summary_df.round(2)


In [None]:
# Side-by-side boxplots: one panel per irradiance column, grouped by country.
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
for ax, metric in zip(axes, ('GHI', 'DNI', 'DHI')):
    sns.boxplot(x='country', y=metric, data=df_all, ax=ax)
    ax.set_title(f'{metric} by Country')

fig.tight_layout()
plt.show()


In [None]:
# Rank countries by their mean GHI, highest first.
ghi_means = summary_df['GHI_mean'].sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(6, 3))
sns.barplot(x=ghi_means.index, y=ghi_means.values, ax=ax)
ax.set_xlabel('Country')
# GHI_mean is the mean over all rows of each country's frame (no daily
# aggregation is performed), so the axis is labelled as a plain mean.
ax.set_ylabel('Mean GHI (W/m²)')
ax.set_title('Countries ranked by mean GHI')
plt.show()


In [None]:
# Prepare GHI arrays per country (NaNs dropped), in the iteration order of `dfs`.
ghi_groups = [dfs[name]['GHI'].dropna().values for name in dfs.keys()]

from scipy.stats import shapiro, levene, f_oneway, kruskal

# 1) Test normality (Shapiro-Wilk) on at most SHAPIRO_MAX_N points per group.
#    SciPy's documentation notes the Shapiro p-value may be inaccurate for
#    N > 5000, so larger groups are subsampled without replacement.
SHAPIRO_MAX_N = 5000
# Seeded generator so the subsample (and hence the p-values) is reproducible
# under Restart & Run All.
rng = np.random.default_rng(42)

normality = {}
for name, arr in zip(dfs.keys(), ghi_groups):
    if len(arr) <= SHAPIRO_MAX_N:
        sample = arr
    else:
        sample = rng.choice(arr, size=SHAPIRO_MAX_N, replace=False)
    stat, p = shapiro(sample)
    normality[name] = p  # p-value
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print("Shapiro-Wilk p-values (per country):")
print(pd.Series(normality).to_string())

# 2) Levene test for equal variances across the country groups.
lev_stat, lev_p = levene(*ghi_groups)
lev_p


In [None]:
# Decision rule for reading the results below: ANOVA is appropriate when the
# groups are approximately normal with equal variances; otherwise rely on
# Kruskal-Wallis. Both tests are run and reported regardless.
print("Levene p-value:", lev_p)

# One-way ANOVA across the per-country GHI groups.
anova_result = f_oneway(*ghi_groups)
anova_stat, anova_p = anova_result.statistic, anova_result.pvalue
print("ANOVA p-value:", anova_p)

# Kruskal-Wallis H-test across the same groups.
kw_result = kruskal(*ghi_groups)
kw_stat, kw_p = kw_result.statistic, kw_result.pvalue
print("Kruskal-Wallis p-value:", kw_p)

# Reading the output: a small p-value (<0.05) means at least one group differs.


### Key Observations
- Observation 1: Country Benin shows the highest median GHI but also the greatest variability.
- Observation 2: Sierra Leone has significantly lower GHI (p = `<value>`; TODO: replace with the Kruskal-Wallis p-value printed above) — consider...
- Observation 3: TODO — state whether the between-country GHI differences are statistically significant or not, based on the ANOVA and Kruskal-Wallis p-values printed above.
