In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import f_oneway

In [None]:
# Read each country's *_clean.csv file and tag every row with its country name.
benin = pd.read_csv("../data/benin_clean.csv").assign(Country='Benin')
togo = pd.read_csv("../data/togo_clean.csv").assign(Country='Togo')
sierra_leone = pd.read_csv("../data/sierraleone_clean.csv").assign(Country='Sierra Leone')

# Stack the three frames into a single table; ignore_index renumbers rows 0..n-1.
df_all = pd.concat(
    (benin, togo, sierra_leone),
    ignore_index=True,
)

In [None]:
# Fixed colour per country so each one is drawn the same way in every figure.
custom_palette = {
    "Benin": "#0095ff",         # blue
    "Togo": "#ff7f0e",          # orange
    "Sierra Leone": "#09b509"   # green
}

# One boxplot per irradiance column; a loop replaces three copy-pasted stanzas.
for metric in ('GHI', 'DNI', 'DHI'):
    fig, ax = plt.subplots()
    # Passing `palette` without `hue` is deprecated in seaborn 0.13; mapping
    # hue to the x variable keeps the same per-country colours. dodge=False
    # keeps each box centred on its tick in older seaborn releases.
    sns.boxplot(
        data=df_all,
        x='Country',
        y=metric,
        hue='Country',
        palette=custom_palette,
        dodge=False,
        ax=ax,
    )
    # Some seaborn versions add a legend that merely repeats the x tick
    # labels; remove it when present so the figure matches the original.
    redundant_legend = ax.get_legend()
    if redundant_legend is not None:
        redundant_legend.remove()
    ax.set_title(f'{metric} Distribution by Country')
    plt.show()


In [None]:
# Per-country mean, median and standard deviation of the three irradiance
# columns, rounded to two decimals. The bare `summary` on the last line
# renders the table as the cell output.
summary = (
    df_all
    .groupby('Country')[['GHI', 'DNI', 'DHI']]
    .agg(['mean', 'median', 'std'])
    .round(2)
)
summary

In [None]:
# Extract GHI values per country.
# Missing readings are dropped first: a single NaN in any sample would make
# f_oneway return NaN, and the significance check below would then fail silently.
ghi_benin = benin["GHI"].dropna()
ghi_togo = togo["GHI"].dropna()
ghi_sierra = sierra_leone["GHI"].dropna()

# One-way ANOVA: tests the null hypothesis that the three samples share the same mean.
f_stat, p_val = f_oneway(ghi_benin, ghi_togo, ghi_sierra)

print(f"ANOVA F-statistic: {f_stat:.2f}")
print(f"ANOVA test p-value: {p_val:.5f}")
# 0.05 is the significance threshold; always report an outcome, including
# the non-significant case that previously printed nothing.
if p_val < 0.05:
    print("There is a significant difference in GHI between the countries.")
else:
    print("No significant difference in GHI between the countries was detected.")

### Key Observations

- **Benin** shows the **highest average and median GHI**, indicating strong solar potential, but also exhibits relatively high variability.
- **Sierra Leone** has the **lowest GHI values overall**, suggesting comparatively lower solar resource availability.
- **Togo** maintains **moderate GHI levels with low variability**, making it a more stable and predictable location for solar energy projects.


In [None]:
# Mean GHI for each country, computed from the individual country frames.
avg_ghi = {
    'Benin': benin['GHI'].mean(),
    'Togo': togo['GHI'].mean(),
    'Sierra Leone': sierra_leone['GHI'].mean()
}

# Convert to a two-column DataFrame ordered from highest to lowest average.
# Built in a single expression so re-running the cell never depends on a
# previous value of avg_ghi_df.
avg_ghi_df = (
    pd.DataFrame(list(avg_ghi.items()), columns=['Country', 'Average GHI'])
    .sort_values(by='Average GHI', ascending=False)
)

# Plot
fig, ax = plt.subplots(figsize=(6, 4))
# Use the shared custom_palette so each country keeps the colour it has in the
# boxplots. hue='Country' is required because seaborn 0.13 deprecates passing
# `palette` without `hue`; dodge=False keeps bars centred in older releases.
sns.barplot(
    data=avg_ghi_df,
    x='Country',
    y='Average GHI',
    hue='Country',
    palette=custom_palette,
    dodge=False,
    ax=ax,
)
# Some seaborn versions add a legend that only repeats the x tick labels.
redundant_legend = ax.get_legend()
if redundant_legend is not None:
    redundant_legend.remove()
ax.set_title('Average GHI by Country')
ax.set_ylabel('Average GHI')
ax.set_xlabel('Country')
fig.tight_layout()
plt.show()