# Sierra Leone - Bumbuna Solar Dataset EDA
This notebook performs exploratory data analysis (EDA) on Sierra Leone's solar dataset.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Read the raw Sierra Leone (Bumbuna) CSV into `df` and preview the first rows
raw_csv_path = '../data/sierraleone-bumbuna.csv'
df = pd.read_csv(raw_csv_path)
df.head()

## 1. Summary Statistics & Missing Values

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the raw frame;
# bound to a name first, then displayed as the cell's output
numeric_summary = df.describe()
numeric_summary

In [None]:
# Tally NaN entries per column, then list the columns with the most gaps first
missing_counts = df.isna().sum()
missing_counts.sort_values(ascending=False)

## 2. Outlier Detection and Basic Cleaning

In [None]:
# Columns to check for outliers
columns_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Per-column z-scores. nan_policy='omit' matters: with scipy's default
# ('propagate') a single NaN in a column turns every z-score in that column
# into NaN, so `abs() > 3` is False everywhere and the column is never
# screened. With 'omit', mean/std are computed from the non-NaN values and
# only the originally-missing cells stay NaN (and are therefore not flagged).
z_scores = df[columns_to_check].apply(zscore, nan_policy='omit')
outliers = (z_scores.abs() > 3)

# Drop every row that has an outlier in at least one checked column;
# .copy() detaches the result so later assignments do not touch `df`.
df_clean = df[~outliers.any(axis=1)].copy()

# Impute remaining missing values with the per-column median of the cleaned
# frame (numeric columns only). Reassignment instead of inplace=True keeps the
# cell safe to re-run.
df_clean = df_clean.fillna(df_clean.median(numeric_only=True))

## 3. Time Series Trends

In [None]:
# Promote Timestamp to a datetime index so the plot below uses time on the
# x-axis. The guard makes the cell idempotent: once Timestamp has been moved
# into the index it is no longer a column, and re-running the unguarded
# version raises KeyError: 'Timestamp'.
if 'Timestamp' in df_clean.columns:
    df_clean['Timestamp'] = pd.to_datetime(df_clean['Timestamp'])
    df_clean = df_clean.set_index('Timestamp')

# Irradiance components and ambient temperature over the full record
df_clean[['GHI', 'DNI', 'DHI', 'Tamb']].plot(figsize=(14, 6), title='Irradiance and Temperature over Time')

## 4. Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns of the cleaned frame,
# drawn as an annotated heatmap on an explicitly created axes
corr_matrix = df_clean.corr(numeric_only=True)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')

## 5. Save Cleaned Data (Do not commit the exported CSV to Git)

In [None]:
# The time-series section moves Timestamp into the index, so writing with
# index=False would silently drop the time axis from the saved file. Write
# the index whenever it is the Timestamp index; otherwise (Timestamp is still
# a regular column) keep omitting the positional index as before.
keep_index = df_clean.index.name == 'Timestamp'
df_clean.to_csv('../data/sierraleone_clean.csv', index=keep_index)