# Benin Solar Dataset Exploratory Data Analysis
This notebook performs exploratory data analysis (EDA) on Benin's solar dataset. It includes data cleaning, visualization, and analysis tasks to uncover trends and relationships in the data.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

# Load the raw dataset from disk.
file_path = '../src/data/benin-malanville.csv'
df = pd.read_csv(file_path)

# Ensure the Timestamp column is parsed as datetime so it can be used as a time axis.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Drop the 'Comments' column (expected to be entirely empty in this dataset).
df = df.drop(columns=['Comments'])

# Replace negative values in GHI, DNI, and DHI with NaN.
# DataFrame.where keeps entries where the condition holds and writes NaN
# elsewhere; it is the vectorized replacement for the deprecated
# element-wise DataFrame.applymap.
columns_to_check_negatives = ['GHI', 'DNI', 'DHI']
irradiance = df[columns_to_check_negatives]
df[columns_to_check_negatives] = irradiance.where(irradiance >= 0)

# Display summary statistics and missing value report.
print("Summary Statistics:")
print(df.describe())

print("\nMissing Values:")
missing_values = df.isna().sum()
print(missing_values)
columns_with_nulls = missing_values[missing_values > len(df) * 0.05].index.tolist()
print("\nColumns with >5% nulls:", columns_with_nulls)

# Impute the remaining gaps in the irradiance columns with each column's median.
columns_to_impute = ['GHI', 'DNI', 'DHI']
for col in columns_to_impute:
    df[col] = df[col].fillna(df[col].median())  # Assign back; avoids chained-assignment warning

# Keep only rows in which at least 95% of the columns are populated.
# `thresh` is documented as an int, so round the fractional count up; this
# selects exactly the same rows as comparing against the unrounded float.
min_non_null = int(np.ceil(len(df.columns) * 0.95))
df = df.dropna(thresh=min_non_null)

# Detect and handle outliers using Z-scores.
columns_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']
# nan_policy='omit' computes each column's mean/std from its non-NaN entries.
# With the default ('propagate') a single NaN turns the whole column's
# z-scores into NaN, which silently disables outlier detection for it.
z_scores = np.abs(zscore(df[columns_to_check], nan_policy='omit'))
outliers = (z_scores > 4)  # Relaxed threshold to 4; NaN scores compare False

rows_before = len(df)
df = df[(~outliers).all(axis=1)]  # Keep rows with no outlier in any checked column

rows_removed = rows_before - len(df)
print(f"Number of rows removed due to outliers: {rows_removed}")

# Save the cleaned dataset (skipped when every row was filtered out).
output_path = '../src/data/benin_clean.csv'
if not df.empty:
    df.to_csv(output_path, index=False)
    print(f"Cleaned data saved to {output_path}")
else:
    print("DataFrame is empty. No data to save.")

# Data Cleaning
This section handles missing values and outliers and ensures the dataset is ready for analysis.

In [None]:
# Time Series Analysis
import matplotlib.dates as mdates

if df.empty:
    print("DataFrame is empty. No data to plot.")
else:
    fig, ax = plt.subplots(figsize=(14, 7))

    # Draw one line per irradiance component against the shared time axis.
    series_colours = {'GHI': 'orange', 'DNI': 'blue', 'DHI': 'green'}
    for component, colour in series_colours.items():
        ax.plot(df['Timestamp'], df[component], label=component, alpha=0.8, color=colour)

    # One major tick per month, rendered as YYYY-MM and rotated to avoid overlap.
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
    fig.autofmt_xdate()

    ax.set_xlabel('Timestamp')
    ax.set_ylabel('Irradiance (W/m²)')
    ax.set_title('Solar Irradiance Over Time')
    ax.legend()
    ax.grid(True)
    plt.show()

# Time Series Analysis
Visualize solar irradiance trends over time to identify patterns and anomalies.

In [None]:
# Cleaning Impact Analysis
if 'Cleaning' not in df.columns:
    print("Cleaning flag column not found in the DataFrame.")
else:
    # Mean ModA/ModB reading for each value of the Cleaning flag.
    cleaning_groups = df.groupby('Cleaning')[['ModA', 'ModB']].mean()

    # DataFrame.plot opens its own figure and returns the Axes it drew on.
    ax = cleaning_groups.plot(kind='bar', figsize=(10, 6), color=['skyblue', 'lightgreen'])
    ax.set_title('Average ModA & ModB Pre/Post-Clean')
    ax.set_ylabel('Average Sensor Readings')
    ax.set_xlabel('Cleaning Flag (0 = Pre-Clean, 1 = Post-Clean)')
    ax.legend(['ModA', 'ModB'])
    ax.grid(axis='y')
    plt.show()

# Cleaning Impact Analysis
Analyze the effect of cleaning operations on sensor readings.

In [None]:
# Correlation & Relationship Analysis
if df.empty:
    print("DataFrame is empty. Correlation and relationship analysis cannot be performed.")
else:
    # Pairwise correlations between the irradiance and module sensor columns.
    correlation_columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB']
    correlation_matrix = df[correlation_columns].corr()

    # Annotated heatmap of the correlation matrix.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        correlation_matrix,
        annot=True,
        cmap='coolwarm',
        fmt='.2f',
        linewidths=0.5,
        ax=ax,
    )
    ax.set_title('Correlation Heatmap')
    plt.show()

    # One scatter plot per (x, y) column pair, each on its own figure.
    scatter_columns = [
        ('WS', 'GHI'),
        ('WSgust', 'GHI'),
        ('WD', 'GHI'),
        ('RH', 'Tamb'),
        ('RH', 'GHI'),
    ]
    for x_col, y_col in scatter_columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.scatterplot(data=df, x=x_col, y=y_col, alpha=0.7, ax=ax)
        ax.set_title(f'{x_col} vs. {y_col}')
        ax.set_xlabel(x_col)
        ax.set_ylabel(y_col)
        ax.grid(True)
        plt.show()

# Correlation & Relationship Analysis
Explore relationships between variables using heatmaps and scatter plots.

In [None]:
# Wind & Distribution Analysis
if not df.empty:
    # Wind rose: observation counts per direction sector, stacked by speed class.
    if 'WS' in df.columns and 'WD' in df.columns:
        # Rows lacking either value cannot be placed in a sector/class.
        wind = df[['WD', 'WS']].dropna()

        n_sectors = 36  # 10-degree direction sectors
        sector_width = 360 / n_sectors
        # Wrap directions into [0, 360) and map each to its sector index; the
        # trailing modulo guards against a float result of exactly 360.
        sector = ((wind['WD'].to_numpy() % 360) // sector_width).astype(int) % n_sectors

        # WS is continuous, so using it directly as a hue would create one
        # level per distinct reading. Bin it into a handful of classes instead.
        # NOTE(review): the class breaks are a readability choice - adjust to
        # the site's actual wind-speed range and units.
        speed_breaks = [2, 4, 6, 8]
        speed_labels = ['< 2', '2-4', '4-6', '6-8', '>= 8']
        speed_class = np.digitize(wind['WS'].to_numpy(), speed_breaks)

        # counts[i, j] = number of observations in sector i and speed class j.
        counts = np.zeros((n_sectors, len(speed_labels)), dtype=int)
        np.add.at(counts, (sector, speed_class), 1)

        fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': 'polar'})
        # Compass orientation; assumes WD is in degrees with 0 = north,
        # increasing clockwise - TODO confirm against the dataset description.
        ax.set_theta_zero_location('N')
        ax.set_theta_direction(-1)

        sector_centres = np.deg2rad((np.arange(n_sectors) + 0.5) * sector_width)
        class_colours = plt.cm.viridis(np.linspace(0, 1, len(speed_labels)))
        stacked_base = np.zeros(n_sectors)
        for class_idx, (label, colour) in enumerate(zip(speed_labels, class_colours)):
            heights = counts[:, class_idx]
            ax.bar(
                sector_centres,
                heights,
                width=np.deg2rad(sector_width),
                bottom=stacked_base,
                color=colour,
                edgecolor='white',
                linewidth=0.3,
                label=label,
            )
            stacked_base = stacked_base + heights

        ax.set_title('Wind Rose Plot')
        ax.legend(title='Wind Speed (WS)', loc='upper left', bbox_to_anchor=(1.05, 1.0))
        plt.show()
    else:
        print("Columns 'WS' or 'WD' not found in the DataFrame.")

    # Histogram (with KDE overlay) of GHI.
    if 'GHI' in df.columns:
        plt.figure(figsize=(8, 6))
        sns.histplot(df['GHI'], bins=30, kde=True, color='blue')
        plt.title('Histogram of GHI')
        plt.xlabel('Global Horizontal Irradiance (W/m²)')
        plt.ylabel('Frequency')
        plt.grid(True)
        plt.show()
    else:
        print("Column 'GHI' not found in the DataFrame.")
else:
    print("DataFrame is empty. Wind and distribution analysis cannot be performed.")

# Wind & Distribution Analysis
Visualize wind direction and speed distribution, along with irradiance histograms.

In [None]:
# Temperature Analysis
if df.empty:
    print("DataFrame is empty. Temperature analysis cannot be performed.")
else:
    # Each entry: (y column, marker colour, plot title, y-axis label).
    # Every plot uses relative humidity (RH) on the x-axis.
    humidity_plots = [
        ('Tamb', 'red', 'Relative Humidity vs Temperature', 'Temperature (°C)'),
        (
            'GHI',
            'green',
            'Relative Humidity vs Global Horizontal Irradiance',
            'Global Horizontal Irradiance (W/m²)',
        ),
    ]
    for target, colour, title, target_label in humidity_plots:
        # Report and skip a plot whose columns are not both present.
        if not {'RH', target}.issubset(df.columns):
            print(f"Columns 'RH' or '{target}' not found in the DataFrame.")
            continue

        fig, ax = plt.subplots(figsize=(8, 6))
        sns.scatterplot(data=df, x='RH', y=target, alpha=0.7, color=colour, ax=ax)
        ax.set_title(title)
        ax.set_xlabel('Relative Humidity (%)')
        ax.set_ylabel(target_label)
        ax.grid(True)
        plt.show()

# Temperature Analysis
Analyze how relative humidity influences temperature and solar radiation.

In [None]:
# Bubble Chart
if df.empty:
    print("DataFrame is empty. Bubble chart cannot be created.")
elif not {'GHI', 'Tamb', 'RH'}.issubset(df.columns):
    print("Columns 'GHI', 'Tamb', or 'RH' not found in the DataFrame.")
else:
    # GHI on x, Tamb on y; marker area is RH scaled by 10.
    bubble_sizes = df['RH'] * 10

    fig, ax = plt.subplots(figsize=(10, 8))
    ax.scatter(
        df['GHI'],
        df['Tamb'],
        s=bubble_sizes,
        alpha=0.5,
        c='blue',
        edgecolors='w',
        linewidth=0.5,
    )
    ax.set_title('Bubble Chart: GHI vs Temperature with RH as Bubble Size')
    ax.set_xlabel('Global Horizontal Irradiance (W/m²)')
    ax.set_ylabel('Temperature (°C)')
    ax.grid(True)
    plt.show()

# Bubble Chart
Visualize the relationship between irradiance and temperature, with humidity as bubble size.