import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
# Install a blanket "ignore" filter: with no category or message restriction,
# every warning raised after this point is silenced for the rest of the run.
warnings.filterwarnings("ignore")

# Read the raw CSV into the working DataFrame used by the rest of the script.
RAW_DATA_PATH = 'data/benin-malanville.csv'  # adjust path if needed
df = pd.read_csv(RAW_DATA_PATH)

# Preview of the first rows (only displayed when run as a notebook cell).
df.head()


# Descriptive statistics as produced by DataFrame.describe().
summary_stats = df.describe()

# Per-column count of missing entries and the same figure as a percentage
# of the total number of rows.
missing_values = df.isna().sum()
missing_percent = missing_values.div(len(df)).mul(100)

# Two-column report: one row per column of df.
missing_report = pd.concat(
    [missing_values, missing_percent],
    axis=1,
    keys=['Missing Count', 'Missing %'],
)

# Columns where more than 5% of the rows are missing.
high_missing = missing_report.loc[missing_report['Missing %'].gt(5)]

# Bare expression: shows the report when run as a notebook cell.
missing_report


# Columns checked for outliers and imputed below.
columns = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Z-score outlier detection.
# Scores are computed over every row of df (no dropna()), so the resulting
# mask has exactly len(df) rows and lines up with df position-for-position.
# Dropping incomplete rows first produces a shorter result that either fails
# to assign to df['Outlier'] or leaves NaN flags behind.
# nan_policy='omit' keeps NaNs out of each column's mean/std; a missing
# reading gets a NaN score, and NaN > 3 is False, so a missing value never
# flags its row on its own.
# np.asarray() normalises the result to an ndarray regardless of whether
# zscore hands back an array or a DataFrame.
z_scores = np.abs(np.asarray(stats.zscore(df[columns], nan_policy='omit')))
outliers = z_scores > 3

# A row is an outlier if any of the checked columns exceeds the threshold.
df['Outlier'] = outliers.any(axis=1)

# Count outliers
print(f"Outliers found: {df['Outlier'].sum()} rows")

# Impute missing values with each column's own median. This runs after the
# z-scores are computed, so imputed values do not influence outlier flags.
df[columns] = df[columns].fillna(df[columns].median())


# Persist the processed frame (without the positional index) to disk.
CLEAN_DATA_PATH = 'data/benin_clean.csv'
df.to_csv(CLEAN_DATA_PATH, index=False)


# Parse the Timestamp column so it can serve as a datetime index.
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# One subplot per series, all sharing the Timestamp index.
trend_columns = ['GHI', 'DNI', 'DHI', 'Tamb']
time_indexed = df.set_index('Timestamp')
time_indexed[trend_columns].plot(
    subplots=True,
    figsize=(14, 10),
    title='Solar & Temperature Trends',
)
plt.tight_layout()
plt.show()
