In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as stats 
from scipy.stats import zscore
import matplotlib.pyplot as plt
from windrose import WindroseAxes

In [None]:
# Load the raw dataset into `df`. The path is relative, so the notebook
# must be run from a working directory that contains the `data/` folder.
df = pd.read_csv("data/togo-dapaong_qc.csv")

In [None]:
# Display pandas' summary statistics for `df` as the cell output.
df.describe()

In [None]:
# Display the number of missing (NaN) entries in each column.
df.isna().sum()

In [None]:
# Share of missing entries per column, expressed as a percentage.
missing_percentage = df.isna().mean().mul(100)

# Keep only the columns whose missing share is strictly above 5%.
high_null_columns = missing_percentage.loc[missing_percentage.gt(5)]

print("Columns with >5% missing values:", high_null_columns, sep="\n")

In [None]:
# Columns screened for outliers.
cols_to_check = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']

# Column-wise z-scores. nan_policy='omit' leaves NaNs out of the mean/std
# computation; with scipy's default ('propagate') a single missing value
# turns the whole column's z-scores into NaN, so nothing in that column
# could ever be flagged.
z_scores = df[cols_to_check].apply(zscore, nan_policy='omit')

# Boolean frame, True where |z| > 3. A NaN z-score compares False, so
# missing values themselves are never counted as outliers.
outliers = z_scores.abs() > 3
print("Outlier counts per column (|Z| > 3):")
print(outliers.sum())

In [None]:
# Replace every value flagged True in `outliers` with NaN.
# NOTE: this overwrites the checked columns of `df` itself, so re-running
# the outlier-detection cell afterwards sees the already-masked values.
df[cols_to_check] = df[cols_to_check].mask(outliers)

In [None]:
# Work on a copy so the NaN-masked `df` is left as it is.
df_clean = df.copy()

# One median per checked column (NaNs are skipped by `median`), then fill
# each column's missing entries with that column's own median.
column_medians = df_clean[cols_to_check].median()
df_clean[cols_to_check] = df_clean[cols_to_check].fillna(column_medians)

In [None]:
# Write the imputed frame to disk; index=False leaves the row index out
# of the file.
df_clean.to_csv("data/togo_clean.csv", index=False)

In [None]:
# Convert the Timestamp column to pandas datetimes so that the `.dt`
# accessor and time-axis plotting used in later cells work.
df_clean['Timestamp'] = pd.to_datetime(df_clean['Timestamp'])


In [None]:
# Overlay the three irradiance series and ambient temperature on a single
# time axis (all four share the same y-axis).
fig, ax = plt.subplots(figsize=(14, 7))

for series_name in ('GHI', 'DNI', 'DHI', 'Tamb'):
    ax.plot(df_clean['Timestamp'], df_clean[series_name], label=series_name)

ax.set_xlabel('Timestamp')
ax.set_ylabel('Value')
ax.set_title('Solar Irradiance and Temperature Over Time')
ax.legend()
plt.show()

In [None]:
# Tag every row with the month number extracted from its Timestamp.
df_clean['Month'] = df_clean['Timestamp'].dt.month

# Mean of each series within every month.
monthly_columns = ['GHI', 'DNI', 'DHI', 'Tamb']
monthly_avg = df_clean.groupby('Month')[monthly_columns].mean()

# Grouped bar chart: one cluster per month, one bar per series.
ax = monthly_avg.plot.bar(figsize=(12, 6))
ax.set_title('Average Solar Irradiance and Temperature by Month')
ax.set_xlabel('Month')
ax.set_ylabel('Average Value')
plt.show()

In [None]:
# Series inspected for anomalies after cleaning.
cols = ['GHI', 'DNI', 'DHI', 'Tamb']

# Column-wise z-scores. 'Tamb' is not among the median-imputed columns, so
# it may still hold NaNs; nan_policy='omit' keeps one missing value from
# turning that whole column's z-scores into NaN (scipy's default
# 'propagate' behaviour), which would hide every anomaly in it.
z_scores_clean = df_clean[cols].apply(zscore, nan_policy='omit')

# A row is anomalous when ANY of the inspected series has |z| > 3.
outliers_clean = (z_scores_clean.abs() > 3).any(axis=1)

# Explicit copy: this subset is modified in place (re-indexed) later on,
# and must not be tied to `df_clean`.
df_outliers_clean = df_clean.loc[outliers_clean].copy()

In [None]:
# Index both frames by Timestamp so the plots below can use the index as
# the time axis. The column checks make the cell safe to re-run: once
# 'Timestamp' has been moved into the index it is no longer a column, and
# an unguarded set_index('Timestamp') would raise KeyError.
if 'Timestamp' in df_clean.columns:
    df_clean['Timestamp'] = pd.to_datetime(df_clean['Timestamp'])
    df_clean = df_clean.set_index('Timestamp')

if 'Timestamp' in df_outliers_clean.columns:
    df_outliers_clean = df_outliers_clean.set_index('Timestamp')

In [None]:
# GHI over time, with the rows of `df_outliers_clean` overlaid in red.
# Those rows were flagged across several series, so a red point is not
# necessarily extreme in GHI itself.
fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=df_clean, x=df_clean.index, y='GHI', label='GHI', ax=ax)
sns.scatterplot(
    data=df_outliers_clean,
    x=df_outliers_clean.index,
    y='GHI',
    color='red',
    label='Anomalies',
    ax=ax,
)
ax.set_title("GHI with Anomalies Highlighted")
ax.legend()
plt.show()

In [None]:
# DNI over time, with the rows of `df_outliers_clean` overlaid in red.
# Those rows were flagged across several series, so a red point is not
# necessarily extreme in DNI itself.
fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=df_clean, x=df_clean.index, y='DNI', label='DNI', ax=ax)
sns.scatterplot(
    data=df_outliers_clean,
    x=df_outliers_clean.index,
    y='DNI',
    color='red',
    label='Anomalies',
    ax=ax,
)
ax.set_title("DNI with Anomalies Highlighted")
ax.legend()
plt.show()

In [None]:
# DHI over time, with the rows of `df_outliers_clean` overlaid in red.
# Those rows were flagged across several series, so a red point is not
# necessarily extreme in DHI itself.
fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=df_clean, x=df_clean.index, y='DHI', label='DHI', ax=ax)
sns.scatterplot(
    data=df_outliers_clean,
    x=df_outliers_clean.index,
    y='DHI',
    color='red',
    label='Anomalies',
    ax=ax,
)
ax.set_title("DHI with Anomalies Highlighted")
ax.legend()
plt.show()


In [None]:
# Tamb over time, with the rows of `df_outliers_clean` overlaid in red.
# Those rows were flagged across several series, so a red point is not
# necessarily extreme in Tamb itself.
fig, ax = plt.subplots(figsize=(12, 5))
sns.lineplot(data=df_clean, x=df_clean.index, y='Tamb', label='Tamb', ax=ax)
sns.scatterplot(
    data=df_outliers_clean,
    x=df_outliers_clean.index,
    y='Tamb',
    color='red',
    label='Anomalies',
    ax=ax,
)
ax.set_title("Tamb with Anomalies Highlighted")
ax.legend()
plt.show()


In [None]:
# Cleaning effect on ModA and ModB: mean reading per value of the
# 'Cleaning' column, kept as a regular column rather than the index.
module_cols = ['ModA', 'ModB']
cleaning_effect = df_clean.groupby('Cleaning', as_index=False)[module_cols].mean()

# Long format (one row per Cleaning value / module pair) so seaborn can
# colour the bars by module.
melted = cleaning_effect.melt(
    id_vars='Cleaning',
    value_vars=module_cols,
    var_name='Module',
    value_name='Average Irradiance',
)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=melted, x='Cleaning', y='Average Irradiance', hue='Module', ax=ax)
ax.set_title('Effect of Cleaning on ModA and ModB')
ax.set_xlabel('Cleaning (0 = Before, 1 = After)')
ax.set_ylabel('Average Irradiance')
# Relabel the two bar groups; presumes 'Cleaning' holds exactly the values
# 0 and 1 -- TODO confirm against the data.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Before Cleaning', 'After Cleaning'])
plt.tight_layout()
plt.show()

In [None]:
# Heatmap of pairwise correlations between the irradiance series and the
# two TMod columns.
corr_cols = ['GHI', 'DNI', 'DHI', 'TModA', 'TModB']
corr_matrix = df_clean.loc[:, corr_cols].corr()

# Annotated heatmap; each cell shows the coefficient with two decimals.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Scatter plots: WS, WSgust, WD vs. GHI -- one figure per wind variable.
#
# The marker size is set with matplotlib's `s` keyword (area in points^2).
# seaborn's `size=` is a semantic mapping that expects a data variable;
# passing the constant 1 there only creates a one-level size mapping
# instead of stating the marker size directly.
wind_panels = [
    ('WS', 'Wind Speed (WS) vs GHI'),
    ('WSgust', 'Wind Gust (WSgust) vs GHI'),
    ('WD', 'Wind Direction (WD) vs GHI'),
]

for wind_col, panel_title in wind_panels:
    fig, ax = plt.subplots()
    sns.scatterplot(data=df_clean, x=wind_col, y='GHI', s=18, legend=False, ax=ax)
    ax.set_title(panel_title)
    plt.show()


In [None]:
# RH vs. Tamb and RH vs. GHI -- one figure per target variable.
#
# The marker size is set with matplotlib's `s` keyword (area in points^2).
# seaborn's `size=` is a semantic mapping that expects a data variable;
# passing the constant 1 there only creates a one-level size mapping
# instead of stating the marker size directly.
rh_panels = [
    ('Tamb', 'Relative Humidity (RH) vs Temperature (Tamb)'),
    ('GHI', 'Relative Humidity (RH) vs GHI'),
]

for target_col, panel_title in rh_panels:
    fig, ax = plt.subplots()
    sns.scatterplot(data=df_clean, x='RH', y=target_col, s=18, legend=False, ax=ax)
    ax.set_title(panel_title)
    plt.show()

In [None]:
# Wind rose (radial bar plot) of wind speed by wind direction.
# Rows lacking either value are dropped before plotting.
df_wind = df_clean.loc[:, ['WS', 'WD']].dropna()
wind_direction = df_wind['WD']
wind_speed = df_wind['WS']

ax = WindroseAxes.from_ax()
# normed=True asks windrose for normalised bar lengths instead of raw counts.
ax.bar(wind_direction, wind_speed, normed=True, opening=0.8, edgecolor='white')
ax.set_legend()
ax.set_title('Wind Rose')
plt.show()

In [None]:
# Histograms of GHI and WS, each with a KDE curve overlaid.
# Every entry: (column, bar colour, figure title).
histogram_specs = [
    ('GHI', 'orange', 'Distribution of GHI'),
    ('WS', 'skyblue', 'Distribution of Wind Speed (WS)'),
]

for column, bar_color, figure_title in histogram_specs:
    sns.histplot(df_clean[column], kde=True, bins=30, color=bar_color)
    plt.title(figure_title)
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Relative humidity against the temperature readings and solar radiation,
# laid out as a 2x2 grid with a shared x-axis.
fig, axs = plt.subplots(2, 2, figsize=(14, 10), sharex=True)

# Panels in row-major order, matching the order of `axs.flat`.
rh_grid_panels = [
    ('Tamb', 'RH vs Tamb'),
    ('TModA', 'RH vs TModA'),
    ('TModB', 'RH vs TModB'),
    ('GHI', 'RH vs GHI'),
]

for ax, (target_col, panel_title) in zip(axs.flat, rh_grid_panels):
    sns.scatterplot(data=df_clean, x='RH', y=target_col, ax=ax)
    ax.set_title(panel_title)
    # Same x label on every panel, including the top row.
    ax.set_xlabel('Relative Humidity (%)')

plt.tight_layout()
plt.show()

In [None]:
# Bubble chart: GHI against Tamb, with marker size driven by RH.
bubble_options = dict(
    x='GHI',
    y='Tamb',
    size='RH',        # semantic size mapping on the RH column
    sizes=(20, 200),  # smallest / largest marker size in the mapping
    alpha=0.6,
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_clean, ax=ax, **bubble_options)
ax.set_title('GHI vs Tamb (Bubble size = RH)')
ax.set_xlabel('Global Horizontal Irradiance (GHI)')
ax.set_ylabel('Ambient Temperature (Tamb)')
ax.legend(title='Relative Humidity', loc='upper right')
plt.tight_layout()
plt.show()
