In [None]:
import pandas as pd
#%pip install scipy

# Location of the raw measurements file (example filename)
RAW_CSV_PATH = "sierraleone-bumbuna.csv"
df = pd.read_csv(RAW_CSV_PATH)

# Summary statistics for the loaded frame
df.describe()


In [None]:
# Missing value report: number of NaN entries per column
missing = df.isna().sum()
# Only the last expression of a cell is rendered automatically, so this
# intermediate table has to be displayed explicitly (`display` is provided
# by IPython inside a notebook kernel).
display(missing[missing > 0])

# Columns with >5% missing
threshold = 0.05 * len(df)
missing[missing > threshold]
# NOTE: these columns are only reported here; nothing is dropped in this cell.


In [None]:
from scipy.stats import zscore
import numpy as np

# Relevant columns: the numeric columns used for outlier detection,
# median imputation and the correlation heatmap in later cells
cols = ['GHI', 'DNI', 'DHI', 'ModA', 'ModB', 'WS', 'WSgust']


In [None]:
# Z-score method.
# This runs before imputation, so the columns may still contain NaN. With the
# default nan_policy='propagate' a single NaN makes the whole column's z-scores
# NaN and no outlier would ever be flagged for it; 'omit' ignores NaN when
# computing the mean/std and leaves NaN only at the missing positions.
z_scores = df[cols].apply(zscore, nan_policy='omit')
# NaN z-scores compare as False, so missing values are never marked as outliers
outliers = z_scores.abs() > 3
# View rows with an outlier in at least one of the columns
df[outliers.any(axis=1)]

# Missing values are imputed in the following cell

In [None]:
from pathlib import Path

# Impute missing values in the analysed columns with each column's median
df[cols] = df[cols].fillna(df[cols].median())

# Save the cleaned data. to_csv does not create missing parent directories,
# so make sure the output folder exists first.
Path("data").mkdir(parents=True, exist_ok=True)
df.to_csv("data/sierraleone_clean.csv", index=False)


In [None]:
#%pip install matplotlib
import matplotlib.pyplot as plt
# Parse Timestamp and make it the index. The guard keeps the cell re-runnable:
# once Timestamp has been moved into the index it is no longer a column, and
# an unguarded second run would raise KeyError.
if 'Timestamp' in df.columns:
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df = df.set_index('Timestamp')

# Plot GHI, DNI, DHI, Tamb over time
ax = df[['GHI', 'DNI', 'DHI', 'Tamb']].plot(figsize=(12,6), alpha=0.7)
ax.set_title("Solar Irradiance & Temperature Over Time")
ax.set_ylabel("Value")
plt.show()


In [None]:
# Mean ModA / ModB reading for each value of the 'Cleaning' column, as a bar chart
cleaning_means = df.groupby('Cleaning')[['ModA', 'ModB']].mean()
cleaning_means.plot.bar()
plt.title("Sensor Readings Before & After Cleaning")
plt.show()


In [None]:
#%pip install seaborn
import seaborn as sns
# Pairwise correlations between the analysed columns plus the two TMod columns
heatmap_columns = cols + ['TModA', 'TModB']
corr_matrix = df[heatmap_columns].corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()
# (no histograms are drawn in this cell)


In [None]:
# (x, y) column pairs to compare, one scatter panel per pair
scatter_pairs = [
    ('WS', 'GHI'),
    ('RH', 'Tamb'),
    ('RH', 'GHI'),
    ('WSgust', 'GHI'),
    ('WD', 'GHI'),
]

# Create the figure *before* plotting and give every pair its own Axes.
# Drawing all pairs on one shared Axes mixes unrelated x variables, and a
# plt.figure() call made after plotting only opens a new, empty figure.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
panels = axes.ravel()
for ax, (x_col, y_col) in zip(panels, scatter_pairs):
    sns.scatterplot(x=x_col, y=y_col, data=df, ax=ax)
    ax.set_title(f"{y_col} vs {x_col}")

# Hide the grid cells that have no pair assigned
for ax in panels[len(scatter_pairs):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()


In [None]:
%pip install windrose
from windrose import WindroseAxes
# Wind rose: wind speed (WS) binned by wind direction (WD)
ax = WindroseAxes.from_ax()
ax.bar(df['WD'], df['WS'], normed=True, opening=0.8, edgecolor='white')
ax.set_legend()
plt.show()

# Histograms of GHI and WS (pandas creates its own figure for these)
df[['GHI', 'WS']].hist(bins=30, figsize=(10,4))
plt.show()

# RH vs Tamb coloured by GHI, drawn on a dedicated Axes. Without it the
# scatter and its title/labels would be placed on whichever Axes is current,
# i.e. the last histogram panel created above.
fig, scatter_ax = plt.subplots()
sns.scatterplot(x='RH', y='Tamb', hue='GHI', data=df, ax=scatter_ax)
scatter_ax.set_title("Scatter Plot of RH vs Tamb with GHI Color Coding")
scatter_ax.set_xlabel("Relative Humidity (RH)")
scatter_ax.set_ylabel("Ambient Temperature (Tamb)")
plt.show()


In [None]:
# Bubble chart: GHI on x, Tamb on y, marker area taken from RH
plt.figure(figsize=(10,6))
plt.scatter(df['GHI'], df['Tamb'], s=df['RH'], alpha=0.5)
plt.xlabel('GHI')
plt.ylabel('Tamb')
plt.title('Bubble Chart: GHI vs Tamb (Bubble = RH)')
# Render the figure explicitly, as the other plotting cells do, so the cell
# output is the chart rather than the Text repr returned by plt.title
plt.show()
