In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Load the raw measurements from the CSV file in the working directory.
df = pd.read_csv('benin-malanville.csv')

# Columns that must be complete for the later analysis; extend as needed.
critical_columns = ['GHI', 'DNI', 'Tamb']

# Report how many values are missing per critical column before cleaning.
missing_data = df[critical_columns].isna().sum()
print(f"Missing data in critical columns:\n{missing_data}\n")

# First pass: fill gaps by linear interpolation down each column (row-wise
# axis is the default). Other methods, e.g. method='polynomial' with
# order=2, can be substituted here if a linear fit is not appropriate.
interpolated = df[critical_columns].interpolate(method='linear')
df[critical_columns] = interpolated

# Second pass: drop every row that still lacks a critical value after the
# interpolation above. Note that both passes run, one after the other.
df = df.dropna(subset=critical_columns)

# Confirm the outcome by recounting missing values per critical column.
missing_data_after = df[critical_columns].isna().sum()
print(f"Missing data after handling:\n{missing_data_after}\n")

# Persist the cleaned table without the index column.
df.to_csv('cleaned_data.csv', index=False)

In [None]:
import pandas as pd

# Relies on the DataFrame 'df' created in an earlier cell.
# Parse the 'Timestamp' column into datetime values using an explicit
# 'year-month-day hour:minute' layout.
timestamp_format = '%Y-%m-%d %H:%M'
parsed_timestamps = pd.to_datetime(df['Timestamp'], format=timestamp_format)
df['Timestamp'] = parsed_timestamps

# Show the first few converted values as a quick sanity check.
print(df['Timestamp'].head())

In [None]:
import pandas as pd

# Assuming your dataset is stored in a DataFrame called 'df'

# Define a function to detect outliers using the IQR method
def detect_outliers(df: pd.DataFrame, column: str, *, multiplier: float = 1.5) -> pd.DataFrame:
    """Return the rows of *df* whose *column* value is an IQR outlier.

    A value is an outlier when it lies strictly below
    ``Q1 - multiplier * IQR`` or strictly above ``Q3 + multiplier * IQR``,
    where Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``.

    Args:
        df: Table to inspect; it is not modified.
        column: Label of the numeric column to test.
        multiplier: Width of the fences in units of the IQR. The default of
            1.5 is the conventional Tukey fence; larger values flag fewer
            rows. Must be non-negative.

    Returns:
        A DataFrame holding the outlier rows with all original columns and
        their original index labels. It is empty when nothing falls outside
        the fences, including when *df* itself has no rows.

    Raises:
        KeyError: If *column* is not present in *df*.
        ValueError: If *multiplier* is negative.
    """
    if multiplier < 0:
        raise ValueError("multiplier must be non-negative")

    values = df[column]

    # Quartiles and the interquartile range of the column.
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    fence_width = multiplier * (q3 - q1)

    lower_bound = q1 - fence_width
    upper_bound = q3 + fence_width

    # Missing values compare False against both bounds, so they are never
    # reported as outliers.
    is_outlier = (values < lower_bound) | (values > upper_bound)
    return df[is_outlier]

# Run the IQR outlier check on wind speed, ambient temperature and
# precipitation. Relies on 'df' and 'detect_outliers' from earlier cells.
wind_speed_outliers = detect_outliers(df, 'WS')
temperature_outliers = detect_outliers(df, 'Tamb')
precipitation_outliers = detect_outliers(df, 'Precipitation')

# Print each result as a heading followed by the timestamp and the
# offending value of every flagged row.
outlier_report = (
    ("Outliers in Wind Speed:", wind_speed_outliers, 'WS'),
    ("\nOutliers in Temperature:", temperature_outliers, 'Tamb'),
    ("\nOutliers in Precipitation:", precipitation_outliers, 'Precipitation'),
)
for heading, flagged_rows, value_column in outlier_report:
    print(heading)
    print(flagged_rows[['Timestamp', value_column]])