In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#to ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Benin (Malanville) measurement file into a DataFrame.
# The path has to be a raw string: in a normal literal "\U" starts a 32-bit unicode
# escape, so "C:\Users..." is rejected with a SyntaxError before the cell can run
# (and the "\b" in "\benin" would otherwise become a backspace character).
# NOTE(review): this is a machine-specific absolute path; a path relative to the
# project directory would make the notebook reproducible on other machines.
data = pd.read_csv(r"C:\Users\ELU\Desktop\Solar Radiation Measurement Data_EDA & statics\my_project\data\benin-malanville.csv")

In [None]:
# Preview the first rows of the freshly loaded frame
data.head()

In [None]:

# Column names, non-null counts and dtypes of the loaded frame
data.info()

In [None]:
# Number of distinct values in each column
data.nunique()

In [None]:

# Percentage of missing values per column
data.isna().sum().div(len(data)).mul(100)

In [None]:
# The 'Comments' and 'Cleaning' columns add little value to this analysis,
# so both are removed before continuing.
data = data.drop(columns=['Comments', 'Cleaning'])
data.info()


In [None]:
# Replace negative values in the GHI, DNI and DHI columns with NaN.
# Series.where keeps the entries that satisfy the condition and puts NaN everywhere
# else (existing NaN stay NaN), without a Python-level lambda call for every row.
for irradiance_col in ['GHI', 'DNI', 'DHI']:
    data[irradiance_col] = data[irradiance_col].where(data[irradiance_col] >= 0)

# Check that no negative GHI values are left: the printed frame should be empty
print("Negative Values in GHI replaced with NaN:")
print(data[data['GHI'] < 0])


In [None]:
# Preview the first rows after the negative GHI/DNI/DHI values were replaced
data.head()

In [None]:
# Count the missing values per column now that negative readings have become NaN
missing_values_after_replacement = data.isna().sum()
print(
    "\nMissing Values after replacing negative values with NaN:",
    missing_values_after_replacement,
    sep="\n",
)

In [None]:
# Convert the measurement columns to numeric dtypes; unparseable entries become NaN.
# 'Timestamp' is left out on purpose: pd.to_numeric(..., errors='coerce') turns every
# date string into NaN, and the dropna() further down would then remove every row.
measurement_cols = [col for col in data.columns if col != 'Timestamp']
data[measurement_cols] = data[measurement_cols].apply(pd.to_numeric, errors='coerce')

# Count the negative readings still present in the measurement columns
negative_mask = data[measurement_cols] < 0
negative_count = negative_mask.sum().sum()
print(f"Negative values found: {negative_count}")

# Mark those readings as missing (NaN) rather than writing the string 'a' into the
# frame: a string placeholder turns the columns into object dtype, is not removed
# by dropna(), and breaks mean()/std()/corr() in the analysis below.
# NOTE(review): this treats every negative measurement as invalid - confirm that is
# intended for columns such as the temperatures.
data[measurement_cols] = data[measurement_cols].mask(negative_mask)


In [None]:

# Preview the first rows after the numeric-conversion step
data.head()

In [None]:
from pydoc import replace

# Handling missing values: several strategies are possible (imputation, removal, ...).
# For demonstration purposes, drop every row that contains a missing value and
# renumber the index so it is contiguous again.
data = data.dropna().reset_index(drop=True)

# Verify that no missing values remain.
# data.info() writes its report itself and returns None, so it must not be wrapped
# in print() - that would add a stray "None" line to the output.
print("\nAfter Data Cleaning:")
data.info()

In [None]:

# Preview the first rows after the rows with missing values were dropped
data.head()

In [None]:
# Use 'Timestamp' (parsed to datetime) as the frame's index when the column is present
if 'Timestamp' not in data.columns:
    print("Timestamp column does not exist in the dataset.")
else:
    parsed_timestamps = pd.to_datetime(data['Timestamp'])
    data = data.assign(Timestamp=parsed_timestamps).set_index('Timestamp')

# Summary statistics for every column
summary_stats = data.describe()

# Data quality check: missing values per column, and the number of values per
# column lying more than three standard deviations away from that column's mean
missing_values = data.isna().sum()
abs_deviation = (data - data.mean()).abs()
outlier_mask = abs_deviation > 3 * data.std()
outliers = data.where(outlier_mask).count()

# Time Series Analysis
# set_index('Timestamp') earlier in this cell moves 'Timestamp' out of the columns
# and into the index, so testing `'Timestamp' in data.columns` here is always False
# and the plot would never be drawn. Checking for a datetime index instead detects
# that the timestamp conversion succeeded.
if isinstance(data.index, pd.DatetimeIndex):
    fig, ax = plt.subplots(figsize=(12, 6))
    # Each series is plotted against the datetime index
    for column in ['GHI', 'DNI', 'DHI', 'Tamb']:
        ax.plot(data[column], label=column)
    ax.legend()
    ax.set_title('Solar Radiation and Temperature Over Time')
    ax.set_xlabel('Timestamp')
    ax.set_ylabel('Value')
    plt.show()
else:
    print("Cannot perform time series analysis. Timestamp column does not exist in the dataset.")

# Correlation analysis: pairwise Pearson correlation between the columns
correlation_matrix = data.corr(method='pearson')

# Wind Analysis
# Plot wind speed, gust speed and wind-speed standard deviation over time.
# All three columns are required: checking only 'WS' would raise a KeyError on
# 'WSgust' or 'WSstdev' instead of printing the explanatory message.
wind_series = {
    'WS': 'Wind Speed',
    'WSgust': 'Wind Gust Speed',
    'WSstdev': 'Wind Speed Std Dev',
}
if set(wind_series).issubset(data.columns):
    fig, ax = plt.subplots(figsize=(12, 6))
    for column, label in wind_series.items():
        ax.plot(data[column], label=label)
    ax.legend()
    ax.set_title('Wind Speed Over Time')
    ax.set_xlabel('Timestamp')
    ax.set_ylabel('Speed (m/s)')
    plt.show()
else:
    print("Cannot perform wind analysis. Wind speed columns do not exist in the dataset.")

# Temperature Analysis
# Overlay both module temperatures on the ambient temperature
temperature_series = {
    'Tamb': 'Ambient Temperature',
    'TModA': 'Module A Temperature',
    'TModB': 'Module B Temperature',
}
if not set(temperature_series).issubset(data.columns):
    print("Cannot perform temperature analysis. Temperature columns do not exist in the dataset.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    for column, label in temperature_series.items():
        ax.plot(data[column], label=label)
    ax.legend()
    ax.set_title('Temperature Comparison')
    ax.set_xlabel('Timestamp')
    ax.set_ylabel('Temperature (°C)')
    plt.show()

# Histograms: one panel per column, drawn on a single figure
data.hist(figsize=(12, 10))
plt.gcf().tight_layout()
plt.show()

# Box plots of the irradiance and temperature columns on a shared axis
boxplot_columns = ['GHI', 'DNI', 'DHI', 'Tamb', 'TModA', 'TModB']
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=data[boxplot_columns], ax=ax)
ax.set_title('Box Plot of Solar Radiation and Temperature')
ax.set_ylabel('Value')
plt.show()

# Scatter plot of GHI against ambient temperature
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=data, x='GHI', y='Tamb', ax=ax)
ax.set_title('GHI vs. Ambient Temperature')
ax.set_xlabel('GHI (W/m²)')
ax.set_ylabel('Temperature (°C)')
plt.show()

# Data Cleaning (if necessary)
# Handle missing values or outliers based on analysis

# Count the missing entries of the 'Comments' column when the column is available.
# NOTE(review): 'Comments' is dropped in an earlier cleaning cell, so this normally
# takes the message branch.
if 'Comments' not in data.columns:
    print("Comments column does not exist in the dataset.")
else:
    comments_null_count = data['Comments'].isna().sum()
