In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from scipy.stats import linregress

In [None]:
# Load the climate observations from a CSV located next to the notebook
# (the path is relative to the current working directory).
csv_path = 'climate data.csv'
data = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame (head() defaults to five rows).
data.head()

In [None]:
# List the column labels available in the loaded frame.
data.columns

In [None]:
# Report the frame's dimensions as (number of rows, number of columns).
data.shape

In [None]:
# Convert the DATE column to pandas datetimes; later cells use it as the x-axis of the time-series plots.
parsed_dates = pd.to_datetime(data['DATE'])
data['DATE'] = parsed_dates

In [None]:
# Impute gaps in every numeric column with that column's own mean.
numeric_columns = data.select_dtypes(include=[np.number]).columns
numeric_frame = data[numeric_columns]
data[numeric_columns] = numeric_frame.fillna(numeric_frame.mean())

In [None]:
# Impute gaps in every object-dtype column with that column's most frequent value.
categorical_columns = data.select_dtypes(include=[object]).columns
for column in categorical_columns:
    modes = data[column].mode()
    # mode() returns an empty Series when the column has no non-null values;
    # indexing it with [0] would raise, so such columns are left untouched.
    if modes.empty:
        continue
    # When several values tie for most frequent, the first one returned by mode() is used.
    data[column] = data[column].fillna(modes.iloc[0])

In [None]:
# Parse the sunrise/sunset columns as datetimes. Values that cannot be parsed
# become NaT (errors='coerce') and are then filled forward from the closest
# preceding valid entry; leading NaT values have nothing to copy from and stay NaT.
# Series.ffill() is used because fillna(method='ffill') is deprecated in recent pandas.
data['Sunrise'] = pd.to_datetime(data['Sunrise'], errors='coerce').ffill()
data['Sunset'] = pd.to_datetime(data['Sunset'], errors='coerce').ffill()

In [None]:
# Coerce the equipment-change column to datetimes and substitute a 1900-01-01
# placeholder wherever the value is missing or cannot be parsed.
placeholder_date = pd.Timestamp('1900-01-01')
change_dates = pd.to_datetime(data['WindEquipmentChangeDate'], errors='coerce')
data['WindEquipmentChangeDate'] = change_dates.fillna(placeholder_date)

In [None]:
# The preceding cell already converts WindEquipmentChangeDate to datetimes and
# fills its gaps, so repeating that conversion here did no additional work.
# Verify the expected invariant instead of recomputing the column.
assert data['WindEquipmentChangeDate'].notna().all(), \
    "WindEquipmentChangeDate still contains missing values"

In [None]:
# Verify data types: display the dtype of every column after the conversions above,
# so the datetime columns can be checked by eye.
data.dtypes

In [None]:
# Display the first few rows after preprocessing (head() defaults to five rows).
data.head()

In [None]:
# Count the nulls remaining in each column. Earlier cells have already filled the
# numeric and object-dtype columns, so this reports what is left after imputation.
missing_values = data.isna().sum()
print("Missing values per column:\n", missing_values)

In [None]:
# Express the per-column null counts as a percentage of the total row count.
total_rows = len(data)
missing_percentage = data.isna().sum().div(total_rows).mul(100)
print("\nPercentage of missing values per column:\n", missing_percentage)

In [None]:
# Combine the null counts and percentages into a single two-column table.
summary_columns = {
    'Missing Values': missing_values,
    'Percentage (%)': missing_percentage,
}
missing_summary = pd.DataFrame(summary_columns)

print("\nSummary of missing data:\n")
print(missing_summary)

In [None]:
# Write the summary table to disk; index=True keeps the row labels as the first CSV column.
summary_csv_path = "missing_values_summary.csv"
missing_summary.to_csv(summary_csv_path, index=True)

In [None]:
# Flag rows that exactly repeat an earlier row (first occurrences are not flagged) and count them.
duplicate_mask = data.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Total duplicate rows: {duplicate_count}")

In [None]:
# Print the repeated rows themselves, but only when at least one was found.
if duplicate_count > 0:
    duplicate_rows = data.loc[data.duplicated()]
    print("\nDuplicate rows:\n", duplicate_rows)


In [None]:
# Create a 14x8 inch figure for the three subplot cells that follow.
# NOTE(review): with Jupyter's default inline backend a figure is presumably closed
# when its cell finishes, so subplot calls in later cells may land on new
# default-sized figures instead of this one — confirm, and prefer the
# single-cell version of this plot further down.
plt.figure(figsize=(14, 8))

In [None]:
# Top panel (row 1 of 3): daily average dry bulb temperature against date.
temperature_ax = plt.subplot(3, 1, 1)
temperature_ax.plot(data['DATE'], data['DailyAverageDryBulbTemperature'], label='Avg Dry Bulb Temp')
temperature_ax.set_title('Daily Average Dry Bulb Temperature Over Time')
temperature_ax.set_xlabel('Date')
temperature_ax.set_ylabel('Temperature (°C)')
temperature_ax.legend()

In [None]:
# Middle panel (row 2 of 3): daily precipitation against date, drawn in green.
precipitation_ax = plt.subplot(3, 1, 2)
precipitation_ax.plot(data['DATE'], data['DailyPrecipitation'], label='Daily Precipitation', color='green')
precipitation_ax.set_title('Daily Precipitation Over Time')
precipitation_ax.set_xlabel('Date')
precipitation_ax.set_ylabel('Precipitation (mm)')
precipitation_ax.legend()

In [None]:
# Bottom panel (row 3 of 3): daily average wind speed against date, drawn in red.
wind_ax = plt.subplot(3, 1, 3)
wind_ax.plot(data['DATE'], data['DailyAverageWindSpeed'], label='Avg Wind Speed', color='red')
wind_ax.set_title('Daily Average Wind Speed Over Time')
wind_ax.set_xlabel('Date')
wind_ax.set_ylabel('Wind Speed (m/s)')
wind_ax.legend()

# Adjust spacing so titles and axis labels do not overlap, then render the current figure.
plt.tight_layout()
plt.show()

In [None]:
# 1. Line Plots for Time Series Data
# One stacked panel per variable, all sharing the same 14x8 inch figure.
# Each entry: (column, legend label, panel title, y-axis label, extra plot keyword arguments).
# NOTE(review): the units in the axis labels are reproduced as written; confirm
# they match the units actually stored in the dataset.
panel_specs = [
    ('DailyAverageDryBulbTemperature', 'Avg Dry Bulb Temp',
     'Daily Average Dry Bulb Temperature Over Time', 'Temperature (°C)', {}),
    ('DailyPrecipitation', 'Daily Precipitation',
     'Daily Precipitation Over Time', 'Precipitation (mm)', {'color': 'green'}),
    ('DailyAverageWindSpeed', 'Avg Wind Speed',
     'Daily Average Wind Speed Over Time', 'Wind Speed (m/s)', {'color': 'red'}),
]

fig, axes = plt.subplots(3, 1, figsize=(14, 8))
for ax, (column_name, legend_label, panel_title, y_label, plot_kwargs) in zip(axes, panel_specs):
    ax.plot(data['DATE'], data[column_name], label=legend_label, **plot_kwargs)
    ax.set_title(panel_title)
    ax.set_xlabel('Date')
    ax.set_ylabel(y_label)
    ax.legend()

# Adjust spacing so titles and axis labels do not overlap, then render the figure.
fig.tight_layout()
plt.show()

In [None]:
# 4. Scatter Plots for Examining Relationships
# Both figures put daily average dry bulb temperature on the x-axis; points are
# half-transparent so dense regions remain readable.
temperature = data['DailyAverageDryBulbTemperature']

# Temperature vs. daily precipitation (default colour).
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(temperature, data['DailyPrecipitation'], alpha=0.5)
ax.set_title('Daily Average Dry Bulb Temperature vs. Daily Precipitation')
ax.set_xlabel('Daily Average Dry Bulb Temperature (°C)')
ax.set_ylabel('Daily Precipitation (mm)')
plt.show()

# Temperature vs. daily average wind speed (red).
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(temperature, data['DailyAverageWindSpeed'], alpha=0.5, color='red')
ax.set_title('Daily Average Dry Bulb Temperature vs. Daily Average Wind Speed')
ax.set_xlabel('Daily Average Dry Bulb Temperature (°C)')
ax.set_ylabel('Daily Average Wind Speed (m/s)')
plt.show()