# California Weather Data Exploration

**Dataset**: NOAA GHCN Daily Weather Data (California)  
**Date Range**: 2020 - 2025  
**Source**: NOAA Climate Data Online API

**Objective**:
- Explore California weather station data
- Analyze temperature and precipitation patterns
- Understand weather conditions during the January 2025 fires (Palisades & Eaton)
- Identify key weather features for the fire prediction model


In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# Silence every warning category so library notices do not clutter cell output
warnings.filterwarnings('ignore')

# Pandas display: no cap on displayed columns, row cap of 100
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plot defaults: seaborn white-grid theme and a wide 14x6 default figure
sns.set_style('whitegrid')
plt.rc('figure', figsize=(14, 6))

print("✅ Libraries imported successfully!")


## 1. Load Weather Data


In [None]:
# Both raw CSVs (observations and station metadata) live in the same directory
raw_weather_dir = Path('../data/raw/weather')
weather_file = raw_weather_dir / 'california_weather_2020_2025_combined.csv'
stations_file = raw_weather_dir / 'california_weather_stations.csv'

# Report the path and whether it exists before attempting the read
print(f"Loading weather data from: {weather_file}")
print(f"File exists: {weather_file.exists()}")

# Combined weather observations
weather_df = pd.read_csv(weather_file)
n_weather_records = len(weather_df)
print(f"\n✅ Loaded {n_weather_records:,} weather records!")

# Station metadata
stations_df = pd.read_csv(stations_file)
n_stations = len(stations_df)
print(f"✅ Loaded {n_stations:,} weather stations!")

# Last expression of the cell: the notebook renders the first five rows
print("\nFirst 5 weather records:")
weather_df.head()


## 2. Dataset Overview


In [None]:
# Headline numbers: row count, column count and column names
banner = "=" * 80
print(banner)
print("WEATHER DATASET SUMMARY")
print(banner)
n_records, n_columns = weather_df.shape
print(f"\nTotal records: {n_records:,}")
print(f"Columns: {n_columns}")
print(f"\nColumn names: {weather_df.columns.tolist()}")

# Per-column dtype and missing-value table
print("\n\nData types and missing values:")
null_counts = weather_df.isna().sum()
null_percent = (null_counts / n_records * 100).round(2)
info_df = pd.DataFrame({
    'Column': weather_df.columns,
    'Data Type': weather_df.dtypes,
    'Non-Null': weather_df.count(),
    'Null': null_counts,
    'Null %': null_percent,
})
# Last expression of the cell: the notebook renders the table
info_df


## 3. Weather Data Types & Coverage


In [None]:
# Convert the date column to datetime in place; the cells below call
# datetime methods on it
weather_df['date'] = pd.to_datetime(weather_df['date'])

# Number of records for each datatype code
print("Weather Data Types Available:")
datatype_counts = weather_df['datatype'].value_counts()
print(datatype_counts)

# Bar chart of the per-datatype record counts
fig, ax = plt.subplots(figsize=(10, 5))
datatype_counts.plot(kind='bar', ax=ax, rot=0, color='#4A90E2', alpha=0.7)
ax.set_title('Weather Data Types - Record Count', fontsize=14, fontweight='bold')
ax.set_xlabel('Data Type')
ax.set_ylabel('Number of Records')
ax.grid(True, axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

# Earliest and latest observation date for each datatype code
print("\n\nDate Range by Data Type:")
for code in weather_df['datatype'].unique():
    code_dates = weather_df.loc[weather_df['datatype'] == code, 'date']
    first_day = code_dates.min().date()
    last_day = code_dates.max().date()
    print(f"{code}: {first_day} to {last_day}")


## 4. January 2025 Weather Analysis 🔥


In [None]:
# Filter January 2025 weather (when Palisades & Eaton fires occurred).
# The window is half-open, [2025-01-01, 2025-02-01): an inclusive
# `<= '2025-01-31'` bound compares against midnight and would drop any
# Jan 31 record whose timestamp carries a time-of-day component.
in_jan_2025 = (weather_df['date'] >= '2025-01-01') & (weather_df['date'] < '2025-02-01')
jan_2025 = weather_df[in_jan_2025]
print(f"January 2025 weather records: {len(jan_2025):,}")

# Summary by data type: record count plus mean/min/max of the `value` column
print("\nJanuary 2025 Weather Summary:")
for dtype in jan_2025['datatype'].unique():
    jan_dtype = jan_2025[jan_2025['datatype'] == dtype]
    jan_values = jan_dtype['value']
    print(f"\n{dtype}:")
    print(f"  Records: {len(jan_dtype)}")
    print(f"  Mean value: {jan_values.mean():.2f}")
    print(f"  Min value: {jan_values.min():.2f}")
    print(f"  Max value: {jan_values.max():.2f}")

# Show actual January 2025 data (last expression: rendered by the notebook)
print("\n\nSample of January 2025 data:")
jan_2025.head(10)


## 5. Summary for ML Model


In [None]:
# Final report: dataset size, variables, station coverage and ML readiness.
# Status marks are derived from the loaded data instead of being hard-coded,
# so the report cannot claim coverage that the data does not have.
banner = "=" * 80
print(banner)
print("WEATHER DATA SUMMARY FOR ML MODEL")
print(banner)

print("\n📊 Dataset Size:")
print(f"   Total weather records: {len(weather_df):,}")
print(f"   Date range: {weather_df['date'].min().date()} to {weather_df['date'].max().date()}")
print(f"   Years covered: {weather_df['date'].dt.year.nunique()}")

print("\n🌡️ Weather Variables:")
for dtype in weather_df['datatype'].unique():
    # Count matches on the boolean mask; no filtered DataFrame is built
    count = int((weather_df['datatype'] == dtype).sum())
    print(f"   {dtype}: {count:,} records")

print("\n📍 Geographic Coverage:")
print(f"   Unique weather stations: {weather_df['station'].nunique():,}")
print(f"   Station metadata available: {len(stations_df):,} stations")

print("\n🔥 January 2025 Fire Period:")
print(f"   Weather records for Jan 2025: {len(jan_2025):,}")
# Only report coverage when January 2025 records actually exist
fire_period_mark = "✅" if len(jan_2025) > 0 else "❌"
print(f"   Covers Palisades & Eaton fires: {fire_period_mark}")

print("\n✅ Features Ready for ML Model:")
# NOTE(review): assumes the `datatype` column holds exactly the codes named
# in these labels (TMAX / TMIN / PRCP) - confirm against the dataset.
available_datatypes = set(weather_df['datatype'].unique())
core_features = (
    ("Max Temperature", "TMAX"),
    ("Min Temperature", "TMIN"),
    ("Precipitation", "PRCP"),
)
for label, code in core_features:
    feature_mark = "✅" if code in available_datatypes else "❌"
    print(f"   {feature_mark} {label} ({code})")
print("   ⏳ Wind data (limited availability)")

print("\n📝 Next Steps:")
print("   1. Match weather stations to fire locations")
print("   2. Calculate derived features (temp range, dry days, etc.)")
print("   3. Aggregate weather data to daily/weekly summaries")

print(banner)
