In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

def clean_weather_data(file_path: Path, new_value_col_name: str, county: str) -> pd.DataFrame:
    """
    Load one raw county weather export and return it as a tidy monthly table.

    The file is read with its first line consumed as the header row and must
    parse into exactly two columns. The first two data rows are treated as
    leftover metadata/header text and discarded. In the remaining rows the
    first column is split into a year (first four characters) and a month
    (everything after them), and the second column is converted to numbers.

    Args:
        file_path: Path of the input CSV file.
        new_value_col_name: Name given to the measurement column in the result
            (e.g. 'Avg_Temp').
        county: County label written to every row of the 'County' column.

    Returns:
        A DataFrame with columns ['County', 'Year', 'Month', new_value_col_name].
        'Year' and 'Month' are integers and the measurement column is numeric;
        entries that cannot be parsed as numbers become NaN and a warning is
        printed. An empty DataFrame is returned (after printing an error) when
        the file cannot be read or does not have exactly two columns.

    Raises:
        ValueError: If a date stamp cannot be converted to integer year/month.
    """
    try:
        # Default header=0: the first line of the file becomes the column labels.
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        # Handle case where the file path is incorrect
        print(f"Error: File not found at path: {file_path}")
        return pd.DataFrame()
    except Exception as e:
        # Handle other read errors (best effort: report and return an empty frame)
        print(f"Error reading file: {e}")
        return pd.DataFrame()

    expected_columns = ['Date', 'Value']

    # The positional rename below is only meaningful for a two-column file.
    if len(df.columns) != len(expected_columns):
        print(f"Error: Expected {len(expected_columns)} columns, but found {len(df.columns)}.")
        print("Please check the file structure or the 'new_column_names' list.")
        return pd.DataFrame()

    df.columns = expected_columns

    # Drop the two leading metadata rows and renumber the remaining rows from 0.
    df = df.iloc[2:].reset_index(drop=True)

    # Normalise the stamps to stripped text so slicing is safe even when the
    # column carries stray whitespace or was not parsed as strings.
    date_text = df['Date'].astype(str).str.strip()

    # The metadata rows force pandas to read 'Value' as text, so convert it
    # explicitly; otherwise downstream arithmetic would operate on strings.
    raw_values = df['Value']
    values = pd.to_numeric(raw_values, errors='coerce')
    unparsed_count = int(values.isna().sum()) - int(raw_values.isna().sum())
    if unparsed_count > 0:
        print(f"Warning: {unparsed_count} non-numeric value(s) in {file_path} were set to NaN.")

    return pd.DataFrame({
        'County': county,
        'Year': date_text.str[:4].astype(int),
        'Month': date_text.str[4:].astype(int),
        new_value_col_name: values,
    })

In [4]:
# 1-ALAMEDA County Data Cleaning
# Each raw export holds one monthly series for the county.
path = Path('../data/raw/weather/1-ALAMEDA/data.csv')
df_avg_temp = clean_weather_data(path, 'Avg_Temp', 'ALAMEDA')

path = Path('../data/raw/weather/1-ALAMEDA/data (1).csv')
df_max_temp = clean_weather_data(path, 'Max_Temp', 'ALAMEDA')

path = Path('../data/raw/weather/1-ALAMEDA/data (2).csv')
df_min_temp = clean_weather_data(path, 'Min_Temp', 'ALAMEDA')

path = Path('../data/raw/weather/1-ALAMEDA/data (3).csv')
df_precipitation = clean_weather_data(path, 'Precipitation', 'ALAMEDA')

# ALAMEDA County Data Integration
# Join on the (County, Year, Month) key rather than assigning columns by row
# position, so a series with missing or extra months cannot be silently
# attached to the wrong rows. validate= raises if a key appears twice.
merge_keys = ['County', 'Year', 'Month']
for df_series in (df_max_temp, df_min_temp, df_precipitation):
    df_avg_temp = df_avg_temp.merge(df_series, on=merge_keys, how='left', validate='one_to_one')

# Make sure the output folder exists before writing the combined table.
output_path = Path('../data/processed/weather/1-ALAMEDA.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_avg_temp.to_csv(output_path, index=False)

df_avg_temp.head()

In [6]:
# 2-ALPINE County Data Cleaning
# Each raw export holds one monthly series for the county.
path = Path('../data/raw/weather/2-ALPINE/data.csv')
df_avg_temp = clean_weather_data(path, 'Avg_Temp', 'ALPINE')

path = Path('../data/raw/weather/2-ALPINE/data (1).csv')
df_max_temp = clean_weather_data(path, 'Max_Temp', 'ALPINE')

path = Path('../data/raw/weather/2-ALPINE/data (2).csv')
df_min_temp = clean_weather_data(path, 'Min_Temp', 'ALPINE')

path = Path('../data/raw/weather/2-ALPINE/data (3).csv')
df_precipitation = clean_weather_data(path, 'Precipitation', 'ALPINE')

# ALPINE County Data Integration
# Join on the (County, Year, Month) key rather than assigning columns by row
# position, so a series with missing or extra months cannot be silently
# attached to the wrong rows. validate= raises if a key appears twice.
merge_keys = ['County', 'Year', 'Month']
for df_series in (df_max_temp, df_min_temp, df_precipitation):
    df_avg_temp = df_avg_temp.merge(df_series, on=merge_keys, how='left', validate='one_to_one')

# Make sure the output folder exists before writing the combined table.
output_path = Path('../data/processed/weather/2-ALPINE.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_avg_temp.to_csv(output_path, index=False)

df_avg_temp.head()

Unnamed: 0,County,Year,Month,Avg_Temp,Max_Temp,Min_Temp,Precipitation
0,ALPINE,2000,1,31.7,38.4,25.0,10.06
1,ALPINE,2000,2,32.7,40.9,24.5,9.57
2,ALPINE,2000,3,35.2,46.4,24.0,1.67
3,ALPINE,2000,4,42.1,54.0,30.1,1.78
4,ALPINE,2000,5,47.7,58.8,36.5,2.35


In [8]:
# 3-AMADOR County Data Cleaning
# Each raw export holds one monthly series for the county.
path = Path('../data/raw/weather/3-AMADOR/data.csv')
df_avg_temp = clean_weather_data(path, 'Avg_Temp', 'AMADOR')

path = Path('../data/raw/weather/3-AMADOR/data (1).csv')
df_max_temp = clean_weather_data(path, 'Max_Temp', 'AMADOR')

path = Path('../data/raw/weather/3-AMADOR/data (2).csv')
df_min_temp = clean_weather_data(path, 'Min_Temp', 'AMADOR')

path = Path('../data/raw/weather/3-AMADOR/data (3).csv')
df_precipitation = clean_weather_data(path, 'Precipitation', 'AMADOR')

# AMADOR County Data Integration
# Join on the (County, Year, Month) key rather than assigning columns by row
# position, so a series with missing or extra months cannot be silently
# attached to the wrong rows. validate= raises if a key appears twice.
merge_keys = ['County', 'Year', 'Month']
for df_series in (df_max_temp, df_min_temp, df_precipitation):
    df_avg_temp = df_avg_temp.merge(df_series, on=merge_keys, how='left', validate='one_to_one')

# Make sure the output folder exists before writing the combined table.
output_path = Path('../data/processed/weather/3-AMADOR.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_avg_temp.to_csv(output_path, index=False)

df_avg_temp.head()

Unnamed: 0,County,Year,Month,Avg_Temp,Max_Temp,Min_Temp,Precipitation
0,AMADOR,2000,1,44.3,51.7,36.9,11.28
1,AMADOR,2000,2,46.2,54.2,38.1,12.87
2,AMADOR,2000,3,49.5,60.6,38.4,2.04
3,AMADOR,2000,4,56.0,68.7,43.2,2.42
4,AMADOR,2000,5,60.7,73.6,47.9,3.5


In [9]:
# 4-BUTTE County Data Cleaning
# Each raw export holds one monthly series for the county.
path = Path('../data/raw/weather/4-BUTTE/data.csv')
df_avg_temp = clean_weather_data(path, 'Avg_Temp', 'BUTTE')

path = Path('../data/raw/weather/4-BUTTE/data (1).csv')
df_max_temp = clean_weather_data(path, 'Max_Temp', 'BUTTE')

path = Path('../data/raw/weather/4-BUTTE/data (2).csv')
df_min_temp = clean_weather_data(path, 'Min_Temp', 'BUTTE')

path = Path('../data/raw/weather/4-BUTTE/data (3).csv')
df_precipitation = clean_weather_data(path, 'Precipitation', 'BUTTE')

# BUTTE County Data Integration
# Join on the (County, Year, Month) key rather than assigning columns by row
# position, so a series with missing or extra months cannot be silently
# attached to the wrong rows. validate= raises if a key appears twice.
merge_keys = ['County', 'Year', 'Month']
for df_series in (df_max_temp, df_min_temp, df_precipitation):
    df_avg_temp = df_avg_temp.merge(df_series, on=merge_keys, how='left', validate='one_to_one')

# Make sure the output folder exists before writing the combined table.
output_path = Path('../data/processed/weather/4-BUTTE.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_avg_temp.to_csv(output_path, index=False)

df_avg_temp.head()

Unnamed: 0,County,Year,Month,Avg_Temp,Max_Temp,Min_Temp,Precipitation
0,BUTTE,2000,1,45.5,53.0,38.0,9.81
1,BUTTE,2000,2,47.6,55.0,40.2,16.67
2,BUTTE,2000,3,51.7,63.0,40.4,4.81
3,BUTTE,2000,4,58.5,71.6,45.4,3.05
4,BUTTE,2000,5,63.4,76.1,50.7,1.76


In [None]:
# 4-BUTTE County Data Cleaning
# NOTE(review): this unexecuted cell repeats the 4-BUTTE cell directly above it
# (same inputs, same output file). It looks like a copied template for the next
# county - confirm, then either point it at the next county's folder or delete it.
path = Path('../data/raw/weather/4-BUTTE/data.csv')
df_avg_temp = clean_weather_data(path, 'Avg_Temp', 'BUTTE')

path = Path('../data/raw/weather/4-BUTTE/data (1).csv')
df_max_temp = clean_weather_data(path, 'Max_Temp', 'BUTTE')

path = Path('../data/raw/weather/4-BUTTE/data (2).csv')
df_min_temp = clean_weather_data(path, 'Min_Temp', 'BUTTE')

path = Path('../data/raw/weather/4-BUTTE/data (3).csv')
df_precipitation = clean_weather_data(path, 'Precipitation', 'BUTTE')

# BUTTE County Data Integration
# Join on the (County, Year, Month) key rather than assigning columns by row
# position, so a series with missing or extra months cannot be silently
# attached to the wrong rows. validate= raises if a key appears twice.
merge_keys = ['County', 'Year', 'Month']
for df_series in (df_max_temp, df_min_temp, df_precipitation):
    df_avg_temp = df_avg_temp.merge(df_series, on=merge_keys, how='left', validate='one_to_one')

# Make sure the output folder exists before writing the combined table.
output_path = Path('../data/processed/weather/4-BUTTE.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_avg_temp.to_csv(output_path, index=False)

df_avg_temp.head()