In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

def preprocess_taiwan_air_quality_data(file_path):
    # Read the CSV file
    df = pd.read_csv(file_path, encoding='utf-8')
    
    # Rename columns for consistency
    df.columns = ['station', 'date', 'parameter'] + list(range(24))
    
    # Melt the dataframe to convert hourly columns to rows
    df_melted = pd.melt(df, id_vars=['station', 'date', 'parameter'], 
                        var_name='hour', value_name='value')
    
    # Combine date and hour
    df_melted['datetime'] = pd.to_datetime(df_melted['date']) + pd.to_timedelta(df_melted['hour'].astype(int), unit='h')
    
    # Remove unnecessary columns
    df_melted = df_melted.drop(['date', 'hour'], axis=1)
    
    # Handle invalid values
    invalid_values = ['#', '*', 'x', 'A', 'NR', '']
    for val in invalid_values:
        df_melted.loc[df_melted['value'] == val, 'value'] = np.nan
    
    # Convert value column to float
    df_melted['value'] = df_melted['value'].astype(float)
    
    # Pivot the dataframe to have parameters as columns
    df_pivot = df_melted.pivot_table(index=['station', 'datetime'], 
                                     columns='parameter', 
                                     values='value').reset_index()
    
    # Handle missing values
    numeric_columns = df_pivot.select_dtypes(include=[np.number]).columns
    imputer = SimpleImputer(strategy='mean')
    df_pivot[numeric_columns] = imputer.fit_transform(df_pivot[numeric_columns])
    
    # Create additional time-based features
    df_pivot['year'] = df_pivot['datetime'].dt.year
    df_pivot['month'] = df_pivot['datetime'].dt.month
    df_pivot['day'] = df_pivot['datetime'].dt.day
    df_pivot['hour'] = df_pivot['datetime'].dt.hour
    df_pivot['day_of_week'] = df_pivot['datetime'].dt.dayofweek
    df_pivot['is_weekend'] = df_pivot['day_of_week'].isin([5, 6]).astype(int)
    
    # Convert wind direction to radians for circular analysis
    df_pivot['WD_HR_RAD'] = np.radians(df_pivot['WD_HR'])
    df_pivot['WIND_DIREC_RAD'] = np.radians(df_pivot['WIND_DIREC'])
    
    return df_pivot

# Usage example:
processed_df = preprocess_taiwan_air_quality_data('./air_2023/三義_2023.csv')
print(processed_df.head())
print(processed_df.columns)



ValueError: Length mismatch: Expected axis has 1 elements, new values have 27 elements