In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Task 2 preprocessing script for the Aotizhongxin air-quality CSV.
#
# Steps: load the CSV, build a datetime index, fill missing values, derive
# calendar features, one-hot encode the categorical columns, plot the
# correlation matrix, standardize the numerical features, and report IQR-based
# outlier bounds for PM2.5. The raw-but-cleaned frame is kept in `df`; the
# encoded and scaled frame is kept in `df_processed`.
file_path = 'PRSA_Data_Aotizhongxin_20130301-20170228.csv' # Or the full path


def get_season(date):
    """Return the season name (approximation) for a timestamp.

    Parameters
    ----------
    date : object exposing a ``month`` attribute (e.g. ``pandas.Timestamp``).

    Returns
    -------
    str
        'Winter' for months 12, 1, 2; 'Spring' for 3, 4, 5; 'Summer' for
        6, 7, 8; 'Autumn' for any other month value (9, 10, 11).
    """
    month = date.month
    if month in (12, 1, 2):
        return 'Winter'
    if month in (3, 4, 5):
        return 'Spring'
    if month in (6, 7, 8):
        return 'Summer'
    return 'Autumn'  # 9, 10, 11


try:
    # Load the dataset
    df = pd.read_csv(file_path)
    print("Dataset loaded successfully.\n")

    # Combine year, month, day, hour into a single datetime column.
    # errors='coerce' turns unparseable rows into NaT instead of raising.
    df['datetime'] = pd.to_datetime(df[['year', 'month', 'day', 'hour']], errors='coerce')

    # Rows whose timestamp could not be parsed would end up with a NaT index
    # label, which later breaks the integer cast of the ISO week number.
    # Drop them up front and say so.
    invalid_timestamps = df['datetime'].isna()
    if invalid_timestamps.any():
        print(f"Dropping {int(invalid_timestamps.sum())} rows with invalid timestamps.")
        df = df.loc[~invalid_timestamps].copy()

    df.set_index('datetime', inplace=True)
    # The date parts are now encoded in the index; 'No' and 'station' are
    # dropped as well. errors='ignore' tolerates columns that are absent.
    columns_to_drop = ['year', 'month', 'day', 'hour', 'No', 'station']
    df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

    print("--- Initial DataFrame Head ---")
    print(df.head())
    print("\n--- Initial Missing Values ---")
    print(df.isnull().sum())
    print("\n")

    # --- Task 2: Data Preprocessing and Feature Engineering ---

    # 1. Handle Missing Values
    print("--- 1. Handling Missing Values ---")
    # For pollutant and meteorological numerical features, linear interpolation is a common choice for time series.
    # For PM2.5 (target), interpolation is also reasonable.
    numerical_cols_with_na = df.select_dtypes(include=np.number).isnull().sum()
    numerical_cols_to_interpolate = numerical_cols_with_na[numerical_cols_with_na > 0].index.tolist()

    if numerical_cols_to_interpolate:
        print(f"Interpolating numerical columns: {numerical_cols_to_interpolate}")
        # limit_direction='both' also fills NaNs at the start and end of each column.
        df[numerical_cols_to_interpolate] = df[numerical_cols_to_interpolate].interpolate(
            method='linear', limit_direction='both'
        )
    else:
        print("No numerical columns found needing interpolation.")

    # For categorical 'wd' (wind direction), use forward fill then backward fill.
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    if 'wd' in df.columns and df['wd'].isnull().any():
        print("Filling missing 'wd' using ffill and bfill.")
        df['wd'] = df['wd'].ffill().bfill()
    elif 'wd' not in df.columns:
        print("Warning: 'wd' column not found.")
    else:
        print("'wd' column has no missing values.")


    print("\n--- Missing Values After Handling ---")
    print(df.isnull().sum())
    # If any NaNs remain (e.g., if a whole column was NaN or 'wd' had all NaNs initially),
    # a more robust strategy might be needed, like dropping or more complex imputation.
    # For this dataset, interpolation and ffill/bfill should handle most cases.
    # A final dropna on the target column ('PM2.5') is a possible extra safeguard;
    # it is intentionally not applied here.
    print("\n")


    # 2. Create Time-Based Features
    print("--- 2. Creating Time-Based Features ---")
    df['hour_of_day'] = df.index.hour
    df['day_of_week'] = df.index.dayofweek  # Monday=0, Sunday=6
    df['day_of_year'] = df.index.dayofyear
    df['month'] = df.index.month
    df['year'] = df.index.year # Useful for trends or splitting
    df['week_of_year'] = df.index.isocalendar().week.astype(int)
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Season (approximation), derived from the month of each index timestamp.
    df['season'] = df.index.to_series().apply(get_season)

    print("Time-based features created:")
    print(df[['hour_of_day', 'day_of_week', 'month', 'season', 'is_weekend']].head())
    print("\n")

    # 3. Handle Categorical Features ('wd' and 'season')
    print("--- 3. Handling Categorical Features (One-Hot Encoding) ---")
    # 'wd' (wind direction) and 'season' are categorical
    categorical_features = ['wd', 'season']
    # Work on a copy so that `df` keeps the un-encoded, un-scaled values
    # (the outlier step below relies on that).
    df_processed = df.copy()

    # Check if categorical features exist before trying to encode
    existing_categorical_features = [col for col in categorical_features if col in df_processed.columns]

    if existing_categorical_features:
        print(f"Applying One-Hot Encoding to: {existing_categorical_features}")
        # dummy_na=False because missing values were already handled in step 1.
        df_processed = pd.get_dummies(
            df_processed,
            columns=existing_categorical_features,
            prefix=existing_categorical_features,
            dummy_na=False,
        )
        print("Categorical features one-hot encoded.")
        print("DataFrame columns after one-hot encoding (sample):")
        print(df_processed.filter(regex='wd_|season_').head())
    else:
        print("No specified categorical features found for one-hot encoding.")
    print("\n")


    # 4. Analyse Correlations
    print("--- 4. Analyzing Correlations ---")
    # Only columns with a numeric dtype take part in the correlation matrix:
    # the original numerical features plus the new numerical time features.
    numerical_cols_for_corr = df_processed.select_dtypes(include=np.number).columns

    # It's often useful to see correlations with the target variable specifically
    if 'PM2.5' in numerical_cols_for_corr:
        correlation_matrix = df_processed[numerical_cols_for_corr].corr()
        plt.figure(figsize=(18, 15)) # Adjusted size for more features
        # annot=False keeps the plot readable with this many features.
        sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', fmt=".2f", linewidths=.5)
        plt.title('Correlation Matrix of Numerical Features')
        plt.show()
        print("Displayed correlation matrix heatmap.")

        print("\n--- Top Correlations with PM2.5 ---")
        pm25_correlations = correlation_matrix['PM2.5'].sort_values(ascending=False)
        print(pm25_correlations)
    else:
        print("PM2.5 column not found or not numerical for correlation analysis.")
    print("\n")


    # 5. Normalize or Standardize Features
    print("--- 5. Normalizing/Standardizing Features ---")
    # We will standardize numerical features. This is generally done AFTER splitting data
    # (fit on the training split only); here the scaler is fitted on the whole
    # frame purely as a demonstration.
    # The target 'PM2.5', the one-hot encoded columns and the binary
    # 'is_weekend' flag are excluded from scaling.

    # Re-identify numerical columns from df_processed
    numerical_features_to_scale = df_processed.select_dtypes(include=np.number).columns.tolist()

    # Target usually not scaled with features
    if 'PM2.5' in numerical_features_to_scale:
        numerical_features_to_scale.remove('PM2.5')

    # One-hot columns are recognised by the "<feature>_" prefix get_dummies gave them.
    one_hot_prefixes = tuple(f"{cat}_" for cat in existing_categorical_features)
    one_hot_cols = [col for col in df_processed.columns if col.startswith(one_hot_prefixes)]
    binary_features = ['is_weekend'] + one_hot_cols

    numerical_features_to_scale = [col for col in numerical_features_to_scale if col not in binary_features]

    if numerical_features_to_scale:
        print(f"Numerical features to be standardized: {numerical_features_to_scale}")
        scaler = StandardScaler()
        # Fit and transform
        df_processed[numerical_features_to_scale] = scaler.fit_transform(df_processed[numerical_features_to_scale])
        print("Numerical features standardized.")
        print(df_processed[numerical_features_to_scale].head())
    else:
        print("No numerical features identified for scaling or all are binary/target.")
    print("\n")


    # 6. Outlier Handling (Demonstration for PM2.5)
    print("--- 6. Outlier Handling Demonstration (for PM2.5) ---")
    if 'PM2.5' in df.columns: # Use original df for this demonstration before scaling
        Q1 = df['PM2.5'].quantile(0.25)
        Q3 = df['PM2.5'].quantile(0.75)
        IQR = Q3 - Q1
        # Standard 1.5 * IQR fences around the quartiles.
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        print(f"PM2.5 - Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")
        print(f"PM2.5 - Lower Bound for outliers: {lower_bound}")
        print(f"PM2.5 - Upper Bound for outliers: {upper_bound}")

        outliers = df[(df['PM2.5'] < lower_bound) | (df['PM2.5'] > upper_bound)]
        print(f"Number of potential outliers in PM2.5: {len(outliers)}")
        # In a real scenario, you'd decide whether to cap, remove, or transform these,
        # or if they are genuine extreme values to be kept. Nothing is modified
        # here; the values are only counted.
    else:
        print("PM2.5 column not found for outlier analysis.")
    print("\n")


    print("--- Final Processed DataFrame Head (df_processed) ---")
    print(df_processed.head())
    print("\n--- Final Processed DataFrame Info ---")
    df_processed.info()

    print("\n--- Task 2 Script Finished ---")


except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    print("Please ensure the file path is correct and the CSV file is in the specified location.")
except Exception as e:
    # Top-level boundary of the notebook cell: report the problem instead of
    # raising, so the cell finishes with a readable message.
    print(f"An error occurred: {e}")
    print("Please check your data and script carefully.")

Error: The file 'PRSA_Data_Aotizhongxin_20130301-20170228.csv' was not found.
Please ensure the file path is correct and the CSV file is in the specified location.
