In [14]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from scipy import stats
import logging
from typing import List, Dict, Optional, Tuple

In [15]:
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

In [16]:
train_df = pd.read_csv("./input/train_dataset.csv")
test_df = pd.read_csv("./input/test_dataset.csv")
station_info_df = pd.read_csv("./input/station_info.csv")
sample_submission_df = pd.read_csv("./input/submission_sample.csv")

In [17]:
def extract_time_feature_bases(df: pd.DataFrame) -> List[str]:
    time_cols = [col for col in df.columns if re.match(r".+_\d{1,2}$", col)]
    return sorted({col.rsplit("_", 1)[0] for col in time_cols})

# Resolving missing values

In [18]:
def convert_special_values_to_nan(df, special_values=[-9999, -999, -99, 9999]):
    new_df = df.copy()
    numeric_cols = new_df.select_dtypes(include=np.number).columns
    for value in special_values:
        count = (new_df[numeric_cols] == value).sum().sum()
        if count > 0:
            new_df[numeric_cols] = new_df[numeric_cols].replace(value, np.nan)
    return new_df

In [19]:
def smart_median_imputation(df, feature_cols, group_cols=['station']):
    result = df.copy()
    valid_features = [col for col in feature_cols if col in result.columns and result[col].isna().any()]

    for feature in valid_features:
        for i in range(len(group_cols), 0, -1):
            grp = group_cols[:i]
            if all(g in result.columns for g in grp):
                result[feature] = result.groupby(grp, observed=False)[feature].transform(lambda x: x.fillna(x.median()) if x.notna().any() else x)

        if result[feature].isna().any():
            global_median = result[feature].median()
            fallback = 0 if pd.isna(global_median) else global_median
            result[feature] = result[feature].fillna(fallback)
            if pd.isna(global_median) and result[feature].isna().any():
                print(f"Warning: Could not impute column {feature} with median, filled with 0.")

    return result


In [20]:
def domain_specific_imputation(df: pd.DataFrame, feature_prefix: str) -> pd.DataFrame:
    df = df.copy()

    is_changma = df['month'].isin([6, 7])

    if feature_prefix == 'wind_direction':
        cols = [f'{feature_prefix}_{h}' for h in range(24) if f'{feature_prefix}_{h}' in df.columns]
        for col in cols:
            mode_val = df.groupby('station')[col].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else 0)
            df[col] = df[col].fillna(mode_val).fillna(0)

    elif feature_prefix == 'precipitation':
        cols = [f'{feature_prefix}_{h}' for h in range(24) if f'{feature_prefix}_{h}' in df.columns]
        for col in cols:
            med = df[is_changma].groupby(['station', 'month'])[col].transform('median')
            df.loc[is_changma, col] = df.loc[is_changma, col].fillna(med)
            df.loc[~is_changma, col] = df.loc[~is_changma, col].fillna(0)

    return df

In [21]:
def missing_values_pipeline(df: pd.DataFrame, n_neighbors=5, max_interpolation_gap=8) -> pd.DataFrame:
    df_imputed = df.copy()

    # Add month column, considering leap year
    df_imputed['month'] = df_imputed['date'].str.split('-').str[0].astype(int)
    df_imputed['day'] = df_imputed['date'].str.split('-').str[1].astype(int)
    df_imputed['day_of_year'] = (df_imputed['month'] - 1) * 30 + df_imputed['day']

    # 1. Convert special values
    df_imputed = convert_special_values_to_nan(df_imputed)

    numeric_cols = df_imputed.select_dtypes(include=np.number).columns.tolist()
    cols_to_impute = [col for col in numeric_cols if col not in ['id', 'station', 'date', 'station_name', 'latitude', 'longitude', 'elevation']]

    # 2. Interpolate small gaps within each 24-hour row
    df_imputed[cols_to_impute] = df_imputed[cols_to_impute].interpolate(
        method='linear',
        axis=1,
        limit=max_interpolation_gap,
        limit_direction='both'
    )
    print("Step 2: Row-wise interpolation complete.")

    # 3. Domain-specific imputation
    df_imputed = domain_specific_imputation(df_imputed, 'wind_direction')
    df_imputed = domain_specific_imputation(df_imputed, 'precipitation')
    print("Step 3: Domain-specific imputation complete.")

    # 4. KNN Imputation
    cols_needing_knn = [col for col in cols_to_impute if df_imputed[col].isna().any()]

    if cols_needing_knn:
        print(f"Step 4: Applying KNN Imputation to {len(cols_needing_knn)} columns...")
        # Scale data before KNN
        scaler = StandardScaler()
        # Fit scaler only on columns that will be imputed by KNN
        scaled_values = scaler.fit_transform(df_imputed[cols_needing_knn])
        df_scaled = pd.DataFrame(scaled_values, index=df_imputed.index, columns=cols_needing_knn)

        # Apply KNNImputer
        knn_imputer = KNNImputer(n_neighbors=n_neighbors, weights='distance')
        imputed_scaled_values = knn_imputer.fit_transform(df_scaled)

        imputed_values = scaler.inverse_transform(imputed_scaled_values)

        df_imputed[cols_needing_knn] = imputed_values
        print("Step 4: KNN Imputation complete.")

    else:
        print("Step 4: No columns require KNN Imputation.")

    # 5. Final Fallback Imputation (Median)
    cols_needing_fallback = [col for col in cols_to_impute if df_imputed[col].isna().any()]
    if cols_needing_fallback:
        print(f"Step 5: Applying Fallback Median Imputation to {len(cols_needing_fallback)} columns...")
        df_imputed = smart_median_imputation(df_imputed, cols_needing_fallback, group_cols=['station'])
        print("Step 5: Fallback Median Imputation complete.")

        remaining_nans = df_imputed[cols_to_impute].isna().sum().sum()
        if remaining_nans > 0:
            print(f"Warning: {remaining_nans} NaNs still remain after fallback imputation. Filling with 0.")
            df_imputed[cols_to_impute] = df_imputed[cols_to_impute].fillna(0)
    else:
        print("Step 5: No columns require Fallback Imputation.")


    df_imputed = df_imputed.drop(columns=['day', 'month', 'day_of_year'])

    return df_imputed

In [22]:
train_df_imputed = missing_values_pipeline(train_df)
test_df_imputed = missing_values_pipeline(test_df)

Step 2: Row-wise interpolation complete.
Step 3: Domain-specific imputation complete.
Step 4: Applying KNN Imputation to 95 columns...
Step 4: KNN Imputation complete.
Step 5: No columns require Fallback Imputation.
Step 2: Row-wise interpolation complete.
Step 3: Domain-specific imputation complete.
Step 4: Applying KNN Imputation to 133 columns...
Step 4: KNN Imputation complete.
Step 5: No columns require Fallback Imputation.


In [23]:
train_df_imputed.head()

Unnamed: 0,id,station,station_name,date,cloud_cover_0,cloud_cover_1,cloud_cover_10,cloud_cover_11,cloud_cover_12,cloud_cover_13,...,wind_speed_23,wind_speed_3,wind_speed_4,wind_speed_5,wind_speed_6,wind_speed_7,wind_speed_8,wind_speed_9,climatology_temp,target
0,0,98,동두천,01-01,0.0,0.0,9.0,0.0,3.0,3.0,...,2.3,0.6,0.3,0.7,0.6,0.7,0.8,0.1,-2.707143,-3.992857
1,1,98,동두천,01-02,0.0,0.0,0.0,0.0,0.0,0.0,...,0.7,0.2,0.0,1.1,1.3,0.5,0.9,0.4,-3.646429,-1.653571
2,2,98,동두천,01-03,0.0,0.0,0.0,0.0,0.0,0.0,...,0.4,1.5,0.8,0.8,0.9,1.0,1.1,0.1,-2.694643,-0.005357
3,3,98,동두천,01-04,0.0,0.0,2.0,0.0,0.0,1.0,...,0.9,0.3,0.5,0.2,0.5,1.3,0.5,0.2,-2.501786,-0.898214
4,4,98,동두천,01-05,0.0,0.0,0.0,0.0,0.0,0.0,...,1.4,1.1,1.6,1.4,1.8,0.5,1.1,0.6,-2.625,-1.775


In [24]:
train_df_imputed.isna().sum().sum() == test_df_imputed.isna().sum().sum() == 0

np.True_

In [25]:
print(train_df_imputed.shape, test_df_imputed.shape)

(13132, 342) (3004, 341)


In [26]:
train_df_imputed.to_csv("./input/processed/train_df_imputed.csv", index=False)
test_df_imputed.to_csv("./input/processed/test_df_imputed.csv", index=False)