In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from scipy import stats
import logging
from typing import List, Dict, Optional, Tuple

In [2]:
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

In [3]:
train_df = pd.read_csv("./input/train_dataset.csv")
test_df = pd.read_csv("./input/test_dataset.csv")
station_info_df = pd.read_csv("./input/station_info.csv")
sample_submission_df = pd.read_csv("./input/submission_sample.csv")

In [4]:
def extract_time_feature_bases(df: pd.DataFrame) -> List[str]:
    time_cols = [col for col in df.columns if re.match(r".+_\d{1,2}$", col)]
    return sorted({col.rsplit("_", 1)[0] for col in time_cols})

# Resolving missing values

In [5]:
def convert_special_values_to_nan(df, special_values=[-9999, -999, -99, 9999]):
    new_df = df.copy()
    numeric_cols = new_df.select_dtypes(include=np.number).columns
    for value in special_values:
        count = (new_df[numeric_cols] == value).sum().sum()
        if count > 0:
            new_df[numeric_cols] = new_df[numeric_cols].replace(value, np.nan)
    return new_df

In [6]:
def smart_median_imputation(df, feature_cols, group_cols=['station']):
    result = df.copy()
    valid_features = [col for col in feature_cols if col in result.columns and result[col].isna().any()]

    for feature in valid_features:
        for i in range(len(group_cols), 0, -1):
            grp = group_cols[:i]
            if all(g in result.columns for g in grp):
                result[feature] = result.groupby(grp, observed=False)[feature].transform(lambda x: x.fillna(x.median()) if x.notna().any() else x)

        if result[feature].isna().any():
            global_median = result[feature].median()
            fallback = 0 if pd.isna(global_median) else global_median
            result[feature] = result[feature].fillna(fallback)
            if pd.isna(global_median) and result[feature].isna().any():
                print(f"Warning: Could not impute column {feature} with median, filled with 0.")

    return result


In [7]:
def domain_specific_imputation(df: pd.DataFrame, feature_prefix: str) -> pd.DataFrame:
    df = df.copy()

    is_changma = df['month'].isin([6, 7])

    if feature_prefix == 'wind_direction':
        cols = [f'{feature_prefix}_{h}' for h in range(24) if f'{feature_prefix}_{h}' in df.columns]
        for col in cols:
            mode_val = df.groupby('station')[col].transform(lambda x: x.mode().iloc[0] if not x.mode().empty else 0)
            df[col] = df[col].fillna(mode_val).fillna(0)

    elif feature_prefix == 'precipitation':
        cols = [f'{feature_prefix}_{h}' for h in range(24) if f'{feature_prefix}_{h}' in df.columns]
        for col in cols:
            med = df[is_changma].groupby(['station', 'month'])[col].transform('median')
            df.loc[is_changma, col] = df.loc[is_changma, col].fillna(med)
            df.loc[~is_changma, col] = df.loc[~is_changma, col].fillna(0)

    return df

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

def missing_values_pipeline(df: pd.DataFrame,
                            n_neighbors: int = 5,
                            max_interpolation_gap: int = 8) -> pd.DataFrame:
    df_imputed = df.copy()

    # 0. Date parsing & index setting
    df_imputed['date'] = df_imputed['date'].astype(str)
    mask_mmdd = df_imputed['date'].str.count('-') == 1
    df_imputed.loc[mask_mmdd, 'date'] = '2025-' + df_imputed.loc[mask_mmdd, 'date']
    df_imputed['date'] = pd.to_datetime(
        df_imputed['date'], format='%Y-%m-%d', errors='coerce'
    )
    df_imputed = df_imputed[df_imputed['date'].notna()]
    df_imputed.set_index('date', inplace=True)

    # Temporal features
    df_imputed['month'] = df_imputed.index.month
    df_imputed['day_of_year'] = df_imputed.index.dayofyear

    # 1. Special values to NaN
    df_imputed = convert_special_values_to_nan(df_imputed)

    # 2. Numeric columns selection
    numeric = df_imputed.select_dtypes(include=np.number).columns.tolist()
    exclude = ['id','station','station_name','latitude','longitude','elev','target']
    cols_to_impute = [c for c in numeric if c not in exclude]

    # 3. Station-wise linear + time interpolation
    df_imputed[cols_to_impute] = (
        df_imputed
        .groupby('station_name')[cols_to_impute]
        .transform(lambda x: x.interpolate(
            method='linear', limit=max_interpolation_gap, limit_direction='both'
        ))
    )
    df_imputed[cols_to_impute] = df_imputed[cols_to_impute].interpolate(
        method='time', limit=max_interpolation_gap, limit_direction='both'
    )
    print("Step 1-2: Linear + time interpolation complete.")

    # 4. Domain-specific imputation
    df_imputed = domain_specific_imputation(df_imputed, 'wind_direction')
    df_imputed = domain_specific_imputation(df_imputed, 'precipitation')
    print("Step 3: Domain-specific imputation complete.")

        # 5. KNN Imputation (skip all-NaN columns)
    need_knn = [c for c in cols_to_impute if df_imputed[c].isna().any()]
    # Exclude columns that are entirely NaN
    all_nan = [c for c in need_knn if df_imputed[c].notna().sum() == 0]
    need_knn = [c for c in need_knn if c not in all_nan]

    if need_knn:
        print(f"Step 4: Applying KNN Imputation to {len(need_knn)} columns (excluded {len(all_nan)} all-NaN cols).")
        # Scale before KNN
        scaler = StandardScaler()
        scaled = scaler.fit_transform(df_imputed[need_knn])
        df_scaled = pd.DataFrame(scaled, index=df_imputed.index, columns=need_knn)
        # Apply KNNImputer
        knn = KNNImputer(n_neighbors=n_neighbors, weights='distance')
        imputed_array = knn.fit_transform(df_scaled)
        # Inverse scale and assign
        df_imputed.loc[:, need_knn] = scaler.inverse_transform(imputed_array)
        print("Step 4: KNN Imputation complete.")
    else:
        print("Step 4: No columns require KNN Imputation.")

    # 6. Fallback median imputation
    need_med = [c for c in cols_to_impute if df_imputed[c].isna().any()]
    if need_med:
        print(f"Step 5: Applying fallback median imputation to {len(need_med)} columns.")
        df_imputed = smart_median_imputation(
            df_imputed, need_med, group_cols=['station_name']
        )
        df_imputed[need_med] = df_imputed[need_med].fillna(0)
        print("Step 5: Fallback median imputation complete.")
    else:
        print("Step 5: No columns require fallback median imputation.")

    # 7. Time-slot temperature imputation
    slots = ['morning','afternoon','evening','night']
    for slot in slots:
        col = f"temp_{slot}_avg"
        if col in df_imputed.columns:
            df_imputed[col] = (
                df_imputed
                .groupby('station_name')[col]
                .transform(lambda x: x.interpolate(
                    method='time', limit=3, limit_direction='both'
                ))
            )
            df_imputed[col] = df_imputed[col].fillna(method='ffill').fillna(method='bfill')
    print("Step 6: Time-slot temperature imputation complete.")

    # 8. Dew point & humidity specialized imputation
    for col in ['dew_point_avg','humidity_avg']:
        if col in df_imputed.columns:
            df_imputed[col] = (
                df_imputed
                .groupby(['station_name','month'])[col]
                .transform(lambda x: x.fillna(x.median()))
            )
            df_imputed[col] = df_imputed[col].interpolate(
                method='time', limit=3, limit_direction='both'
            )
    print("Step 7: Dew point & humidity imputation complete.")

    # 9. Cleanup and reset index
    df_imputed = df_imputed.reset_index().drop(columns=['day_of_year'], errors='ignore')
    
    return df_imputed



In [9]:
train_df_imputed = missing_values_pipeline(train_df)
test_df_imputed = missing_values_pipeline(test_df)

Step 1-2: Linear + time interpolation complete.
Step 3: Domain-specific imputation complete.
Step 4: Applying KNN Imputation to 77 columns (excluded 8 all-NaN cols).
Step 4: KNN Imputation complete.
Step 5: Applying fallback median imputation to 8 columns.
Step 5: Fallback median imputation complete.
Step 6: Time-slot temperature imputation complete.
Step 7: Dew point & humidity imputation complete.


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


Step 1-2: Linear + time interpolation complete.
Step 3: Domain-specific imputation complete.
Step 4: Applying KNN Imputation to 24 columns (excluded 9 all-NaN cols).
Step 4: KNN Imputation complete.
Step 5: Applying fallback median imputation to 9 columns.
Step 5: Fallback median imputation complete.
Step 6: Time-slot temperature imputation complete.
Step 7: Dew point & humidity imputation complete.


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)


In [10]:
train_df_imputed.head()

Unnamed: 0,date,id,station,station_name,cloud_cover_0,cloud_cover_1,cloud_cover_10,cloud_cover_11,cloud_cover_12,cloud_cover_13,...,wind_speed_3,wind_speed_4,wind_speed_5,wind_speed_6,wind_speed_7,wind_speed_8,wind_speed_9,climatology_temp,target,month
0,2025-01-01,0,98,동두천,0.0,0.0,9.0,0.0,3.0,3.0,...,0.6,0.3,0.7,0.6,0.7,0.8,0.1,-2.707143,-3.992857,1
1,2025-01-02,1,98,동두천,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.0,1.1,1.3,0.5,0.9,0.4,-3.646429,-1.653571,1
2,2025-01-03,2,98,동두천,0.0,0.0,0.0,0.0,0.0,0.0,...,1.5,0.8,0.8,0.9,1.0,1.1,0.1,-2.694643,-0.005357,1
3,2025-01-04,3,98,동두천,0.0,0.0,2.0,0.0,0.0,1.0,...,0.3,0.5,0.2,0.5,1.3,0.5,0.2,-2.501786,-0.898214,1
4,2025-01-05,4,98,동두천,0.0,0.0,0.0,0.0,0.0,0.0,...,1.1,1.6,1.4,1.8,0.5,1.1,0.6,-2.625,-1.775,1


In [11]:
train_df_imputed.isna().sum().sum() == test_df_imputed.isna().sum().sum() == 0

np.True_

In [12]:
print(train_df_imputed.shape, test_df_imputed.shape)

(13120, 343) (3001, 342)


In [13]:
train_df_imputed.to_csv("./input/processed/train_df_imputed.csv", index=False)
test_df_imputed.to_csv("./input/processed/test_df_imputed.csv", index=False)