In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm.auto import tqdm

In [32]:
train.columns

Index(['year', 'month', 'day', 'time', 'minute', 'stn_id', 'ws10_deg',
       'ws10_ms', 'ta', 're', 'hm', 'sun10', 'ts', 'vis1', 'class', 'datetime',
       'season', 'ftt', 'ett', 'ws10_deg_diff', 'ws10_ms_diff', 'ta_diff',
       're_diff', 'hm_diff', 'sun10_diff', 'ts_diff', 'vis1_diff'],
      dtype='object')

In [2]:
DATA = Path("../../data")

In [3]:
train = pd.read_csv(DATA / "fog_train.csv")
test = pd.read_csv(DATA / "fog_test.csv")

In [4]:
del train["Unnamed: 0"]
del test["Unnamed: 0"]

In [5]:
# column 명에서 파일명 제외

train.columns = train.columns.str.replace("fog_train.", "")
test.columns = test.columns.str.replace("fog_test.", "")

In [6]:
# J 년도 11월 3일 18시 20분 EC지역 없음
new_data = dict(zip(train.columns, ["J", 11, 3, 18, 20, "EC", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]))
fill_df = pd.DataFrame(new_data, index=[0])
train = pd.concat([train, fill_df], ignore_index=True)

In [7]:
import datetime

def create_datetime(data):
    if data[f"year"] == 'I' :
        return datetime.datetime(year=2020, month=data[f'month'], day=data[f'day'], 
                                 hour = data[f'time'], minute=data[f'minute'])
    elif data[f"year"] == 'J' :
        return datetime.datetime(year=2022, month=data[f'month'], day=data[f'day'], 
                                 hour = data[f'time'], minute=data[f'minute'])
    elif data[f"year"] == 'K' :
        return datetime.datetime(year=2024, month=data[f'month'], day=data[f'day'], 
                                 hour = data[f'time'], minute=data[f'minute'])
    elif data[f"year"] == 'L' :
        return datetime.datetime(year=2026, month=data[f'month'], day=data[f'day'], 
                                 hour = data[f'time'], minute=data[f'minute'])
    else :
        return np.nan

In [8]:
train["datetime"] = train.apply(create_datetime, axis=1)
test["datetime"] = test.apply(create_datetime, axis=1)

In [9]:
train = train.sort_values(by=["stn_id", "datetime"]).reset_index(drop=True)

In [11]:
# 결측값 처리

null_list = [-99, -99., -99.9, -999]
train[train.isin(null_list)] = np.nan
test[test.isin(null_list)] = np.nan

In [12]:
# vis1(시정 거리)는 시정계에 따라 최대 관측 거리가 달라짐 이에 따라 20000 이상인 지점을 20000으로 보정하는 작업을 거침

train["vis1"] = train["vis1"].apply(lambda x : 20000 if x > 20000 else x)
# test에는 vis1이 없음

In [13]:
def make_diff(data, mode='train'):
    if mode == 'train':
        target = ["ws10_deg", "ws10_ms", "ta", "re", "hm", "sun10", "ts", "vis1"]
    elif mode == 'test':
        target = ["ws10_deg", "ws10_ms", "ta", "re", "hm", "sun10", "ts"]
    else :
        raise
    
    points = data["stn_id"].unique() # 각 지점별 처리를 위한 지점 저장
    concated_df = pd.DataFrame()
    for point in tqdm(points): # 각 지점별 처리
        target_df = data[data["stn_id"]==f"{point}"].copy().reset_index(drop=True)
        for col in target:
            target_df[f"{col}_diff"] = target_df[f"{col}"].diff()
            target_df.loc[0, f"{col}_diff"] = 0.0 # 첫 차분값 0.0으로 설정
        concated_df = pd.concat([concated_df, target_df], axis=0).reset_index(drop=True)
    return concated_df

In [14]:
def make_season(month):
    if month >= 3 and month <= 5:
        return 'spring'
    
    elif month >= 6 and month <= 8:
        return 'summer'
    
    elif month >= 9 and month <= 11:
        return 'autumn'
    
    elif month < 3 or month > 11:
        return 'winter'
    else:
        return np.nan

In [15]:
def make_four_time_type(time):
    if 0 <= time < 6:
        return "dawn"
    elif 6 <= time < 12:
        return "morning"
    elif 12 <= time < 18:
        return "afternoon"
    elif 18 <= time < 23:
        return "night"
    else:
        return np.nan

In [16]:
def make_eight_time_type(time):
    if 0 <= time < 3:
        return "early dawn"
    elif 3 <= time < 6:
        return "late dawn"
    elif 6 <= time < 9:
        return "early morning"
    elif 9 <= time < 12:
        return "late morning"
    elif 12 <= time < 15:
        return "early afternoon"
    elif 15 <= time < 18:
        return "late afternoon"
    elif 18 <= time < 21:
        return "early night"
    elif 21<= time <=23:
        return "late night"
    else:
        return np.nan

In [17]:
train["season"] = train["month"].apply(make_season)
test["season"] = test["month"].apply(make_season)

In [18]:
train["ftt"] = train["time"].apply(make_four_time_type)
train["ett"] = train["time"].apply(make_eight_time_type)
test["ftt"] = test["time"].apply(make_four_time_type)
test["ett"] = test["time"].apply(make_eight_time_type)

In [19]:
train = make_diff(data=train, mode='train')
test = make_diff(data=test, mode='test')

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [20]:
# vis1 이상치 결측 처리 (이상치 기준은 diff(이전값과 현재값의 차분)이 2000(2km)이상인 것) : 3156459개 중 373315개 약 전체 데이터의 11% 정도

train.loc[train["vis1_diff"].abs() >= 2000, "vis1"] = np.nan

In [21]:
# ta 이상치 결측 처리 (이상치 기준은 diff(이전값과 현재값의 차분)이 3.0 이상인 것) : 3156459개 중 150개 약 전체 데이터의 0.004% 정도 -> 상식적으로 10분 평균 기온 변화량이 3.0은 일어나지 않을 것이라 판단하 결측 처리를 진행함

train.loc[train["ta_diff"].abs() >= 3.0, "ta"] = np.nan

In [22]:
# hm 이상치 결측 처리 (이상치 가준은 0.0 인것을 이상으로 보고 이상치 처리) : 1건 존재

train.loc[train["hm"]==0.0, "hm"] = np.nan

In [24]:
# sun10 이상치 결측 처리 (이상치 기준은 21시~00시(late night) , 00시~03시(early dawn) 인것은 새벽이기 때문에 일조량이 없을 것이라고 생각해 모두 0으로 처리함, 03시~06시(late dawn)는 여름을 제외하고 0으로 처리), 여름에는 일출이 5시 부터 시작 즉 3시, 4시는 0으로 해도 무방
train.loc[(train["ett"]=='late night') | (train["ett"]=='early dawn'), "sun10"] = 0
train.loc[(train["time"]=='late dawn') & (train["season"]!='summer'), "sun10"] = 0
train.loc[(train["time"].isin([3, 4])) & (train["season"]=='summer'), "sun10"] = 0

In [None]:
# ts

In [9]:
train['지점번호1'], train['지점번호2'] = train['지점번호'].str.slice(0, 1), train['지점번호'].str.slice(1, 2)
test['지점번호1'], test['지점번호2'] = test['지점번호'].str.slice(0, 1), test['지점번호'].str.slice(1, 2)

In [10]:
# 시정거리 20000 이상을 outliar로 보고 20000으로 보정 (기상청 시정거리 데이터에 근거하면 시정거리 20km 이상은 20 이상으로 표기함)
train["10분평균시정"] = train["10분평균시정"].apply(lambda x: 20000 if x >= 20000 else x)

In [11]:
train["NaN유무"] = train.isna().any(axis=1).astype(int)

In [12]:
test["NaN유무"] = test.iloc[:, :-2].isna().any(axis=1).astype(int)

In [13]:
train[train["NaN유무"]==1]

Unnamed: 0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정,시정구간,년월일시분,지점번호1,지점번호2,NaN유무
31,I,1,1,5,20,AA,,,-5.0,0.0,55.6,0.00,-2.2,20000.0,4.0,2020-01-01 05:20:00,A,A,1
32,I,1,1,5,30,AA,,,-5.0,0.0,56.2,0.00,-2.2,20000.0,4.0,2020-01-01 05:30:00,A,A,1
33,I,1,1,5,40,AA,,,-5.0,0.0,56.5,0.00,-2.1,18219.0,4.0,2020-01-01 05:40:00,A,A,1
34,I,1,1,5,50,AA,,,-4.9,0.0,57.3,0.00,-2.1,17846.0,4.0,2020-01-01 05:50:00,A,A,1
35,I,1,1,6,0,AA,,,-4.8,0.0,57.5,0.00,-2.1,20000.0,4.0,2020-01-01 06:00:00,A,A,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3155927,K,12,28,7,10,EC,307.9,4.3,2.0,0.0,62.6,0.00,-1.4,,,2022-12-28 07:10:00,E,C,1
3155962,K,12,28,13,0,EC,294.5,6.1,6.7,0.0,41.3,0.32,15.2,,,2022-12-28 13:00:00,E,C,1
3156101,K,12,29,12,10,EC,16.3,3.5,4.7,0.0,43.2,0.34,14.6,,,2022-12-29 12:10:00,E,C,1
3156136,K,12,29,18,0,EC,314.4,8.8,3.5,0.0,42.0,0.00,1.6,,,2022-12-29 18:00:00,E,C,1


In [14]:
test[test["NaN유무"]==1]

Unnamed: 0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,시정구간,년월일시분,지점번호1,지점번호2,NaN유무
0,L,1,1,0,0,AI,329.5,0.6,-2.5,0.0,87.5,0.0,-1.6,,2024-01-01 00:00:00,A,I,1
1,L,1,1,0,10,AI,321.8,1.2,-2.5,0.0,88.2,0.0,-1.6,,2024-01-01 00:10:00,A,I,1
2,L,1,1,0,20,AI,0.4,0.4,-2.5,0.0,88.6,0.0,-1.7,,2024-01-01 00:20:00,A,I,1
3,L,1,1,0,30,AI,323.6,0.7,-2.6,0.0,88.7,0.0,-1.6,,2024-01-01 00:30:00,A,I,1
4,L,1,1,0,40,AI,208.4,0.2,-2.6,0.0,89.0,0.0,-1.6,,2024-01-01 00:40:00,A,I,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262795,L,12,31,23,10,ED,338.8,6.1,4.5,0.0,71.3,0.0,1.9,,2024-12-31 23:10:00,E,D,1
262796,L,12,31,23,20,ED,335.2,5.8,4.6,0.0,71.4,0.0,1.9,,2024-12-31 23:20:00,E,D,1
262797,L,12,31,23,30,ED,337.1,5.3,4.5,0.0,72.1,0.0,1.8,,2024-12-31 23:30:00,E,D,1
262798,L,12,31,23,40,ED,322.0,3.7,4.3,0.0,72.5,0.0,1.4,,2024-12-31 23:40:00,E,D,1


In [15]:
train

Unnamed: 0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정,시정구간,년월일시분,지점번호1,지점번호2,NaN유무
0,I,1,1,0,10,AA,0.0,0.0,-6.4,0.0,38.9,0.0,-2.8,20000.0,4.0,2020-01-01 00:10:00,A,A,0
1,I,1,1,0,20,AA,0.0,0.0,-6.3,0.0,37.9,0.0,-2.7,20000.0,4.0,2020-01-01 00:20:00,A,A,0
2,I,1,1,0,30,AA,0.0,0.0,-6.3,0.0,40.0,0.0,-2.6,20000.0,4.0,2020-01-01 00:30:00,A,A,0
3,I,1,1,0,40,AA,27.8,0.4,-6.2,0.0,39.5,0.0,-2.6,20000.0,4.0,2020-01-01 00:40:00,A,A,0
4,I,1,1,0,50,AA,59.7,0.5,-6.1,0.0,39.8,0.0,-2.5,20000.0,4.0,2020-01-01 00:50:00,A,A,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156455,K,12,31,23,10,EC,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,20000.0,4.0,2022-12-31 23:10:00,E,C,0
3156456,K,12,31,23,20,EC,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,20000.0,4.0,2022-12-31 23:20:00,E,C,0
3156457,K,12,31,23,30,EC,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,20000.0,4.0,2022-12-31 23:30:00,E,C,0
3156458,K,12,31,23,40,EC,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,20000.0,4.0,2022-12-31 23:40:00,E,C,0


In [16]:
test

Unnamed: 0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,시정구간,년월일시분,지점번호1,지점번호2,NaN유무
0,L,1,1,0,0,AI,329.5,0.6,-2.5,0.0,87.5,0.0,-1.6,,2024-01-01 00:00:00,A,I,1
1,L,1,1,0,10,AI,321.8,1.2,-2.5,0.0,88.2,0.0,-1.6,,2024-01-01 00:10:00,A,I,1
2,L,1,1,0,20,AI,0.4,0.4,-2.5,0.0,88.6,0.0,-1.7,,2024-01-01 00:20:00,A,I,1
3,L,1,1,0,30,AI,323.6,0.7,-2.6,0.0,88.7,0.0,-1.6,,2024-01-01 00:30:00,A,I,1
4,L,1,1,0,40,AI,208.4,0.2,-2.6,0.0,89.0,0.0,-1.6,,2024-01-01 00:40:00,A,I,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262795,L,12,31,23,10,ED,338.8,6.1,4.5,0.0,71.3,0.0,1.9,,2024-12-31 23:10:00,E,D,1
262796,L,12,31,23,20,ED,335.2,5.8,4.6,0.0,71.4,0.0,1.9,,2024-12-31 23:20:00,E,D,1
262797,L,12,31,23,30,ED,337.1,5.3,4.5,0.0,72.1,0.0,1.8,,2024-12-31 23:30:00,E,D,1
262798,L,12,31,23,40,ED,322.0,3.7,4.3,0.0,72.5,0.0,1.4,,2024-12-31 23:40:00,E,D,1


In [17]:
train = train.set_index("년월일시분")
test = test.set_index("년월일시분")

In [18]:
train

Unnamed: 0_level_0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정,시정구간,지점번호1,지점번호2,NaN유무
년월일시분,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-01-01 00:10:00,I,1,1,0,10,AA,0.0,0.0,-6.4,0.0,38.9,0.0,-2.8,20000.0,4.0,A,A,0
2020-01-01 00:20:00,I,1,1,0,20,AA,0.0,0.0,-6.3,0.0,37.9,0.0,-2.7,20000.0,4.0,A,A,0
2020-01-01 00:30:00,I,1,1,0,30,AA,0.0,0.0,-6.3,0.0,40.0,0.0,-2.6,20000.0,4.0,A,A,0
2020-01-01 00:40:00,I,1,1,0,40,AA,27.8,0.4,-6.2,0.0,39.5,0.0,-2.6,20000.0,4.0,A,A,0
2020-01-01 00:50:00,I,1,1,0,50,AA,59.7,0.5,-6.1,0.0,39.8,0.0,-2.5,20000.0,4.0,A,A,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 23:10:00,K,12,31,23,10,EC,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,20000.0,4.0,E,C,0
2022-12-31 23:20:00,K,12,31,23,20,EC,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,20000.0,4.0,E,C,0
2022-12-31 23:30:00,K,12,31,23,30,EC,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,20000.0,4.0,E,C,0
2022-12-31 23:40:00,K,12,31,23,40,EC,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,20000.0,4.0,E,C,0


In [19]:
test

Unnamed: 0_level_0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,시정구간,지점번호1,지점번호2,NaN유무
년월일시분,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2024-01-01 00:00:00,L,1,1,0,0,AI,329.5,0.6,-2.5,0.0,87.5,0.0,-1.6,,A,I,1
2024-01-01 00:10:00,L,1,1,0,10,AI,321.8,1.2,-2.5,0.0,88.2,0.0,-1.6,,A,I,1
2024-01-01 00:20:00,L,1,1,0,20,AI,0.4,0.4,-2.5,0.0,88.6,0.0,-1.7,,A,I,1
2024-01-01 00:30:00,L,1,1,0,30,AI,323.6,0.7,-2.6,0.0,88.7,0.0,-1.6,,A,I,1
2024-01-01 00:40:00,L,1,1,0,40,AI,208.4,0.2,-2.6,0.0,89.0,0.0,-1.6,,A,I,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31 23:10:00,L,12,31,23,10,ED,338.8,6.1,4.5,0.0,71.3,0.0,1.9,,E,D,1
2024-12-31 23:20:00,L,12,31,23,20,ED,335.2,5.8,4.6,0.0,71.4,0.0,1.9,,E,D,1
2024-12-31 23:30:00,L,12,31,23,30,ED,337.1,5.3,4.5,0.0,72.1,0.0,1.8,,E,D,1
2024-12-31 23:40:00,L,12,31,23,40,ED,322.0,3.7,4.3,0.0,72.5,0.0,1.4,,E,D,1


In [20]:
points = train["지점번호1"].unique()
points

array(['A', 'B', 'C', 'D', 'E'], dtype=object)

In [21]:
train_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
                '10분일사량합', '10분평균지면온도', '10분평균시정']

In [22]:
# KNN Imputate : 선형성을 잘 반영하지 못하는 것 같아 폐기

# from sklearn.impute import KNNImputer
# from sklearn.preprocessing import RobustScaler
# from tqdm.auto import tqdm

# train_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
#                 '10분일사량합', '10분평균지면온도', '10분평균시정']
# test_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
#                 '10분일사량합', '10분평균지면온도']
# store_columns = ['년월일시분', '년도', '월', '일', '시간', '분', '지점번호', '지점번호1', '지점번호2', 'NaN유무', '시정구간']

# train_impute_list = []
# test_impute_list = []
# n = len(points)

# impute_iterator = tqdm(enumerate(points), total=n, desc=f"validation")
# for i, point in impute_iterator:
#     train_select_data = train.query(f"지점번호1=='{point}'")[train_fill_columns]
#     test_select_data = test.query(f"지점번호1=='{point}'")[test_fill_columns]
    
#     train_store_data = train.query(f"지점번호1=='{point}'")[store_columns].reset_index(drop=True)
#     test_store_data = test.query(f"지점번호1=='{point}'")[store_columns].reset_index(drop=True)
    
#     # scailing
#     train_scaler = RobustScaler()
#     train_scaling_data = train_scaler.fit_transform(train_select_data)
#     train_scaling_df = pd.DataFrame(train_scaling_data, columns=train_fill_columns)
    
#     test_scaler = RobustScaler()
#     test_scaler.fit(train_select_data[test_fill_columns])
#     test_scaling_data = test_scaler.transform(test_select_data)
#     test_scaling_df = pd.DataFrame(test_scaling_data, columns=test_fill_columns)
    
#     # imputer
#     train_imputer = KNNImputer(n_neighbors=6) # 1시간
#     train_filled_data = train_imputer.fit_transform(train_scaling_df)
#     train_filled_data = train_scaler.inverse_transform(train_filled_data)
#     train_filled_df = pd.DataFrame(train_filled_data, columns=train_fill_columns)
#     train_impute_list.append(pd.concat([train_store_data, train_filled_df], axis=1, join='outer'))
    
#     test_imputer = KNNImputer(n_neighbors=6)
#     test_imputer.fit(test_scaling_df[test_fill_columns])
#     test_filled_data = test_imputer.transform(test_scaling_df)
#     test_filled_data = test_scaler.inverse_transform(test_filled_data)
#     test_filled_df = pd.DataFrame(test_filled_data, columns=test_fill_columns)
#     test_impute_list.append(pd.concat([test_store_data, test_filled_df], axis=1, join='outer'))
    
#     if i == 0:
#         continue
#     else:
#         train_impute_list[0] = pd.concat([train_impute_list[0], train_impute_list[i]], axis=0).reset_index(drop=True)
#         test_impute_list[0] = pd.concat([test_impute_list[0], test_impute_list[i]], axis=0).reset_index(drop=True)

In [23]:
# Time-based linear interpolation

# from tqdm.auto import tqdm

# train_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
#                 '10분일사량합', '10분평균지면온도', '10분평균시정']
# test_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
#                 '10분일사량합', '10분평균지면온도']
# store_columns = ['년도', '월', '일', '시간', '분', '지점번호', '지점번호1', '지점번호2', 'NaN유무', '시정구간']

# train_impute_list = []
# test_impute_list = []
# n = len(points)

# impute_iterator = tqdm(enumerate(points), total=n, desc=f"validation")
# for i, point in impute_iterator:
#     train_select_data = train.query(f"지점번호1=='{point}'")[train_fill_columns]
#     test_select_data = test.query(f"지점번호1=='{point}'")[test_fill_columns]
    
#     train_store_data = train.query(f"지점번호1=='{point}'")[store_columns].reset_index(drop=True)
#     test_store_data = test.query(f"지점번호1=='{point}'")[store_columns].reset_index(drop=True)
    
#     # imputer
#     train_filled_data = train_select_data.interpolate(method='time')
#     train_filled_df = pd.DataFrame(train_filled_data, columns=train_fill_columns).reset_index()
#     train_impute_list.append(pd.concat([train_store_data, train_filled_df], axis=1, join='outer'))
    
#     test_filled_data = test_select_data.interpolate(method='time')
#     test_filled_df = pd.DataFrame(test_filled_data, columns=test_fill_columns).reset_index()
#     test_impute_list.append(pd.concat([test_store_data, test_filled_df], axis=1, join='outer'))
    
#     if i == 0:
#         continue
#     else:
#         train_impute_list[0] = pd.concat([train_impute_list[0], train_impute_list[i]], axis=0).reset_index(drop=True)
#         test_impute_list[0] = pd.concat([test_impute_list[0], test_impute_list[i]], axis=0).reset_index(drop=True)

In [None]:
from tqdm.auto import tqdm
from statsmodels.tsa.holtwinters import ExponentialSmoothing

train_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
                '10분일사량합', '10분평균지면온도', '10분평균시정']
test_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
                '10분일사량합', '10분평균지면온도']
store_columns = ['년도', '월', '일', '시간', '분', '지점번호', '지점번호1', '지점번호2', 'NaN유무', '시정구간']

train_impute_list = []
test_impute_list = []
n = len(points)

impute_iterator = tqdm(enumerate(points), total=n, desc=f"validation")
for i, point in impute_iterator:
    train_select_data = train.query(f"지점번호1=='{point}'")[train_fill_columns]
    test_select_data = test.query(f"지점번호1=='{point}'")[test_fill_columns]
    
    train_store_data = train.query(f"지점번호1=='{point}'")[store_columns].reset_index(drop=True)
    test_store_data = test.query(f"지점번호1=='{point}'")[store_columns].reset_index(drop=True)
    
    # imputer
    for col in train_fill_columns:
        imputer = ExponentialSmoothing(train_select_data[col], trend='add', seasonal='add', seasonal_periods= 6 * 24) # 6 * 24 = 1일 단위 주기
        imputer_fit = imputer.fit()
        train_filled_data = imputer_fit.fittedvalues
        train_filled_df = pd.DataFrame(train_filled_data, columns=col).reset_index()
        train_impute_list.append(pd.concat([train_store_data, train_filled_df], axis=1, join='outer'))
        
        if col != "10분평균시정":
            test_filled_data = imputer_fit.
            test_filled_df = pd.DataFrame(test_filled_data, columns=test_fill_columns).reset_index()
            test_impute_list.append(pd.concat([test_store_data, test_filled_df], axis=1, join='outer'))
    
    if i == 0:
        continue
    else:
        train_impute_list[0] = pd.concat([train_impute_list[0], train_impute_list[i]], axis=0).reset_index(drop=True)
        test_impute_list[0] = pd.concat([test_impute_list[0], test_impute_list[i]], axis=0).reset_index(drop=True)

In [167]:
train_impute_list[0]

Unnamed: 0,년도,월,일,시간,분,지점번호,지점번호1,지점번호2,NaN유무,시정구간,년월일시분,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정
0,I,1,1,0,10,AA,A,A,0,4.0,2020-01-01 00:10:00,0.0,0.0,-6.4,0.0,38.9,0.0,-2.8,20000.0
1,I,1,1,0,20,AA,A,A,0,4.0,2020-01-01 00:20:00,0.0,0.0,-6.3,0.0,37.9,0.0,-2.7,20000.0
2,I,1,1,0,30,AA,A,A,0,4.0,2020-01-01 00:30:00,0.0,0.0,-6.3,0.0,40.0,0.0,-2.6,20000.0
3,I,1,1,0,40,AA,A,A,0,4.0,2020-01-01 00:40:00,27.8,0.4,-6.2,0.0,39.5,0.0,-2.6,20000.0
4,I,1,1,0,50,AA,A,A,0,4.0,2020-01-01 00:50:00,59.7,0.5,-6.1,0.0,39.8,0.0,-2.5,20000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156455,K,12,31,23,10,EC,E,C,0,4.0,2022-12-31 23:10:00,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,20000.0
3156456,K,12,31,23,20,EC,E,C,0,4.0,2022-12-31 23:20:00,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,20000.0
3156457,K,12,31,23,30,EC,E,C,0,4.0,2022-12-31 23:30:00,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,20000.0
3156458,K,12,31,23,40,EC,E,C,0,4.0,2022-12-31 23:40:00,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,20000.0


In [168]:
train_impute_list[0][train_impute_list[0]["시정구간"].isna()]

Unnamed: 0,년도,월,일,시간,분,지점번호,지점번호1,지점번호2,NaN유무,시정구간,년월일시분,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정
7645,I,2,23,2,20,AA,A,A,1,,2020-02-23 02:20:00,272.0,1.3,-0.9,0.0,61.1,0.00,-1.0,20000.0
7780,I,2,24,0,50,AA,A,A,1,,2020-02-24 00:50:00,140.7,1.2,2.1,0.0,59.5,0.00,0.8,20000.0
16053,I,4,21,11,40,AA,A,A,1,,2020-04-21 11:40:00,271.7,7.0,9.2,0.0,42.8,0.32,16.7,20000.0
16054,I,4,21,11,50,AA,A,A,1,,2020-04-21 11:50:00,276.4,6.0,9.3,0.0,43.4,0.26,19.0,11956.0
16055,I,4,21,12,0,AA,A,A,1,,2020-04-21 12:00:00,268.4,5.9,9.5,0.0,41.7,0.48,21.1,16681.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3155927,K,12,28,7,10,EC,E,C,1,,2022-12-28 07:10:00,307.9,4.3,2.0,0.0,62.6,0.00,-1.4,20000.0
3155962,K,12,28,13,0,EC,E,C,1,,2022-12-28 13:00:00,294.5,6.1,6.7,0.0,41.3,0.32,15.2,20000.0
3156101,K,12,29,12,10,EC,E,C,1,,2022-12-29 12:10:00,16.3,3.5,4.7,0.0,43.2,0.34,14.6,20000.0
3156136,K,12,29,18,0,EC,E,C,1,,2022-12-29 18:00:00,314.4,8.8,3.5,0.0,42.0,0.00,1.6,20000.0


In [169]:
train_imputed_data = train_impute_list[0].copy()
test_imputed_data = test_impute_list[0].copy()

In [170]:
train_imputed_data

Unnamed: 0,년도,월,일,시간,분,지점번호,지점번호1,지점번호2,NaN유무,시정구간,년월일시분,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정
0,I,1,1,0,10,AA,A,A,0,4.0,2020-01-01 00:10:00,0.0,0.0,-6.4,0.0,38.9,0.0,-2.8,20000.0
1,I,1,1,0,20,AA,A,A,0,4.0,2020-01-01 00:20:00,0.0,0.0,-6.3,0.0,37.9,0.0,-2.7,20000.0
2,I,1,1,0,30,AA,A,A,0,4.0,2020-01-01 00:30:00,0.0,0.0,-6.3,0.0,40.0,0.0,-2.6,20000.0
3,I,1,1,0,40,AA,A,A,0,4.0,2020-01-01 00:40:00,27.8,0.4,-6.2,0.0,39.5,0.0,-2.6,20000.0
4,I,1,1,0,50,AA,A,A,0,4.0,2020-01-01 00:50:00,59.7,0.5,-6.1,0.0,39.8,0.0,-2.5,20000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156455,K,12,31,23,10,EC,E,C,0,4.0,2022-12-31 23:10:00,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,20000.0
3156456,K,12,31,23,20,EC,E,C,0,4.0,2022-12-31 23:20:00,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,20000.0
3156457,K,12,31,23,30,EC,E,C,0,4.0,2022-12-31 23:30:00,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,20000.0
3156458,K,12,31,23,40,EC,E,C,0,4.0,2022-12-31 23:40:00,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,20000.0


In [177]:
train

Unnamed: 0_level_0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정,시정구간,지점번호1,지점번호2,NaN유무
년월일시분,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-01-01 00:10:00,I,1,1,0,10,AA,0.0,0.0,-6.4,0.0,38.9,0.0,-2.8,20000.0,4.0,A,A,0
2020-01-01 00:20:00,I,1,1,0,20,AA,0.0,0.0,-6.3,0.0,37.9,0.0,-2.7,20000.0,4.0,A,A,0
2020-01-01 00:30:00,I,1,1,0,30,AA,0.0,0.0,-6.3,0.0,40.0,0.0,-2.6,20000.0,4.0,A,A,0
2020-01-01 00:40:00,I,1,1,0,40,AA,27.8,0.4,-6.2,0.0,39.5,0.0,-2.6,20000.0,4.0,A,A,0
2020-01-01 00:50:00,I,1,1,0,50,AA,59.7,0.5,-6.1,0.0,39.8,0.0,-2.5,20000.0,4.0,A,A,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-31 23:10:00,K,12,31,23,10,EC,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,20000.0,4.0,E,C,0
2022-12-31 23:20:00,K,12,31,23,20,EC,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,20000.0,4.0,E,C,0
2022-12-31 23:30:00,K,12,31,23,30,EC,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,20000.0,4.0,E,C,0
2022-12-31 23:40:00,K,12,31,23,40,EC,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,20000.0,4.0,E,C,0


In [171]:
test_imputed_data

Unnamed: 0,년도,월,일,시간,분,지점번호,지점번호1,지점번호2,NaN유무,시정구간,년월일시분,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도
0,L,1,1,0,0,AI,A,I,1,,2024-01-01 00:00:00,329.5,0.6,-2.5,0.0,87.5,0.0,-1.6
1,L,1,1,0,10,AI,A,I,1,,2024-01-01 00:10:00,321.8,1.2,-2.5,0.0,88.2,0.0,-1.6
2,L,1,1,0,20,AI,A,I,1,,2024-01-01 00:20:00,0.4,0.4,-2.5,0.0,88.6,0.0,-1.7
3,L,1,1,0,30,AI,A,I,1,,2024-01-01 00:30:00,323.6,0.7,-2.6,0.0,88.7,0.0,-1.6
4,L,1,1,0,40,AI,A,I,1,,2024-01-01 00:40:00,208.4,0.2,-2.6,0.0,89.0,0.0,-1.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262795,L,12,31,23,10,ED,E,D,1,,2024-12-31 23:10:00,338.8,6.1,4.5,0.0,71.3,0.0,1.9
262796,L,12,31,23,20,ED,E,D,1,,2024-12-31 23:20:00,335.2,5.8,4.6,0.0,71.4,0.0,1.9
262797,L,12,31,23,30,ED,E,D,1,,2024-12-31 23:30:00,337.1,5.3,4.5,0.0,72.1,0.0,1.8
262798,L,12,31,23,40,ED,E,D,1,,2024-12-31 23:40:00,322.0,3.7,4.3,0.0,72.5,0.0,1.4


In [178]:
test

Unnamed: 0_level_0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,시정구간,지점번호1,지점번호2,NaN유무
년월일시분,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2024-01-01 00:00:00,L,1,1,0,0,AI,329.5,0.6,-2.5,0.0,87.5,0.0,-1.6,,A,I,1
2024-01-01 00:10:00,L,1,1,0,10,AI,321.8,1.2,-2.5,0.0,88.2,0.0,-1.6,,A,I,1
2024-01-01 00:20:00,L,1,1,0,20,AI,0.4,0.4,-2.5,0.0,88.6,0.0,-1.7,,A,I,1
2024-01-01 00:30:00,L,1,1,0,30,AI,323.6,0.7,-2.6,0.0,88.7,0.0,-1.6,,A,I,1
2024-01-01 00:40:00,L,1,1,0,40,AI,208.4,0.2,-2.6,0.0,89.0,0.0,-1.6,,A,I,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-12-31 23:10:00,L,12,31,23,10,ED,338.8,6.1,4.5,0.0,71.3,0.0,1.9,,E,D,1
2024-12-31 23:20:00,L,12,31,23,20,ED,335.2,5.8,4.6,0.0,71.4,0.0,1.9,,E,D,1
2024-12-31 23:30:00,L,12,31,23,30,ED,337.1,5.3,4.5,0.0,72.1,0.0,1.8,,E,D,1
2024-12-31 23:40:00,L,12,31,23,40,ED,322.0,3.7,4.3,0.0,72.5,0.0,1.4,,E,D,1


In [172]:
def make_class(vis):
    if 0 < vis and vis < 200 :
        return 1
    elif 200 <= vis and vis < 500 :
        return 2
    elif 500 <= vis and vis < 1000 :
        return 3
    elif 1000 <= vis :
        return 4
    else :
        return np.nan

In [173]:
train_imputed_data["시정구간"] = train_imputed_data["10분평균시정"].apply(make_class)

In [174]:
train_imputed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3156460 entries, 0 to 3156459
Data columns (total 19 columns):
 #   Column     Dtype         
---  ------     -----         
 0   년도         object        
 1   월          int64         
 2   일          int64         
 3   시간         int64         
 4   분          int64         
 5   지점번호       object        
 6   지점번호1      object        
 7   지점번호2      object        
 8   NaN유무      int64         
 9   시정구간       int64         
 10  년월일시분      datetime64[ns]
 11  10분평균풍향    float64       
 12  10분평균풍속    float64       
 13  10분평균기온    float64       
 14  강수유무       float64       
 15  10분평균상대습도  float64       
 16  10분일사량합    float64       
 17  10분평균지면온도  float64       
 18  10분평균시정    float64       
dtypes: datetime64[ns](1), float64(8), int64(6), object(4)
memory usage: 457.6+ MB


In [175]:
test_imputed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262800 entries, 0 to 262799
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   년도         262800 non-null  object        
 1   월          262800 non-null  int64         
 2   일          262800 non-null  int64         
 3   시간         262800 non-null  int64         
 4   분          262800 non-null  int64         
 5   지점번호       262800 non-null  object        
 6   지점번호1      262800 non-null  object        
 7   지점번호2      262800 non-null  object        
 8   NaN유무      262800 non-null  int64         
 9   시정구간       0 non-null       float64       
 10  년월일시분      262800 non-null  datetime64[ns]
 11  10분평균풍향    262800 non-null  float64       
 12  10분평균풍속    262800 non-null  float64       
 13  10분평균기온    262800 non-null  float64       
 14  강수유무       262800 non-null  float64       
 15  10분평균상대습도  262800 non-null  float64       
 16  10분일사량합    262800 no

In [176]:
train_imputed_data.to_csv("fog_train_time-interpolation.csv", index=False)
test_imputed_data.to_csv("fog_test_time-interpolation.csv", index=False)