In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [2]:
DATA = Path("../../data")

In [3]:
train = pd.read_csv(DATA / "fog_train_01.csv")
test = pd.read_csv(DATA / "fog_test_01.csv")

In [4]:
train

Unnamed: 0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정,시정구간
0,I,1,1,0,10,AA,0.0,0.0,-6.4,0.0,38.9,0.0,-2.8,20000.0,4.0
1,I,1,1,0,20,AA,0.0,0.0,-6.3,0.0,37.9,0.0,-2.7,20000.0,4.0
2,I,1,1,0,30,AA,0.0,0.0,-6.3,0.0,40.0,0.0,-2.6,20000.0,4.0
3,I,1,1,0,40,AA,27.8,0.4,-6.2,0.0,39.5,0.0,-2.6,20000.0,4.0
4,I,1,1,0,50,AA,59.7,0.5,-6.1,0.0,39.8,0.0,-2.5,20000.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156454,K,12,31,23,10,EC,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,20000.0,4.0
3156455,K,12,31,23,20,EC,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,20000.0,4.0
3156456,K,12,31,23,30,EC,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,20000.0,4.0
3156457,K,12,31,23,40,EC,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,20000.0,4.0


In [5]:
# # I 년도 1월 1일 00시 00분이 없으므로 각 Point 별로 추가
# fill_data = []
# for point in train["지점번호"].unique():
#     new_data = dict(zip(train.columns, ["I", 1, 1, 0, 0, point, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]))
#     fill_data.append(new_data)
    
# J 년도 11월 3일 18시 20분 EC지역 없음
new_data = dict(zip(train.columns, ["J", 11, 3, 18, 20, "EC", np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]))
# fill_data.append(new_data)

# fill_df = pd.DataFrame(fill_data)
fill_df = pd.DataFrame(new_data, index=[0])
concat_df = pd.concat([train, fill_df], ignore_index=True)

In [6]:
import datetime
def create_datetime_minute(data):
    if data["년도"] == 'I' :
        return datetime.datetime(year=2020, month=data['월'], day=data['일'], hour = data["시간"], minute=data["분"])
    elif data["년도"] == 'J' :
        return datetime.datetime(year=2021, month=data['월'], day=data['일'], hour = data["시간"], minute=data["분"])
    elif data["년도"] == 'K' :
        return datetime.datetime(year=2022, month=data['월'], day=data['일'], hour = data["시간"], minute=data["분"])
    elif data["년도"] == 'L' :
        return datetime.datetime(year=2024, month=data['월'], day=data['일'], hour = data["시간"], minute=data["분"])
    else :
        return np.nan

In [7]:
concat_df["년월일시분"] = concat_df.apply(create_datetime_minute, axis=1)
test["년월일시분"] = test.apply(create_datetime_minute, axis=1)

In [8]:
train = concat_df.sort_values(by=["지점번호", "년월일시분"]).reset_index(drop=True)

In [9]:
train['지점번호1'], train['지점번호2'] = train['지점번호'].str.slice(0, 1), train['지점번호'].str.slice(1, 2)
test['지점번호1'], test['지점번호2'] = test['지점번호'].str.slice(0, 1), test['지점번호'].str.slice(1, 2)

In [10]:
# 시정거리 20000 이상을 outliar로 보고 20000으로 보정 (기상청 시정거리 데이터에 근거하면 시정거리 20km 이상은 20 이상으로 표기함)
train["10분평균시정"] = train["10분평균시정"].apply(lambda x: 20000 if x >= 20000 else x)

In [11]:
train["NaN유무"] = train.isna().any(axis=1).astype(int)

In [12]:
test["NaN유무"] = test.iloc[:, :-2].isna().any(axis=1).astype(int)

In [13]:
train[train["NaN유무"]==1]

Unnamed: 0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정,시정구간,년월일시분,지점번호1,지점번호2,NaN유무
31,I,1,1,5,20,AA,,,-5.0,0.0,55.6,0.00,-2.2,20000.0,4.0,2020-01-01 05:20:00,A,A,1
32,I,1,1,5,30,AA,,,-5.0,0.0,56.2,0.00,-2.2,20000.0,4.0,2020-01-01 05:30:00,A,A,1
33,I,1,1,5,40,AA,,,-5.0,0.0,56.5,0.00,-2.1,18219.0,4.0,2020-01-01 05:40:00,A,A,1
34,I,1,1,5,50,AA,,,-4.9,0.0,57.3,0.00,-2.1,17846.0,4.0,2020-01-01 05:50:00,A,A,1
35,I,1,1,6,0,AA,,,-4.8,0.0,57.5,0.00,-2.1,20000.0,4.0,2020-01-01 06:00:00,A,A,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3155927,K,12,28,7,10,EC,307.9,4.3,2.0,0.0,62.6,0.00,-1.4,,,2022-12-28 07:10:00,E,C,1
3155962,K,12,28,13,0,EC,294.5,6.1,6.7,0.0,41.3,0.32,15.2,,,2022-12-28 13:00:00,E,C,1
3156101,K,12,29,12,10,EC,16.3,3.5,4.7,0.0,43.2,0.34,14.6,,,2022-12-29 12:10:00,E,C,1
3156136,K,12,29,18,0,EC,314.4,8.8,3.5,0.0,42.0,0.00,1.6,,,2022-12-29 18:00:00,E,C,1


In [14]:
test[test["NaN유무"]==1]

Unnamed: 0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,시정구간,년월일시분,지점번호1,지점번호2,NaN유무
0,L,1,1,0,0,AI,329.5,0.6,-2.5,0.0,87.5,0.0,-1.6,,2024-01-01 00:00:00,A,I,1
1,L,1,1,0,10,AI,321.8,1.2,-2.5,0.0,88.2,0.0,-1.6,,2024-01-01 00:10:00,A,I,1
2,L,1,1,0,20,AI,0.4,0.4,-2.5,0.0,88.6,0.0,-1.7,,2024-01-01 00:20:00,A,I,1
3,L,1,1,0,30,AI,323.6,0.7,-2.6,0.0,88.7,0.0,-1.6,,2024-01-01 00:30:00,A,I,1
4,L,1,1,0,40,AI,208.4,0.2,-2.6,0.0,89.0,0.0,-1.6,,2024-01-01 00:40:00,A,I,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262795,L,12,31,23,10,ED,338.8,6.1,4.5,0.0,71.3,0.0,1.9,,2024-12-31 23:10:00,E,D,1
262796,L,12,31,23,20,ED,335.2,5.8,4.6,0.0,71.4,0.0,1.9,,2024-12-31 23:20:00,E,D,1
262797,L,12,31,23,30,ED,337.1,5.3,4.5,0.0,72.1,0.0,1.8,,2024-12-31 23:30:00,E,D,1
262798,L,12,31,23,40,ED,322.0,3.7,4.3,0.0,72.5,0.0,1.4,,2024-12-31 23:40:00,E,D,1


In [15]:
train

Unnamed: 0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정,시정구간,년월일시분,지점번호1,지점번호2,NaN유무
0,I,1,1,0,10,AA,0.0,0.0,-6.4,0.0,38.9,0.0,-2.8,20000.0,4.0,2020-01-01 00:10:00,A,A,0
1,I,1,1,0,20,AA,0.0,0.0,-6.3,0.0,37.9,0.0,-2.7,20000.0,4.0,2020-01-01 00:20:00,A,A,0
2,I,1,1,0,30,AA,0.0,0.0,-6.3,0.0,40.0,0.0,-2.6,20000.0,4.0,2020-01-01 00:30:00,A,A,0
3,I,1,1,0,40,AA,27.8,0.4,-6.2,0.0,39.5,0.0,-2.6,20000.0,4.0,2020-01-01 00:40:00,A,A,0
4,I,1,1,0,50,AA,59.7,0.5,-6.1,0.0,39.8,0.0,-2.5,20000.0,4.0,2020-01-01 00:50:00,A,A,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156455,K,12,31,23,10,EC,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,20000.0,4.0,2022-12-31 23:10:00,E,C,0
3156456,K,12,31,23,20,EC,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,20000.0,4.0,2022-12-31 23:20:00,E,C,0
3156457,K,12,31,23,30,EC,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,20000.0,4.0,2022-12-31 23:30:00,E,C,0
3156458,K,12,31,23,40,EC,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,20000.0,4.0,2022-12-31 23:40:00,E,C,0


In [16]:
test

Unnamed: 0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,시정구간,년월일시분,지점번호1,지점번호2,NaN유무
0,L,1,1,0,0,AI,329.5,0.6,-2.5,0.0,87.5,0.0,-1.6,,2024-01-01 00:00:00,A,I,1
1,L,1,1,0,10,AI,321.8,1.2,-2.5,0.0,88.2,0.0,-1.6,,2024-01-01 00:10:00,A,I,1
2,L,1,1,0,20,AI,0.4,0.4,-2.5,0.0,88.6,0.0,-1.7,,2024-01-01 00:20:00,A,I,1
3,L,1,1,0,30,AI,323.6,0.7,-2.6,0.0,88.7,0.0,-1.6,,2024-01-01 00:30:00,A,I,1
4,L,1,1,0,40,AI,208.4,0.2,-2.6,0.0,89.0,0.0,-1.6,,2024-01-01 00:40:00,A,I,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262795,L,12,31,23,10,ED,338.8,6.1,4.5,0.0,71.3,0.0,1.9,,2024-12-31 23:10:00,E,D,1
262796,L,12,31,23,20,ED,335.2,5.8,4.6,0.0,71.4,0.0,1.9,,2024-12-31 23:20:00,E,D,1
262797,L,12,31,23,30,ED,337.1,5.3,4.5,0.0,72.1,0.0,1.8,,2024-12-31 23:30:00,E,D,1
262798,L,12,31,23,40,ED,322.0,3.7,4.3,0.0,72.5,0.0,1.4,,2024-12-31 23:40:00,E,D,1


In [17]:
points = train["지점번호1"].unique()
points

array(['A', 'B', 'C', 'D', 'E'], dtype=object)

In [18]:
train_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
                '10분일사량합', '10분평균지면온도', '10분평균시정']

In [19]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from tqdm.auto import tqdm

train_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
                '10분일사량합', '10분평균지면온도', '10분평균시정']
test_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
                '10분일사량합', '10분평균지면온도']
store_columns = ['년월일시분', '년도', '월', '일', '시간', '분', '지점번호', '지점번호1', '지점번호2', 'NaN유무', '시정구간']

train_impute_list = []
test_impute_list = []
n = len(points)

impute_iterator = tqdm(enumerate(points), total=n, desc=f"validation")
for i, point in impute_iterator:
    train_select_data = train.query(f"지점번호1=='{point}'")[train_fill_columns]
    test_select_data = test.query(f"지점번호1=='{point}'")[test_fill_columns]
    
    train_store_data = train.query(f"지점번호1=='{point}'")[store_columns].reset_index(drop=True)
    test_store_data = test.query(f"지점번호1=='{point}'")[store_columns].reset_index(drop=True)
    
    # scailing
    train_scaler = MinMaxScaler()
    train_scaling_data = train_scaler.fit_transform(train_select_data)
    train_scaling_df = pd.DataFrame(train_scaling_data, columns=train_fill_columns)
    
    test_scaler = MinMaxScaler()
    test_scaler.fit(train_select_data[test_fill_columns])
    test_scaling_data = test_scaler.transform(test_select_data)
    test_scaling_df = pd.DataFrame(test_scaling_data, columns=test_fill_columns)
    
    # imputer
    train_imputer = KNNImputer(n_neighbors=6) # 1시간
    train_filled_data = train_imputer.fit_transform(train_scaling_df)
    train_filled_data = train_scaler.inverse_transform(train_filled_data)
    train_filled_df = pd.DataFrame(train_filled_data, columns=train_fill_columns)
    train_impute_list.append(pd.concat([train_store_data, train_filled_df], axis=1, join='outer'))
    
    test_imputer = KNNImputer(n_neighbors=6)
    test_imputer.fit(test_scaling_df[test_fill_columns])
    test_filled_data = test_imputer.transform(test_scaling_df)
    test_filled_data = test_scaler.inverse_transform(test_filled_data)
    test_filled_df = pd.DataFrame(test_filled_data, columns=test_fill_columns)
    test_impute_list.append(pd.concat([test_store_data, test_filled_df], axis=1, join='outer'))
    
    if i == 0:
        continue
    else:
        train_impute_list[0] = pd.concat([train_impute_list[0], train_impute_list[i]], axis=0).reset_index(drop=True)
        test_impute_list[0] = pd.concat([test_impute_list[0], test_impute_list[i]], axis=0).reset_index(drop=True)

validation:   0%|          | 0/5 [00:00<?, ?it/s]

In [20]:
# Time-based linear interpolation

# from tqdm.auto import tqdm

# train_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
#                 '10분일사량합', '10분평균지면온도', '10분평균시정']
# test_fill_columns = ['10분평균풍향', '10분평균풍속', '10분평균기온', '강수유무','10분평균상대습도', 
#                 '10분일사량합', '10분평균지면온도']
# store_columns = ['년도', '월', '일', '시간', '분', '지점번호', '지점번호1', '지점번호2', 'NaN유무', '시정구간']

# train_impute_list = []
# test_impute_list = []
# n = len(points)

# impute_iterator = tqdm(enumerate(points), total=n, desc=f"validation")
# for i, point in impute_iterator:
#     train_select_data = train.query(f"지점번호1=='{point}'")[train_fill_columns]
#     test_select_data = test.query(f"지점번호1=='{point}'")[test_fill_columns]
    
#     train_store_data = train.query(f"지점번호1=='{point}'")[store_columns].reset_index(drop=True)
#     test_store_data = test.query(f"지점번호1=='{point}'")[store_columns].reset_index(drop=True)
    
#     # imputer
#     train_filled_data = train_select_data.interpolate(method='time')
#     train_filled_df = pd.DataFrame(train_filled_data, columns=train_fill_columns).reset_index()
#     train_impute_list.append(pd.concat([train_store_data, train_filled_df], axis=1, join='outer'))
    
#     test_filled_data = test_select_data.interpolate(method='time')
#     test_filled_df = pd.DataFrame(test_filled_data, columns=test_fill_columns).reset_index()
#     test_impute_list.append(pd.concat([test_store_data, test_filled_df], axis=1, join='outer'))
    
#     if i == 0:
#         continue
#     else:
#         train_impute_list[0] = pd.concat([train_impute_list[0], train_impute_list[i]], axis=0).reset_index(drop=True)
#         test_impute_list[0] = pd.concat([test_impute_list[0], test_impute_list[i]], axis=0).reset_index(drop=True)

In [33]:
train_impute_list[0]

Unnamed: 0,년월일시분,년도,월,일,시간,분,지점번호,지점번호1,지점번호2,NaN유무,시정구간,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정
0,2020-01-01 00:10:00,I,1,1,0,10,AA,A,A,0,4.0,0.0,0.0,-6.4,0.0,38.9,0.0,-2.8,20000.0
1,2020-01-01 00:20:00,I,1,1,0,20,AA,A,A,0,4.0,0.0,0.0,-6.3,0.0,37.9,0.0,-2.7,20000.0
2,2020-01-01 00:30:00,I,1,1,0,30,AA,A,A,0,4.0,0.0,0.0,-6.3,0.0,40.0,0.0,-2.6,20000.0
3,2020-01-01 00:40:00,I,1,1,0,40,AA,A,A,0,4.0,27.8,0.4,-6.2,0.0,39.5,0.0,-2.6,20000.0
4,2020-01-01 00:50:00,I,1,1,0,50,AA,A,A,0,4.0,59.7,0.5,-6.1,0.0,39.8,0.0,-2.5,20000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156455,2022-12-31 23:10:00,K,12,31,23,10,EC,E,C,0,4.0,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,20000.0
3156456,2022-12-31 23:20:00,K,12,31,23,20,EC,E,C,0,4.0,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,20000.0
3156457,2022-12-31 23:30:00,K,12,31,23,30,EC,E,C,0,4.0,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,20000.0
3156458,2022-12-31 23:40:00,K,12,31,23,40,EC,E,C,0,4.0,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,20000.0


In [34]:
train_impute_list[0][train_impute_list[0]["시정구간"].isna()]

Unnamed: 0,년월일시분,년도,월,일,시간,분,지점번호,지점번호1,지점번호2,NaN유무,시정구간,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정
7645,2020-02-23 02:20:00,I,2,23,2,20,AA,A,A,1,,272.0,1.3,-0.9,0.0,61.1,0.00,-1.0,15383.833333
7780,2020-02-24 00:50:00,I,2,24,0,50,AA,A,A,1,,140.7,1.2,2.1,0.0,59.5,0.00,0.8,20000.000000
16053,2020-04-21 11:40:00,I,4,21,11,40,AA,A,A,1,,271.7,7.0,9.2,0.0,42.8,0.32,16.7,18996.000000
16054,2020-04-21 11:50:00,I,4,21,11,50,AA,A,A,1,,276.4,6.0,9.3,0.0,43.4,0.26,19.0,18383.333333
16055,2020-04-21 12:00:00,I,4,21,12,0,AA,A,A,1,,268.4,5.9,9.5,0.0,41.7,0.48,21.1,15934.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3155927,2022-12-28 07:10:00,K,12,28,7,10,EC,E,C,1,,307.9,4.3,2.0,0.0,62.6,0.00,-1.4,17272.666667
3155962,2022-12-28 13:00:00,K,12,28,13,0,EC,E,C,1,,294.5,6.1,6.7,0.0,41.3,0.32,15.2,17744.666667
3156101,2022-12-29 12:10:00,K,12,29,12,10,EC,E,C,1,,16.3,3.5,4.7,0.0,43.2,0.34,14.6,17744.666667
3156136,2022-12-29 18:00:00,K,12,29,18,0,EC,E,C,1,,314.4,8.8,3.5,0.0,42.0,0.00,1.6,18333.333333


In [35]:
train_imputed_data = train_impute_list[0].copy()
test_imputed_data = test_impute_list[0].copy()

In [36]:
train_imputed_data

Unnamed: 0,년월일시분,년도,월,일,시간,분,지점번호,지점번호1,지점번호2,NaN유무,시정구간,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정
0,2020-01-01 00:10:00,I,1,1,0,10,AA,A,A,0,4.0,0.0,0.0,-6.4,0.0,38.9,0.0,-2.8,20000.0
1,2020-01-01 00:20:00,I,1,1,0,20,AA,A,A,0,4.0,0.0,0.0,-6.3,0.0,37.9,0.0,-2.7,20000.0
2,2020-01-01 00:30:00,I,1,1,0,30,AA,A,A,0,4.0,0.0,0.0,-6.3,0.0,40.0,0.0,-2.6,20000.0
3,2020-01-01 00:40:00,I,1,1,0,40,AA,A,A,0,4.0,27.8,0.4,-6.2,0.0,39.5,0.0,-2.6,20000.0
4,2020-01-01 00:50:00,I,1,1,0,50,AA,A,A,0,4.0,59.7,0.5,-6.1,0.0,39.8,0.0,-2.5,20000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156455,2022-12-31 23:10:00,K,12,31,23,10,EC,E,C,0,4.0,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,20000.0
3156456,2022-12-31 23:20:00,K,12,31,23,20,EC,E,C,0,4.0,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,20000.0
3156457,2022-12-31 23:30:00,K,12,31,23,30,EC,E,C,0,4.0,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,20000.0
3156458,2022-12-31 23:40:00,K,12,31,23,40,EC,E,C,0,4.0,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,20000.0


In [37]:
train

Unnamed: 0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,10분평균시정,시정구간,년월일시분,지점번호1,지점번호2,NaN유무
0,I,1,1,0,10,AA,0.0,0.0,-6.4,0.0,38.9,0.0,-2.8,20000.0,4.0,2020-01-01 00:10:00,A,A,0
1,I,1,1,0,20,AA,0.0,0.0,-6.3,0.0,37.9,0.0,-2.7,20000.0,4.0,2020-01-01 00:20:00,A,A,0
2,I,1,1,0,30,AA,0.0,0.0,-6.3,0.0,40.0,0.0,-2.6,20000.0,4.0,2020-01-01 00:30:00,A,A,0
3,I,1,1,0,40,AA,27.8,0.4,-6.2,0.0,39.5,0.0,-2.6,20000.0,4.0,2020-01-01 00:40:00,A,A,0
4,I,1,1,0,50,AA,59.7,0.5,-6.1,0.0,39.8,0.0,-2.5,20000.0,4.0,2020-01-01 00:50:00,A,A,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3156455,K,12,31,23,10,EC,305.3,7.7,2.5,0.0,50.2,0.0,-0.9,20000.0,4.0,2022-12-31 23:10:00,E,C,0
3156456,K,12,31,23,20,EC,293.8,5.7,2.3,0.0,50.1,0.0,-1.3,20000.0,4.0,2022-12-31 23:20:00,E,C,0
3156457,K,12,31,23,30,EC,274.2,4.9,2.2,0.0,51.0,0.0,-1.4,20000.0,4.0,2022-12-31 23:30:00,E,C,0
3156458,K,12,31,23,40,EC,270.3,4.6,2.1,0.0,51.7,0.0,-1.6,20000.0,4.0,2022-12-31 23:40:00,E,C,0


In [38]:
test_imputed_data

Unnamed: 0,년월일시분,년도,월,일,시간,분,지점번호,지점번호1,지점번호2,NaN유무,시정구간,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도
0,2024-01-01 00:00:00,L,1,1,0,0,AI,A,I,1,,329.5,0.6,-2.5,0.0,87.5,0.0,-1.6
1,2024-01-01 00:10:00,L,1,1,0,10,AI,A,I,1,,321.8,1.2,-2.5,0.0,88.2,0.0,-1.6
2,2024-01-01 00:20:00,L,1,1,0,20,AI,A,I,1,,0.4,0.4,-2.5,0.0,88.6,0.0,-1.7
3,2024-01-01 00:30:00,L,1,1,0,30,AI,A,I,1,,323.6,0.7,-2.6,0.0,88.7,0.0,-1.6
4,2024-01-01 00:40:00,L,1,1,0,40,AI,A,I,1,,208.4,0.2,-2.6,0.0,89.0,0.0,-1.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262795,2024-12-31 23:10:00,L,12,31,23,10,ED,E,D,1,,338.8,6.1,4.5,0.0,71.3,0.0,1.9
262796,2024-12-31 23:20:00,L,12,31,23,20,ED,E,D,1,,335.2,5.8,4.6,0.0,71.4,0.0,1.9
262797,2024-12-31 23:30:00,L,12,31,23,30,ED,E,D,1,,337.1,5.3,4.5,0.0,72.1,0.0,1.8
262798,2024-12-31 23:40:00,L,12,31,23,40,ED,E,D,1,,322.0,3.7,4.3,0.0,72.5,0.0,1.4


In [39]:
test

Unnamed: 0,년도,월,일,시간,분,지점번호,10분평균풍향,10분평균풍속,10분평균기온,강수유무,10분평균상대습도,10분일사량합,10분평균지면온도,시정구간,년월일시분,지점번호1,지점번호2,NaN유무
0,L,1,1,0,0,AI,329.5,0.6,-2.5,0.0,87.5,0.0,-1.6,,2024-01-01 00:00:00,A,I,1
1,L,1,1,0,10,AI,321.8,1.2,-2.5,0.0,88.2,0.0,-1.6,,2024-01-01 00:10:00,A,I,1
2,L,1,1,0,20,AI,0.4,0.4,-2.5,0.0,88.6,0.0,-1.7,,2024-01-01 00:20:00,A,I,1
3,L,1,1,0,30,AI,323.6,0.7,-2.6,0.0,88.7,0.0,-1.6,,2024-01-01 00:30:00,A,I,1
4,L,1,1,0,40,AI,208.4,0.2,-2.6,0.0,89.0,0.0,-1.6,,2024-01-01 00:40:00,A,I,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262795,L,12,31,23,10,ED,338.8,6.1,4.5,0.0,71.3,0.0,1.9,,2024-12-31 23:10:00,E,D,1
262796,L,12,31,23,20,ED,335.2,5.8,4.6,0.0,71.4,0.0,1.9,,2024-12-31 23:20:00,E,D,1
262797,L,12,31,23,30,ED,337.1,5.3,4.5,0.0,72.1,0.0,1.8,,2024-12-31 23:30:00,E,D,1
262798,L,12,31,23,40,ED,322.0,3.7,4.3,0.0,72.5,0.0,1.4,,2024-12-31 23:40:00,E,D,1


In [28]:
def make_class(vis):
    if 0 < vis and vis < 200 :
        return 1
    elif 200 <= vis and vis < 500 :
        return 2
    elif 500 <= vis and vis < 1000 :
        return 3
    elif 1000 <= vis :
        return 4
    else :
        return np.nan

In [29]:
train_imputed_data["시정구간"] = train_imputed_data["10분평균시정"].apply(make_class)

In [30]:
train_imputed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3156460 entries, 0 to 3156459
Data columns (total 19 columns):
 #   Column     Dtype         
---  ------     -----         
 0   년월일시분      datetime64[ns]
 1   년도         object        
 2   월          int64         
 3   일          int64         
 4   시간         int64         
 5   분          int64         
 6   지점번호       object        
 7   지점번호1      object        
 8   지점번호2      object        
 9   NaN유무      int64         
 10  시정구간       int64         
 11  10분평균풍향    float64       
 12  10분평균풍속    float64       
 13  10분평균기온    float64       
 14  강수유무       float64       
 15  10분평균상대습도  float64       
 16  10분일사량합    float64       
 17  10분평균지면온도  float64       
 18  10분평균시정    float64       
dtypes: datetime64[ns](1), float64(8), int64(6), object(4)
memory usage: 457.6+ MB


In [31]:
test_imputed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262800 entries, 0 to 262799
Data columns (total 18 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   년월일시분      262800 non-null  datetime64[ns]
 1   년도         262800 non-null  object        
 2   월          262800 non-null  int64         
 3   일          262800 non-null  int64         
 4   시간         262800 non-null  int64         
 5   분          262800 non-null  int64         
 6   지점번호       262800 non-null  object        
 7   지점번호1      262800 non-null  object        
 8   지점번호2      262800 non-null  object        
 9   NaN유무      262800 non-null  int64         
 10  시정구간       0 non-null       float64       
 11  10분평균풍향    262800 non-null  float64       
 12  10분평균풍속    262800 non-null  float64       
 13  10분평균기온    262800 non-null  float64       
 14  강수유무       262800 non-null  float64       
 15  10분평균상대습도  262800 non-null  float64       
 16  10분일사량합    262800 no

In [32]:
train_imputed_data.to_csv("fog_train_knn-imputate.csv", index=False)
test_imputed_data.to_csv("fog_test_knn-imputate.csv", index=False)