In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw CSV; the "일시" column is parsed into datetime64 on read.
raw_df = pd.read_csv("../data/raw_heat.csv", parse_dates=["일시"])
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
raw_df.info()
raw_df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123930 entries, 0 to 123929
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   일시          123930 non-null  datetime64[ns]
 1   지점          123930 non-null  object        
 2   폭염여부(O/X)   123930 non-null  object        
 3   최고체감온도(°C)  118823 non-null  float64       
 4   최고기온(°C)    122863 non-null  float64       
 5   평균기온(°C)    122851 non-null  float64       
 6   최저기온(°C)    122858 non-null  float64       
 7   평균상대습도(%)   119752 non-null  float64       
 8   폭염특보(O/X)   123930 non-null  object        
 9   폭염영향예보(단계)  123930 non-null  object        
 10  열대야(O/X)    123930 non-null  object        
 11  자외선지수(단계)   123930 non-null  object        
dtypes: datetime64[ns](1), float64(5), object(6)
memory usage: 11.3+ MB
None


Unnamed: 0,일시,지점,폭염여부(O/X),최고체감온도(°C),최고기온(°C),평균기온(°C),최저기온(°C),평균상대습도(%),폭염특보(O/X),폭염영향예보(단계),열대야(O/X),자외선지수(단계)
0,2019-05-01,동두천(98),X,23.5,25.4,16.6,9.1,55.3,X,,X,낮음
1,2019-05-01,북춘천(93),X,22.0,25.2,16.5,7.7,56.3,X,,X,높음
2,2019-05-01,함안(920),X,24.6,25.1,16.2,10.8,76.0,X,,X,낮음


In [3]:
# Bind each column name to a short variable for convenience.
# The unpacking requires raw_df to have exactly these 12 columns, in this order.
(
    date_info,
    place,
    is_heat,
    top_stmp,
    top_tmp,
    avg_tmp,
    bot_tmp,
    avg_hum,
    is_alarm,
    heat_lv,
    is_trop,
    uv_lv,
) = raw_df.columns

In [4]:
def get_label_encoded(target_df, target_col):
    """Replace one column with LabelEncoder codes, leaving the input untouched.

    Args:
        target_df: Source DataFrame; it is copied, not modified.
        target_col: Name of the column to encode.

    Returns:
        A ``(encoded_df, encoder)`` tuple: a copy of ``target_df`` whose
        ``target_col`` holds the encoded values, and the fitted ``LabelEncoder``
        that produced them.
    """
    encoder = LabelEncoder()
    # Fit on the caller's column and keep the codes before building the copy.
    codes = encoder.fit_transform(target_df[target_col])

    encoded_df = target_df.copy()
    encoded_df[target_col] = codes
    return encoded_df, encoder

def get_label_applied(target_df, target_col, info_dict):
    """Translate one column through a lookup mapping, on a copy of the frame.

    Args:
        target_df: Source DataFrame; it is copied, not modified.
        target_col: Name of the column to translate.
        info_dict: Mapping handed to ``Series.map``; it supplies the
            replacement for each existing value.

    Returns:
        A copy of ``target_df`` with ``target_col`` replaced by the mapped
        values. Values that have no entry in ``info_dict`` become NaN.
    """
    mapped_values = target_df[target_col].map(info_dict)

    mapped_df = target_df.copy()
    mapped_df[target_col] = mapped_values
    return mapped_df

# Encode the station column; the fitted encoder is kept in `le`.
le_df, le = get_label_encoded(raw_df, place)

# Translate every categorical column to floats, one (column, mapping) pair at a
# time. NOTE(review): a label missing from its mapping becomes NaN, and the
# fill steps further down would then overwrite it -- confirm the mappings cover
# every label present in the data.
for col, codes in (
    (is_heat, { "X": 0.0, "O": 1.0 }),
    (is_alarm, { "X": 0.0, "O": 1.0 }),
    (is_trop, { "X": 0.0, "O": 1.0 }),
    (heat_lv, { " ": 1.0, "관심": 2.0, "주의": 3.0, "경고": 4.0, "심각": 5.0 }),
    (uv_lv, { "낮음": 1.0, "보통": 2.0, "높음": 3.0, "매우높음": 4.0, "위험": 5.0 }),
):
    le_df = get_label_applied(le_df, col, codes)

# Order rows by date, then by encoded station, and renumber the index from 0.
le_df = le_df.sort_values(by=[date_info, place]).reset_index(drop=True)
le_df.head(3)

Unnamed: 0,일시,지점,폭염여부(O/X),최고체감온도(°C),최고기온(°C),평균기온(°C),최저기온(°C),평균상대습도(%),폭염특보(O/X),폭염영향예보(단계),열대야(O/X),자외선지수(단계)
0,2019-05-01,0,0.0,,23.6,15.6,6.8,,0.0,1.0,0.0,1.0
1,2019-05-01,1,0.0,21.3,21.1,15.1,8.5,75.0,0.0,1.0,0.0,1.0
2,2019-05-01,2,0.0,21.6,20.9,14.6,10.6,67.5,0.0,1.0,0.0,1.0


In [5]:
# Distinct station names from the raw data, in sorted order.
(
    raw_df[place]
    .sort_values()
    .unique()
)

array(['가평조종(505)', '간성(517)', '강진군(259)', '거제(294)', '거창(284)',
       '경기광주(546)', '경산(827)', '경주시(283)', '계룡(636)', '고령(812)',
       '고성(918)', '고양(540)', '고창(172)', '고흥(262)', '곡성(768)', '공주(612)',
       '과천(590)', '광명(437)', '광양읍(713)', '광주(156)', '괴산(603)', '구례(709)',
       '구리(569)', '구미(279)', '군산(140)', '군위(823)', '군포(438)', '금산(238)',
       '김제(737)', '김천(822)', '김포(441)', '김해시(253)', '나주(710)', '남양주(541)',
       '남원(247)', '남해(295)', '논산(615)', '단양(601)', '담양(706)', '당진(616)',
       '대구(143)', '대전(133)', '동두천(98)', '동해(106)', '목포(165)', '무안(699)',
       '무주(701)', '문경(273)', '밀양(288)', '보령(235)', '보성(732)', '보은(226)',
       '봉화(271)', '부산(159)', '부안(243)', '부여(236)', '부천(433)', '북강릉(104)',
       '북춘천(93)', '산청(289)', '삼척(876)', '삼천포(907)', '상주(137)', '서산(129)',
       '서울(108)', '서천(614)', '성남(572)', '성주(810)', '세종(239)', '속초(90)',
       '수원(119)', '순창군(254)', '순천시(712)', '시흥(565)', '아산(634)', '안동(136)',
       '안산(545)', '안성(516)', '안양(434)', '압해도(789)', '양구(556)'

In [6]:
def fill_na_date_base(target_df, direction="front", date_col=None):
    """Fill missing values from neighbouring rows that share the same date.

    The frame is split into one group per distinct value of the date column and
    each group is filled on its own, so a value is never carried over from a
    different date. Inside a group, rows are filled in their current order.

    Args:
        target_df: DataFrame to fill; it is not modified.
        direction: "front" to forward-fill (``ffill``) or "back" to
            backward-fill (``bfill``) inside each date group.
        date_col: Name of the column to group by. When omitted, the
            notebook-level ``date_info`` column name is used.

    Returns:
        A new DataFrame built from the filled groups, concatenated in order of
        each date's first appearance, with the original index labels kept.
        An input without any date group yields an empty frame with the same
        columns.

    Raises:
        Exception: If ``direction`` is neither "front" nor "back".
    """
    # Validate once up front instead of inside the loop; this also rejects a
    # bad direction when the frame has no rows to iterate over.
    if direction == "front":
        fill = pd.DataFrame.ffill
    elif direction == "back":
        fill = pd.DataFrame.bfill
    else:
        raise Exception("Wrong direction")

    if date_col is None:
        date_col = date_info

    # A single grouping pass replaces one full-frame boolean mask per date.
    # sort=False keeps the groups in first-appearance order.
    filled_groups = [
        fill(group) for _, group in target_df.groupby(date_col, sort=False)
    ]

    # pd.concat raises on an empty list, so hand back an empty frame instead.
    if not filled_groups:
        return target_df.iloc[:0].copy()
    return pd.concat(filled_groups)

In [7]:
# First pass: forward-fill gaps inside each date group.
ffill_df = fill_na_date_base(le_df, direction="front")
# Per-column count of values that are still missing after this pass.
ffill_df.isna().sum()

일시              0
지점              0
폭염여부(O/X)       0
최고체감온도(°C)    160
최고기온(°C)        3
평균기온(°C)        4
최저기온(°C)        3
평균상대습도(%)     156
폭염특보(O/X)       0
폭염영향예보(단계)      0
열대야(O/X)        0
자외선지수(단계)       0
dtype: int64

In [8]:
# Second pass: backward-fill whatever the forward pass left missing.
result_df = fill_na_date_base(ffill_df, direction="back")
# Per-column count of values that are still missing after both passes.
result_df.isna().sum()

일시            0
지점            0
폭염여부(O/X)     0
최고체감온도(°C)    0
최고기온(°C)      0
평균기온(°C)      0
최저기온(°C)      0
평균상대습도(%)     0
폭염특보(O/X)     0
폭염영향예보(단계)    0
열대야(O/X)      0
자외선지수(단계)     0
dtype: int64

In [10]:
# Print the column dtypes and non-null counts of the fully filled frame.
result_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 123930 entries, 0 to 123929
Data columns (total 12 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   일시          123930 non-null  datetime64[ns]
 1   지점          123930 non-null  int64         
 2   폭염여부(O/X)   123930 non-null  float64       
 3   최고체감온도(°C)  123930 non-null  float64       
 4   최고기온(°C)    123930 non-null  float64       
 5   평균기온(°C)    123930 non-null  float64       
 6   최저기온(°C)    123930 non-null  float64       
 7   평균상대습도(%)   123930 non-null  float64       
 8   폭염특보(O/X)   123930 non-null  float64       
 9   폭염영향예보(단계)  123930 non-null  float64       
 10  열대야(O/X)    123930 non-null  float64       
 11  자외선지수(단계)   123930 non-null  float64       
dtypes: datetime64[ns](1), float64(10), int64(1)
memory usage: 12.3 MB


In [9]:
# Write the preprocessed frame to disk, leaving the row index out of the file.
result_df.to_csv(
    path_or_buf="../data/prep_heat.csv",
    index=False,
)