In [55]:
from IPython.display import display, HTML
# Inject custom CSS into the notebook page: container width, editor/output fonts,
# prompt width, TOC padding, list item spacing and DataFrame table font size.
notebook_css = """
<style>
div.container{width:86% !important;}
div.cell.code_cell.rendered{width:100%;}
div.CodeMirror {font-family:Consolas; font-size:12pt;}
div.output {font-size:15pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:12pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:12pt;padding:5px;}
table.dataframe{font-size:15px;}
</style>
"""
display(HTML(notebook_css))

# 1. 기상 데이터 로드

In [2]:
# 필요 라이브러리
import os

import pandas as pd

In [4]:
# Read the cp949-encoded weather CSV into `data` and show it.
weather_csv = 'data/22년부터일일단위기상.csv'
data = pd.read_csv(weather_csv, encoding='cp949')
data

Unnamed: 0,지점,지점명,일시,평균기온(°C),일강수량(mm),평균 풍속(m/s),평균 상대습도(%)
0,108,서울,2022-01-01,-4.3,,1.5,46.3
1,108,서울,2022-01-02,-1.3,0.3,2.3,57.9
2,108,서울,2022-01-03,-1.9,0.0,1.8,58.3
3,108,서울,2022-01-04,-2.5,0.0,2.4,51.8
4,108,서울,2022-01-05,-2.8,,1.7,47.6
...,...,...,...,...,...,...,...
1260,108,서울,2025-06-14,25.6,,2.0,66.3
1261,108,서울,2025-06-15,26.3,11.8,2.5,69.6
1262,108,서울,2025-06-16,23.0,20.0,2.7,87.8
1263,108,서울,2025-06-17,24.8,,2.5,73.3


In [6]:
# Missing-value check.
# Author's note: precipitation has gaps because days without rain are marked '-'
# in the original data, so those gaps need to be replaced with 0.
data.isnull().sum()

지점              0
지점명             0
일시              0
평균기온(°C)        0
일강수량(mm)      743
평균 풍속(m/s)      5
평균 상대습도(%)      0
dtype: int64

In [8]:
# Fill gaps per column instead of a blanket fillna(0):
# - precipitation: a gap means "no rain", so 0 is the right value;
# - wind speed: a gap is a missing measurement, not calm air, so 0 m/s would be
#   a fabricated reading. Interpolate linearly between neighbouring days instead
#   (limit_direction='both' also covers gaps at the very start or end).
# NOTE(review): interpolation assumes rows are in date order, as the preview shows.
data['일강수량(mm)'] = data['일강수량(mm)'].fillna(0)
data['평균 풍속(m/s)'] = data['평균 풍속(m/s)'].interpolate(limit_direction='both')
data.isna().sum()

# 2. 기존 서울 데이터 -> 서울 25개 행정구로 분할 필요

In [12]:
# Names of the Seoul administrative districts (gu) used to expand the city-level data.
gu_info = [
    '강남구', '강동구', '강북구', '강서구', '관악구',
    '광진구', '구로구', '금천구', '노원구', '도봉구',
    '동대문구', '동작구', '마포구', '서대문구', '서초구',
    '성동구', '성북구', '송파구', '양천구', '영등포구',
    '용산구', '은평구', '종로구', '중구', '중랑구',
]
len(gu_info)  # expected: 25

25

In [14]:
# Source frame: `data` (one row per day for the Seoul station).
# Build a one-column frame holding the district names.
gu_df = pd.DataFrame({'행정구': gu_info})

# Cross join: repeat every daily row once per district.
# how='cross' (pandas >= 1.2) does this directly, so no temporary 'key' column
# is added to `data` or `gu_df` and re-running the cell leaves them unchanged.
expanded_df = data.merge(gu_df, how='cross')
expanded_df

Unnamed: 0,지점,지점명,일시,평균기온(°C),일강수량(mm),평균 풍속(m/s),평균 상대습도(%),행정구
0,108,서울,2022-01-01,-4.3,0.0,1.5,46.3,강남구
1,108,서울,2022-01-01,-4.3,0.0,1.5,46.3,강동구
2,108,서울,2022-01-01,-4.3,0.0,1.5,46.3,강북구
3,108,서울,2022-01-01,-4.3,0.0,1.5,46.3,강서구
4,108,서울,2022-01-01,-4.3,0.0,1.5,46.3,관악구
...,...,...,...,...,...,...,...,...
31620,108,서울,2025-06-18,25.5,0.0,2.0,65.3,용산구
31621,108,서울,2025-06-18,25.5,0.0,2.0,65.3,은평구
31622,108,서울,2025-06-18,25.5,0.0,2.0,65.3,종로구
31623,108,서울,2025-06-18,25.5,0.0,2.0,65.3,중구


In [16]:
# Count missing values per column in the expanded frame.
expanded_df.isnull().sum()

지점            0
지점명           0
일시            0
평균기온(°C)      0
일강수량(mm)      0
평균 풍속(m/s)    0
평균 상대습도(%)    0
행정구           0
dtype: int64

In [17]:
# Drop the station id / station name columns.
# Rebinding (instead of inplace=True) with errors='ignore' keeps the cell
# re-runnable: a second run no longer raises KeyError for already-dropped columns.
expanded_df = expanded_df.drop(columns=['지점', '지점명'], errors='ignore')

In [18]:
# Preview the frame after the station columns were removed.
expanded_df

Unnamed: 0,일시,평균기온(°C),일강수량(mm),평균 풍속(m/s),평균 상대습도(%),행정구
0,2022-01-01,-4.3,0.0,1.5,46.3,강남구
1,2022-01-01,-4.3,0.0,1.5,46.3,강동구
2,2022-01-01,-4.3,0.0,1.5,46.3,강북구
3,2022-01-01,-4.3,0.0,1.5,46.3,강서구
4,2022-01-01,-4.3,0.0,1.5,46.3,관악구
...,...,...,...,...,...,...
31620,2025-06-18,25.5,0.0,2.0,65.3,용산구
31621,2025-06-18,25.5,0.0,2.0,65.3,은평구
31622,2025-06-18,25.5,0.0,2.0,65.3,종로구
31623,2025-06-18,25.5,0.0,2.0,65.3,중구


# 3. 향후 데이터 처리의 용이성을 위해 일시를 연·월·일(및 요일, 평일/주말 여부)로 나누어 각각 저장

In [23]:
# Inspect column dtypes before converting the date column.
expanded_df.dtypes

일시             object
평균기온(°C)      float64
일강수량(mm)      float64
평균 풍속(m/s)    float64
평균 상대습도(%)    float64
행정구            object
dtype: object

In [26]:
# Parse the date strings into datetime64[ns].
# pd.to_datetime is used instead of .astype('datetime64') because casting to the
# unit-less 'datetime64' dtype raises TypeError on pandas >= 2.0.
expanded_df['일시'] = pd.to_datetime(expanded_df['일시'])

In [27]:
# Preview the frame after the date column conversion.
expanded_df

Unnamed: 0,일시,평균기온(°C),일강수량(mm),평균 풍속(m/s),평균 상대습도(%),행정구
0,2022-01-01,-4.3,0.0,1.5,46.3,강남구
1,2022-01-01,-4.3,0.0,1.5,46.3,강동구
2,2022-01-01,-4.3,0.0,1.5,46.3,강북구
3,2022-01-01,-4.3,0.0,1.5,46.3,강서구
4,2022-01-01,-4.3,0.0,1.5,46.3,관악구
...,...,...,...,...,...,...
31620,2025-06-18,25.5,0.0,2.0,65.3,용산구
31621,2025-06-18,25.5,0.0,2.0,65.3,은평구
31622,2025-06-18,25.5,0.0,2.0,65.3,종로구
31623,2025-06-18,25.5,0.0,2.0,65.3,중구


In [48]:
def devide_date(row) :
    '''Split the row's '일시' timestamp into separate calendar fields.

    Works on a copy of ``row`` and adds year ('연'), month ('월'), day ('일'),
    weekday number ('요일', Monday=0) and a weekday flag ('평일1주말0',
    1 for Monday-Friday, 0 for Saturday/Sunday). The input row is not modified.
    '''
    result = row.copy()
    stamp = result['일시']
    day_of_week = stamp.weekday()

    result['연'] = stamp.year
    result['월'] = stamp.month
    result['일'] = stamp.day
    result['요일'] = day_of_week
    # weekday() numbers Monday..Friday as 0..4
    result['평일1주말0'] = 1 if day_of_week < 5 else 0
    return result

In [49]:
# Try the helper on the first row before applying it to the whole frame.
devide_date(expanded_df.iloc[0])

일시            2022-01-01 00:00:00
평균기온(°C)                     -4.3
일강수량(mm)                      0.0
평균 풍속(m/s)                    1.5
평균 상대습도(%)                   46.3
행정구                           강남구
연                            2022
월                               1
일                               1
요일                              5
평일1주말0                          0
Name: 0, dtype: object

In [50]:
# Derive the calendar columns with the vectorized .dt accessor instead of a
# row-wise apply(axis=1), which copies a Series for every one of the ~31k rows.
# Column names and values match what devide_date produces per row.
date_col = expanded_df['일시']
weekday_no = date_col.dt.weekday  # Monday=0 ... Sunday=6
ex_df = expanded_df.assign(**{
    '연': date_col.dt.year,
    '월': date_col.dt.month,
    '일': date_col.dt.day,
    '요일': weekday_no,
    '평일1주말0': (weekday_no < 5).astype(int),  # 1 = Mon-Fri, 0 = Sat/Sun
})
ex_df

Unnamed: 0,일시,평균기온(°C),일강수량(mm),평균 풍속(m/s),평균 상대습도(%),행정구,연,월,일,요일,평일1주말0
0,2022-01-01,-4.3,0.0,1.5,46.3,강남구,2022,1,1,5,0
1,2022-01-01,-4.3,0.0,1.5,46.3,강동구,2022,1,1,5,0
2,2022-01-01,-4.3,0.0,1.5,46.3,강북구,2022,1,1,5,0
3,2022-01-01,-4.3,0.0,1.5,46.3,강서구,2022,1,1,5,0
4,2022-01-01,-4.3,0.0,1.5,46.3,관악구,2022,1,1,5,0
...,...,...,...,...,...,...,...,...,...,...,...
31620,2025-06-18,25.5,0.0,2.0,65.3,용산구,2025,6,18,2,1
31621,2025-06-18,25.5,0.0,2.0,65.3,은평구,2025,6,18,2,1
31622,2025-06-18,25.5,0.0,2.0,65.3,종로구,2025,6,18,2,1
31623,2025-06-18,25.5,0.0,2.0,65.3,중구,2025,6,18,2,1


In [51]:
# Split the enriched frame into one frame per calendar year.
year_of_row = ex_df['연']
ex_df2022 = ex_df[year_of_row.eq(2022)]
ex_df2023 = ex_df[year_of_row.eq(2023)]
ex_df2024 = ex_df[year_of_row.eq(2024)]
ex_df2025 = ex_df[year_of_row.eq(2025)]

# 4. 전처리 데이터 저장

In [54]:
# Write one CSV per year under model_1/.
# The directory is created first so the export does not fail on a fresh checkout.
os.makedirs('model_1', exist_ok=True)
yearly_frames = {2022: ex_df2022, 2023: ex_df2023, 2024: ex_df2024, 2025: ex_df2025}
for year, frame in yearly_frames.items():
    frame.to_csv(f'model_1/{year}_행정구별기상.csv', index=False)