# 공모전 과제 : 계절별 지면온도 예측 모델 생성
* 데이터를 계절별로 분할한 다음 전처리 및 모델링 작업 수행하기로 결정
* mmddhh 컬럼을 기준으로 봄(2-4), 여름(5-7), 가을(8-10), 겨울(11-1) 순으로 분할
* 계절별 분할 전, 6/19 공지사항(rn 값 수정)을 반영하고 관측 변수가 모두 결측인 행을 제거

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load surface_tp_train.csv / surface_tp_test.csv and strip the dataset prefix
# from the column names so the rest of the notebook can use short names.

train_cols = ['stn', 'year', 'mmddhh', 'ta', 'td', 'hm', 'ws', 'rn', 're', 'ww', 'ts', 'si', 'ss', 'sn']
# The test file has no `ts` column, so it is left out of the test mapping.
test_cols = [col for col in train_cols if col != 'ts']

change_tr = {f'surface_tp_train.{col}': col for col in train_cols}
change_te = {f'surface_tp_test.{col}': col for col in test_cols}

train = pd.read_csv(
    '/content/drive/MyDrive/competition/temp_prediction/surface_tp_train.csv'
).rename(columns=change_tr)
test = pd.read_csv(
    '/content/drive/MyDrive/competition/temp_prediction/surface_tp_test.csv'
).rename(columns=change_te)

print(train.shape)
print(test.shape)
# Notebook display: preview the first rows of the renamed training frame.
train.head()

(438240, 15)
(26280, 14)


Unnamed: 0.1,Unnamed: 0,stn,year,mmddhh,ta,td,hm,ws,rn,re,ww,ts,si,ss,sn
0,1,1,A,20100,-9.9,-10.7,93.9,0.6,0.0,0,H,-1.3,-99.9,-99.9,-99.9
1,2,1,A,20101,-10.8,-11.6,93.8,0.6,0.0,0,H,-1.5,-99.9,-99.9,-99.9
2,3,1,A,20102,-11.4,-12.1,94.6,0.7,0.0,0,H,-1.7,-99.9,-99.9,-99.9
3,4,1,A,20103,-11.6,-12.5,93.4,0.6,0.0,0,H,-1.8,-99.9,-99.9,-99.9
4,5,1,A,20104,-11.8,-12.7,93.0,0.6,0.0,0,H,-2.0,-99.9,-99.9,-99.9


In [3]:
# Notebook display: preview the first five rows of the renamed test frame.
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,stn,year,mmddhh,ta,td,hm,ws,rn,re,ww,si,ss,sn
0,1,a,F,20100,0.6,-2.0,82.5,2.7,0.0,0,G,-99.9,-99.9,3.1
1,2,a,F,20101,0.0,-5.2,68.3,3.2,0.0,0,R,-99.9,-99.9,3.1
2,3,a,F,20102,-0.3,-6.4,63.7,2.7,0.0,0,C,-99.9,-99.9,3.1
3,4,a,F,20103,-1.0,-4.5,77.2,2.1,0.2,7,R,-99.9,-99.9,4.1
4,5,a,F,20104,-1.4,-3.1,88.3,2.9,0.6,3,R,-99.9,-99.9,4.7


In [4]:
# Apply the 6/19 competition notice to rn (accumulated precipitation):
# the listed readings are overwritten with the -99.9 sentinel.
# (1.8, 3.3, 5.3, 19.5, 30.3, 623.5) --> -99.9

rn_change = [1.8, 3.3, 5.3, 19.5, 30.3, 623.5]

# One vectorised membership test per frame instead of one full-column scan
# per listed value; the same cells are overwritten.
train.loc[train['rn'].isin(rn_change), 'rn'] = -99.9
test.loc[test['rn'].isin(rn_change), 'rn'] = -99.9

In [5]:
# Missing-value handling: drop a row only when every checked measurement column
# holds its missing sentinel (-99.9, or -99 for `re`). The target column
# (surface temperature) is excluded from the check, and `ww` is not checked.
# Applied to the training set only.

sentinel_cols = ['ta', 'td', 'hm', 'ws', 'rn', 'si', 'ss', 'sn']
all_missing = train[sentinel_cols].eq(-99.9).all(axis=1) & train['re'].eq(-99)
drop_index = train.index[all_missing]

print(f'제거한 이상치 행 개수 : {len(drop_index)}')
train.drop(index=drop_index, inplace=True)
# Renumber rows so the index is contiguous again after the removal.
train.reset_index(drop=True, inplace=True)
train.head()

제거한 이상치 행 개수 : 197


Unnamed: 0.1,Unnamed: 0,stn,year,mmddhh,ta,td,hm,ws,rn,re,ww,ts,si,ss,sn
0,1,1,A,20100,-9.9,-10.7,93.9,0.6,0.0,0,H,-1.3,-99.9,-99.9,-99.9
1,2,1,A,20101,-10.8,-11.6,93.8,0.6,0.0,0,H,-1.5,-99.9,-99.9,-99.9
2,3,1,A,20102,-11.4,-12.1,94.6,0.7,0.0,0,H,-1.7,-99.9,-99.9,-99.9
3,4,1,A,20103,-11.6,-12.5,93.4,0.6,0.0,0,H,-1.8,-99.9,-99.9,-99.9
4,5,1,A,20104,-11.8,-12.7,93.0,0.6,0.0,0,H,-2.0,-99.9,-99.9,-99.9


In [6]:
# Split train/test into seasons using inclusive bounds on the mmddhh column.
# NOTE(review): mmddhh appears to pack month/day/hour as m*10000 + d*100 + h
# (e.g. 20100, 43023, 110100) — confirm against the dataset description.
#   spring : 20100  ~ 43023
#   summer : 50100  ~ 73123
#   autumn : 80100  ~ 103123
#   winter : 110100 ~ 13123   (start > end: the range wraps around)

SEASON_BOUNDS = {
    'spr': (20100, 43023),
    'sum': (50100, 73123),
    'aut': (80100, 103123),
    'win': (110100, 13123),
}


def _season_mask(mmddhh, start, end):
    """Return a boolean mask selecting values of *mmddhh* in [start, end].

    When ``start > end`` the range wraps around, so a value qualifies if it
    is at or after ``start`` OR at or before ``end``.
    """
    if start <= end:
        return (mmddhh >= start) & (mmddhh <= end)
    return (mmddhh >= start) | (mmddhh <= end)


def _split_by_season(df):
    """Return the spring, summer, autumn and winter slices of *df*, in that order."""
    return tuple(
        df[_season_mask(df['mmddhh'], start, end)]
        for start, end in SEASON_BOUNDS.values()
    )


# A single bounds table drives both splits, so train and test cannot drift apart.
train_spr, train_sum, train_aut, train_win = _split_by_season(train)
test_spr, test_sum, test_aut, test_win = _split_by_season(test)

for season_df in (train_spr, train_sum, train_aut, train_win,
                  test_spr, test_sum, test_aut, test_win):
    print(season_df.shape)

(106999, 15)
(110342, 15)
(110328, 15)
(110374, 15)
(6408, 14)
(6624, 14)
(6624, 14)
(6624, 14)


In [7]:
# Save each seasonal split to its own CSV in the working directory,
# without writing the DataFrame index.

split_outputs = [
    (train_spr, 'train_spring.csv'),
    (train_sum, 'train_summer.csv'),
    (train_aut, 'train_autumn.csv'),
    (train_win, 'train_winter.csv'),
    (test_spr, 'test_spring.csv'),
    (test_sum, 'test_summer.csv'),
    (test_aut, 'test_autumn.csv'),
    (test_win, 'test_winter.csv'),
]

for split_df, csv_name in split_outputs:
    split_df.to_csv(csv_name, index=False)