In [1]:
import pandas as pd
import numpy as np
import datetime
import random
import os
import sys
import holidays

import sklearn
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import os

In [2]:
# 시드 고정
def seed_everything(seed: int = 2024):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
seed_everything(2024)

In [3]:
train = pd.read_csv(r"C:\Users\dlwks\OneDrive\바탕 화면\VSCode\서울시 평균 기온\train.csv", parse_dates=["일시"])
submit = pd.read_csv(r"C:\Users\dlwks\OneDrive\바탕 화면\VSCode\서울시 평균 기온\sample_submission.csv")

In [4]:
train.isna().sum()

일시          0
최고기온        3
최저기온        3
일교차         4
강수량     13861
평균습도        0
평균풍속        4
일조합       118
일사합      4862
일조율       366
평균기온        0
dtype: int64

In [5]:
train = train.set_index('일시')

In [6]:
cond1 = train['최고기온'].isna()
cond2 = train['최저기온'].isna()
cond3 = train['일교차'].isna()

train[cond1|cond2|cond3]

Unnamed: 0_level_0,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1967-02-19,,,,,62.0,1.8,9.5,,93.1,-1.7
1973-10-16,,,,0.4,74.0,1.8,3.5,9.24,29.7,12.3
2017-10-12,,8.8,,,71.0,2.0,,2.23,0.0,11.4
2022-08-08,28.4,,,129.6,91.6,3.3,,,0.0,26.8


In [7]:
train['최고기온'] = train['최고기온'].interpolate(method = 'linear')
train['최저기온'] = train['최저기온'].interpolate(method = 'linear')

train.loc[train['일교차'].isna(), '일교차'] = train['최고기온'] - train['최저기온']

display(train[cond1|cond2|cond3])
print(train.isnull().sum())

Unnamed: 0_level_0,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1967-02-19,3.25,-6.15,9.4,,62.0,1.8,9.5,,93.1,-1.7
1973-10-16,17.0,5.65,11.35,0.4,74.0,1.8,3.5,9.24,29.7,12.3
2017-10-12,20.3,8.8,11.5,,71.0,2.0,,2.23,0.0,11.4
2022-08-08,28.4,24.6,3.8,129.6,91.6,3.3,,,0.0,26.8


최고기온        0
최저기온        0
일교차         0
강수량     13861
평균습도        0
평균풍속        4
일조합       118
일사합      4862
일조율       366
평균기온        0
dtype: int64


In [8]:
# 일조율 결측값 채우기
from sklearn.linear_model import LinearRegression

fill_rate_model = LinearRegression()

# 결측값이 없는 데이터로 모델을 학습
not_null_data = train[(train['일조합'].notna()) & (train['일조율'].notna())]

# 독립변수로 일조합 사용
X = not_null_data[['일조합']]
y = not_null_data['일조율']

fill_rate_model.fit(X, y)

# 일조율 결측값을 예측하여 채우기
is_null_dlfwhdbf = train['일조율'].isna()
train.loc[is_null_dlfwhdbf, '일조율'] = fill_rate_model.predict(train.loc[is_null_dlfwhdbf, ['일조합']])

In [9]:
train.isnull().sum()

최고기온        0
최저기온        0
일교차         0
강수량     13861
평균습도        0
평균풍속        4
일조합       118
일사합      4862
일조율         0
평균기온        0
dtype: int64

In [10]:
# 독립변수로 일조율 사용
X = train[['일조율']]
y = train['일조합'].dropna()

# 일조합 결측값을 채우기 위한 모델 정의
fill_sum_model = LinearRegression()
fill_sum_model.fit(X[~train['일조합'].isna()], y)

# 일조합 결측값을 예측하여 채우기
is_null_dlfwhgkq = train['일조합'].isna()
train.loc[is_null_dlfwhgkq, '일조합'] = fill_sum_model.predict(train.loc[is_null_dlfwhgkq, ['일조율']])

In [11]:
train.isnull().sum()

최고기온        0
최저기온        0
일교차         0
강수량     13861
평균습도        0
평균풍속        4
일조합         0
일사합      4862
일조율         0
평균기온        0
dtype: int64

In [12]:
# 일사합 결측값을 채우기 위한 모델 정의
fill_irradiance_model = LinearRegression()

# 결측값이 없는 데이터로 모델을 학습
not_null_irradiance_data = train[(train['일조합'].notna()) & (train['일사합'].notna())]

# 독립변수로 '일조합'을 사용
X_irradiance = not_null_irradiance_data[['일조합']]
y_irradiance = not_null_irradiance_data['일사합']
fill_irradiance_model.fit(X_irradiance, y_irradiance)

# 일사합 결측값을 예측하여 채우기
is_null_dlftkgkq = train['일사합'].isna()
train.loc[is_null_dlftkgkq, '일사합'] = fill_irradiance_model.predict(train.loc[is_null_dlftkgkq, ['일조합']])

In [13]:
train.isnull().sum()

최고기온        0
최저기온        0
일교차         0
강수량     13861
평균습도        0
평균풍속        4
일조합         0
일사합         0
일조율         0
평균기온        0
dtype: int64

In [14]:
# 일자에서 월 추출
train['월'] = train.index.month

# 각 월별 평균풍속의 중앙값 계산
median_wind_speed_per_month = train.groupby('월')['평균풍속'].median()

# 각 월별로 평균풍속 결측값을 해당 월의 중앙값으로 대체
for month, median in median_wind_speed_per_month.items():
    train.loc[(train['월'] == month) & (train['평균풍속'].isna()), '평균풍속'] = median

# 더 이상 필요하지않은 월 칼럼 삭제
train.drop('월', axis = 1, inplace = True)

train.isnull().sum()

최고기온        0
최저기온        0
일교차         0
강수량     13861
평균습도        0
평균풍속        0
일조합         0
일사합         0
일조율         0
평균기온        0
dtype: int64

In [15]:
train['강수량'].fillna(method = 'bfill', inplace = True)
train.isnull().sum()

최고기온    0
최저기온    0
일교차     0
강수량     0
평균습도    0
평균풍속    0
일조합     0
일사합     0
일조율     0
평균기온    0
dtype: int64

In [16]:
train.to_csv('C:\\Users\\dlwks\\OneDrive\\바탕 화면\\VSCode\\서울시 평균 기온\\preprocessing.csv', index = True)

train = pd.read_csv(r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\서울시 평균 기온\preprocessing.csv')
train

Unnamed: 0,일시,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,평균기온
0,1960-01-01,2.2,-5.2,7.4,0.4,68.3,1.7,6.7,13.038944,55.602262,-1.6
1,1960-01-02,1.2,-5.6,6.8,0.4,87.7,1.3,0.0,4.181680,2.124736,-1.9
2,1960-01-03,8.7,-2.1,10.8,0.0,81.3,3.0,0.0,4.181680,2.124736,4.0
3,1960-01-04,10.8,1.2,9.6,0.0,79.7,4.4,2.6,7.618827,22.877209,7.5
4,1960-01-05,1.3,-8.2,9.5,0.0,44.0,5.1,8.2,15.021913,67.574842,-4.6
...,...,...,...,...,...,...,...,...,...,...,...
23006,2022-12-27,3.3,-7.3,10.6,0.1,69.8,1.8,8.8,10.250000,91.700000,-2.6
23007,2022-12-28,0.1,-6.0,6.1,0.1,58.1,2.5,8.7,10.860000,90.600000,-3.3
23008,2022-12-29,2.1,-7.8,9.9,0.0,56.3,1.7,9.0,10.880000,93.800000,-2.9
23009,2022-12-30,2.3,-4.4,6.7,0.0,65.6,1.9,7.9,10.840000,82.300000,-1.8


In [17]:
train['일시'] = pd.to_datetime(train['일시'])

In [18]:
train['year'] = train['일시'].dt.year
train['month'] = train['일시'].dt.month
train['week'] = train['일시'].dt.isocalendar().week
train['day'] = train['일시'].dt.day
train['day_of_week'] = train['일시'].dt.dayofweek
train['sin_month'] = np.sin(2 * np.pi * train['month'] / 12)
train['cos_month'] = np.cos(2 * np.pi * train['month'] / 12)
train['spring'] = train['month'].apply(lambda x: 1 if x in [3, 4, 5] else 0)
train['summer'] = train['month'].apply(lambda x: 1 if x in [6, 7, 8] else 0)
train['fall'] = train['month'].apply(lambda x: 1 if x in [9, 10, 11] else 0)
train['winter'] = train['month'].apply(lambda x: 1 if x in [12, 1, 2] else 0)
train['spring_sin'] = np.sin(2 * np.pi * train['spring'] / 12)
train['spring_cos'] = np.cos(2 * np.pi * train['spring'] / 12)
train['summer_sin'] = np.sin(2 * np.pi * train['summer'] / 12)
train['summer_cos'] = np.cos(2 * np.pi * train['summer'] / 12)
train['fall_sin'] = np.sin(2 * np.pi * train['fall'] / 12)
train['fall_cos'] = np.cos(2 * np.pi * train['fall'] / 12)
train['winter_sin'] = np.sin(2 * np.pi * train['winter'] / 12)
train['winter_cos'] = np.cos(2 * np.pi * train['winter'] / 12)
train['sin_year'] = np.sin(2 * np.pi * (train['year'] - train['year'].min()))
train['cos_year'] = np.cos(2 * np.pi * (train['year'] - train['year'].min()))

train

Unnamed: 0,일시,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,...,spring_sin,spring_cos,summer_sin,summer_cos,fall_sin,fall_cos,winter_sin,winter_cos,sin_year,cos_year
0,1960-01-01,2.2,-5.2,7.4,0.4,68.3,1.7,6.7,13.038944,55.602262,...,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025,0.000000e+00,1.0
1,1960-01-02,1.2,-5.6,6.8,0.4,87.7,1.3,0.0,4.181680,2.124736,...,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025,0.000000e+00,1.0
2,1960-01-03,8.7,-2.1,10.8,0.0,81.3,3.0,0.0,4.181680,2.124736,...,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025,0.000000e+00,1.0
3,1960-01-04,10.8,1.2,9.6,0.0,79.7,4.4,2.6,7.618827,22.877209,...,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025,0.000000e+00,1.0
4,1960-01-05,1.3,-8.2,9.5,0.0,44.0,5.1,8.2,15.021913,67.574842,...,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025,0.000000e+00,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23006,2022-12-27,3.3,-7.3,10.6,0.1,69.8,1.8,8.8,10.250000,91.700000,...,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025,-2.939648e-14,1.0
23007,2022-12-28,0.1,-6.0,6.1,0.1,58.1,2.5,8.7,10.860000,90.600000,...,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025,-2.939648e-14,1.0
23008,2022-12-29,2.1,-7.8,9.9,0.0,56.3,1.7,9.0,10.880000,93.800000,...,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025,-2.939648e-14,1.0
23009,2022-12-30,2.3,-4.4,6.7,0.0,65.6,1.9,7.9,10.840000,82.300000,...,0.0,1.0,0.0,1.0,0.0,1.0,0.5,0.866025,-2.939648e-14,1.0


In [19]:
season_dict = {
    '입춘': pd.to_datetime('1960-02-04'), '우수': pd.to_datetime('1960-02-18'), '경칩': pd.to_datetime('1960-03-05'),
    '춘분': pd.to_datetime('1960-03-20'), '청명': pd.to_datetime('1960-04-05'), '곡우': pd.to_datetime('1960-04-20'),
    '입하': pd.to_datetime('1960-05-05'), '소만': pd.to_datetime('1960-05-21'), '망종': pd.to_datetime('1960-06-06'),
    '하지': pd.to_datetime('1960-06-21'), '소서': pd.to_datetime('1960-07-07'), '대서': pd.to_datetime('1960-07-22'),
    '가을': pd.to_datetime('1960-08-07'), '입추': pd.to_datetime('1960-08-23'), '처서': pd.to_datetime('1960-09-07'),
    '백로': pd.to_datetime('1960-09-22'), '추분': pd.to_datetime('1960-10-08'), '한로': pd.to_datetime('1960-10-23'),
    '상강': pd.to_datetime('1960-11-07'), '입동': pd.to_datetime('1960-11-22'), '소설': pd.to_datetime('1960-12-07'),
    '대설': pd.to_datetime('1960-12-22'), '동지': pd.to_datetime('1961-01-05'), '소한': pd.to_datetime('1961-01-20')
}

train['절기'] = train['일시'].apply(lambda x: next((key for key, value in season_dict.items() if value.month == x.month and value.day == x.day), None))
train['절기'] = train['절기'].notnull().astype(int)

for season in season_dict.keys():
    train[season] = (train['일시'] == season_dict[season]).astype(int)

one_hot_encoded = pd.get_dummies(train['절기'])

train = pd.concat([train, one_hot_encoded], axis=1)

train.drop([0, 1], axis=1, inplace=True)

train

Unnamed: 0,일시,최고기온,최저기온,일교차,강수량,평균습도,평균풍속,일조합,일사합,일조율,...,처서,백로,추분,한로,상강,입동,소설,대설,동지,소한
0,1960-01-01,2.2,-5.2,7.4,0.4,68.3,1.7,6.7,13.038944,55.602262,...,0,0,0,0,0,0,0,0,0,0
1,1960-01-02,1.2,-5.6,6.8,0.4,87.7,1.3,0.0,4.181680,2.124736,...,0,0,0,0,0,0,0,0,0,0
2,1960-01-03,8.7,-2.1,10.8,0.0,81.3,3.0,0.0,4.181680,2.124736,...,0,0,0,0,0,0,0,0,0,0
3,1960-01-04,10.8,1.2,9.6,0.0,79.7,4.4,2.6,7.618827,22.877209,...,0,0,0,0,0,0,0,0,0,0
4,1960-01-05,1.3,-8.2,9.5,0.0,44.0,5.1,8.2,15.021913,67.574842,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23006,2022-12-27,3.3,-7.3,10.6,0.1,69.8,1.8,8.8,10.250000,91.700000,...,0,0,0,0,0,0,0,0,0,0
23007,2022-12-28,0.1,-6.0,6.1,0.1,58.1,2.5,8.7,10.860000,90.600000,...,0,0,0,0,0,0,0,0,0,0
23008,2022-12-29,2.1,-7.8,9.9,0.0,56.3,1.7,9.0,10.880000,93.800000,...,0,0,0,0,0,0,0,0,0,0
23009,2022-12-30,2.3,-4.4,6.7,0.0,65.6,1.9,7.9,10.840000,82.300000,...,0,0,0,0,0,0,0,0,0,0


In [None]:
X = train[['최고기온', '최저기온', '일교차', '강수량', '평균습도', '평균풍속', '일조합', '일사합', '일조율', 'year', 'month', 'week', 'day', 'day_of_week', 'sin_month',
       'cos_month', 'spring', 'summer', 'fall', 'winter', 'spring_sin','spring_cos', 'summer_sin', 'summer_cos', 'fall_sin', 'fall_cos','winter_sin', 'winter_cos', 
       '절기', '입춘', '우수', '경칩', '춘분', '청명', '곡우', '입하', '소만', '망종', '하지', '소서', '대서', '가을', '입추', '처서', '백로', '추분', '한로',
       '상강', '입동', '소설', '대설', '동지', '소한', 'sin_year', 'cos_year']]

y = train['평균기온']

kf = KFold(n_splits = 5, shuffle = True, random_state = 42)

mae_scores = []

oof_preds = np.zeros(X.shape[0])

for fold, (train_index, valid_index) in enumerate(kf.split(X)):
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    model = CatBoostRegressor(n_estimators = 1000, random_state = 42, verbose = 0)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    oof_preds[valid_index] = y_pred

    mae = mean_absolute_error(y_valid, y_pred)
    mae_scores.append(mae)
    print(f"Fold {fold + 1}, MAE: {mae}")


In [None]:
average_mae = np.mean(mae_scores)
print(f'Average Mean Absolute Error: {average_mae}')

In [None]:
sample['평균기온'] = pd.DataFrame(oof_preds_ensemble[:358], columns=['평균기온'])

sample.to_csv(PATH.OUTPUT_DIR + '/0101-1.csv', index=False)

sample

In [None]:
# 파일 경로
train_path = r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\서울시 평균 기온\train.csv' # 학습 데이터 파일 경로
submission_path = r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\서울시 평균 기온\sample_submission.csv' # 제출 양식 파일 경로

# 데이터 불러오기
train_df = pd.read_csv(train_path)
submission_df = pd.read_csv(submission_path)

display(train_df.head())
display(submission_df.head())

In [None]:
train_df['일시'] = pd.to_datetime(train_df['일시'])
train_df = train_df.set_index('일시')

# 데이터의 시간 간격 지정
train_df.index.freq = 'D'

# 일시 컬럼이 인덱스로 할당됩니다.  
train_df.head()

In [None]:
# prophet에서 데이터를 인식하도록 일시는 ds로, target값인 평균기온은 y로 지정해줍니다.
train_df = train_df.reset_index()
train_df = train_df.rename(columns={'일시': 'ds', '평균기온': 'y'})

In [None]:
#모델 학습

from prophet import Prophet

prophet = Prophet()
prophet.fit(train_df)

In [None]:
#모델 예측
future_data = prophet.make_future_dataframe(periods = 358, freq = 'd') #periods는 예측할 기간
forecast_data = prophet.predict(future_data)
forecast_data[['ds','yhat']].tail(5)

In [None]:
submission_df['평균기온'] = forecast_data.yhat[-358:].values
submission_df