In [2]:
import numpy as np
import pandas as pd

# 데이터 준비
file_weather = 'weather.csv'
weather = pd.read_csv(file_weather)
weather = weather[:-1]
file_energy = 'energy.csv'
energy = pd.read_csv(file_energy, encoding='euc-kr')
energy['일시'] = energy['구분']

weather_df = weather[['일시', '평균기온(℃)', '평균최고기온(℃)', '최고기온(℃)', '평균최저기온(℃)', '최저기온(℃)']]
weather_df.loc[:, '일시'] = pd.to_datetime(weather_df['일시'], format='%y-%b').dt.strftime('%y.%m')
energy_df = energy[['일시', '전기(kw)', '가스(Nm3)', '수도(m3)']]
energy_df.loc[:, '일시'] = pd.to_datetime(energy_df['일시'], format='%Y-%m').dt.strftime('%y.%m')

merged_df = pd.merge(weather_df, energy_df, on='일시', how='left')
merged_df.head()

Unnamed: 0,일시,평균기온(℃),평균최고기온(℃),최고기온(℃),평균최저기온(℃),최저기온(℃),전기(kw),가스(Nm3),수도(m3)
0,20.01,1.6,5.9,11.9,-1.7,-6.5,,,
1,20.02,2.5,7.2,15.6,-1.3,-11.8,,,
2,20.03,7.7,13.3,20.6,2.6,-2.8,,,
3,20.04,11.1,16.6,23.8,6.3,1.9,,,
4,20.05,18.0,23.3,30.0,13.7,9.2,,,


In [3]:
training_df = merged_df.dropna()

In [4]:
from sklearn.model_selection import train_test_split
X = training_df[['평균기온(℃)', '평균최고기온(℃)', '최고기온(℃)', '평균최저기온(℃)', '최저기온(℃)']]
y = training_df['전기(kw)']

# 이상치 제거
from scipy import stats
z_scores = stats.zscore(X)
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
X_clean = X[filtered_entries]
y_clean = y[filtered_entries]

X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.6, random_state=42)

In [5]:
# # 데이터 스케일링
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

In [6]:
# 모델 훈련
from sklearn.metrics import mean_squared_error, r2_score

# Lasso
from sklearn.linear_model import Lasso
lasso_model = Lasso(alpha=0.1)  # alpha 값 조정 가능
lasso_model.fit(X_train, y_train)
# 모델 평가
train_acc = lasso_model.score(X_train, y_train)
print("훈련 정확도 :", train_acc)
lasso_pred = lasso_model.predict(X_test) # 모델 테스트
print(lasso_pred)
test_acc = lasso_model.score(X_test, y_test)
print("테스트 정확도 :", test_acc)
# Mean Squared Error 계산
mse = mean_squared_error(y_test, lasso_pred)
print(f"Mean Squared Error (MSE): {mse}")
# R² Score 계산
r2 = r2_score(y_test, lasso_pred)
print(f"R² Score: {r2}\n")

훈련 정확도 : 0.9998790848330932
[261125.9674446  220667.32587486 239586.80749349 211397.27827587
 275209.84874881 245822.17882773 223940.87908847 249182.21312516]
테스트 정확도 : 0.4179267670597987
Mean Squared Error (MSE): 370256017.4648875
R² Score: 0.4179267670597987



In [7]:
# NaN 값이 있는 행만 분리
missing_data = merged_df[merged_df['전기(kw)'].isna()]
X_missing = missing_data[['평균기온(℃)', '평균최고기온(℃)', '최고기온(℃)', '평균최저기온(℃)', '최저기온(℃)']]

In [10]:
# NaN 값을 채우거나 삭제하는 방식으로 데이터 준비
# NaN 값을 채우기 위해 SimpleImputer 사용
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_missing_imputed = imputer.fit_transform(X_missing)

# X_missing_imputed를 DataFrame으로 변환하고 피처 이름을 설정
X_missing_imputed_df = pd.DataFrame(X_missing_imputed, columns=['평균기온(℃)', '평균최고기온(℃)', '최고기온(℃)', '평균최저기온(℃)', '최저기온(℃)'])

# 예측 수행
predicted_values = lasso_model.predict(X_missing_imputed_df)

# 예측 결과를 원본 데이터에 업데이트
merged_df.loc[merged_df['전기(kw)'].isna(), '전기(kw)'] = predicted_values
# for i, column in enumerate(y.columns):
#     merged_df.loc[merged_df[column].isna(), column] = predicted_values[:, i]
# print(merged_df.head(),'\n', merged_df.tail(), merged_df.shape)

energy_all = merged_df[['일시', '전기(kw)']]
energy_all.loc[:, '전기(kw)'] = energy_all['전기(kw)'].astype(int)

# CSV 파일로 저장
file_merge = 'energy_all.csv'
energy_all.to_csv(file_merge, index=False, encoding='utf-8-sig')

ValueError: Must have equal len keys and value when setting with an iterable