# data 파일 생성

In [1]:
# install fred library
!pip install fredapi



In [1]:
# import labrary
from fredapi import Fred
import pandas as pd
import os
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso

In [2]:
# FRED API KEY
fred = Fred(api_key='63a56e3b21c9b3d23ea8c9f22fae9ddf')

# 다운로드 기간 설정
start_date = '2010-01-01'
end_date = '2021-11-30'

# 데이터 리스트 정의
fred_ids = [
    "KORCPIALLMINMEI", "KORCP010000IXOBM", "KORCP020000IXOBM", "KORCP030000IXOBM", "KORCP040000IXOBM",
    "KORCP050000IXOBM", "KORCP060000IXOBM","KORCP070000IXOBM", "KORCP080000IXOBM", "KORCP090000IXOBM",
    "KORCP100000IXOBM", "KORCP110000IXOBM", "KORCP120000IXOBM", "KORCPICORMINMEI", "KORCPIENGMINMEI",
    "KORCPGRSE01IXOBM", "KORCPGRLH02IXOBM", "KORCPGRHO02IXOBM", "KORCP040100IXOBM", "KORCP040400IXOBM",
    "KORCP040500IXOBM", "KORCP040300IXOBM"
]

# 각 리스트 에 대한 시리즈 ID와 이름 매핑
column_rename_map = {
    "KORCPIALLMINMEI": "Total CPI",
    "KORCP010000IXOBM": "Food and non-alcoholic beverages",
    "KORCP020000IXOBM": "Alcoholic beverages, tobacco and narcotics",
    "KORCP030000IXOBM": "Clothing and footwear",
    "KORCP040000IXOBM": "Housing, water, electricity, and fuel",
    "KORCP050000IXOBM": "Household goods and services",
    "KORCP060000IXOBM": "Health",
    "KORCP070000IXOBM": "Transportation",
    "KORCP080000IXOBM": "Communication",
    "KORCP090000IXOBM": "Recreation and culture",
    "KORCP100000IXOBM": "Education",
    "KORCP110000IXOBM": "Restaurants and hotels",
    "KORCP120000IXOBM": "Miscellaneous goods and services",
    "KORCPICORMINMEI": "All items (non-food non-energy)",
    "KORCPIENGMINMEI": "Energy",
    "KORCPGRSE01IXOBM": "Services",
    "KORCPGRLH02IXOBM": "Services less housing",
    "KORCPGRHO02IXOBM": "Housing excluding imputed rentals for housing",
    "KORCP040100IXOBM": "Actual rentals for housing",
    "KORCP040400IXOBM": "Water supply and misc. services relating to dwelling",
    "KORCP040500IXOBM": "Electricity, gas and other fuels",
    "KORCP040300IXOBM": "Maintenance and repair of the dwelling"
}

# 결과 데이터프레임 초기화
data = pd.DataFrame()

# 각 시리즈 불러오기 및 이름 변경
for fred_id in fred_ids:
    try:
        series_data = fred.get_series(fred_id, start_date, end_date)
        series_data = series_data.rename(column_rename_map[fred_id])
        data = pd.concat([data, series_data], axis=1)
    except Exception as e:
        print(f"Error fetching {fred_id}: {e}")

# 인덱스를 datetime 형식으로 변환
data.index = pd.to_datetime(data.index)
data.index.name = 'Date'

# 저장 폴더 생성 (data/raw)
output_dir = '../data/raw'
os.makedirs(output_dir, exist_ok=True)

# csv파일 저장
output_path = os.path.join(output_dir, 'fred_data_2010_2021.csv')
data.to_csv(output_path)

# StandardScaler()로 스케일링

In [4]:
# 1. 데이터 불러오기
df = pd.read_csv("../data/raw/fred_data_2010_2021.csv")
df["Date"] = pd.to_datetime(df["Date"])
df = df.set_index("Date")

# 2. 대상 변수 설정
target_col = "Total CPI"
X = df.drop(columns=[target_col])
y = df[target_col]

# 3. 결측치 처리 & 스케일링
X = X.fillna(method="ffill").fillna(method="bfill")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 4. LassoCV 모델 학습
lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X_scaled, y)

# 5. 중요도 추출
coefs = pd.Series(lasso.coef_, index=X.columns)
non_zero_coefs = coefs[coefs != 0]

# 중요도 계산: 절댓값 비율
importance = np.abs(non_zero_coefs)
importance_pct = (importance / importance.sum()) * 100

# 6. 결과 정리 (중요도 높은 순 정렬)
result_df = pd.DataFrame({
    "feature": importance_pct.index,
    "coef": non_zero_coefs.values,
    "importance": importance_pct.values
})
result_df = result_df.sort_values(by="importance", ascending=False).reset_index(drop=True)
result_df["rank"] = result_df.index + 1
result_df = result_df[["rank", "feature", "coef", "importance"]]

# 7. 저장
output_dir = "../data/lasso_results"
os.makedirs(output_dir, exist_ok=True)
result_df.to_csv(f"{output_dir}/lasso_importance1.csv", index=False)

print(f"✅ 저장 완료: {output_dir}/lasso_importance1.csv")

✅ 저장 완료: ../data/lasso_results/lasso_importance1.csv


  X = X.fillna(method="ffill").fillna(method="bfill")


# MinMaxScaler()로 스케일링

In [5]:
# 1. 데이터 불러오기
df = pd.read_csv("../data/raw/fred_data_2010_2021.csv")
df["Date"] = pd.to_datetime(df["Date"])
df = df.set_index("Date")

# 2. 분석 대상 시기만 필터링 (2010.01 ~ 2021.11)
df = df.loc["2010-01-01":"2021-11-01"]

# 3. 타겟/입력 변수 분리
target_col = "Total CPI"
X = df.drop(columns=[target_col])
y = df[target_col]

# 4. 결측치 처리
X = X.fillna(method="ffill").fillna(method="bfill")

# 5. Min-Max 정규화 (논문 방식)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# 6. Lasso 모델 학습 (논문엔 α 미언급 → 적절한 값 수동 설정)
lasso = Lasso(alpha=0.001, max_iter=10000)  # 필요 시 α 조정 가능
lasso.fit(X_scaled, y)

# 7. 중요도 계산
coefs = pd.Series(lasso.coef_, index=X.columns)
non_zero_coefs = coefs[coefs != 0]

importance = np.abs(non_zero_coefs)
importance_pct = (importance / importance.sum()) * 100

# 8. 결과 정리 및 저장
result_df = pd.DataFrame({
    "feature": importance_pct.index,
    "coef": non_zero_coefs.values,
    "importance": importance_pct.values
})
result_df = result_df.sort_values(by="importance", ascending=False).reset_index(drop=True)
result_df["rank"] = result_df.index + 1
result_df = result_df[["rank", "feature", "coef", "importance"]]

# 결과 출력
print("Lasso Regression Importance Results:")
print(result_df)

# 9. 저장
output_dir = "../data/lasso_results"
os.makedirs(output_dir, exist_ok=True)
result_df.to_csv(f"{output_dir}/lasso_importance2.csv", index=False)

print("✅ 논문 조건으로 Lasso 분석 완료! 👉 저장 경로:", f"{output_dir}/lasso_importance2.csv")


Lasso Regression Importance Results:
    rank                                     feature      coef  importance
0      1             All items (non-food non-energy)  5.277408   25.746460
1      2            Food and non-alcoholic beverages  5.102709   24.894170
2      3       Housing, water, electricity, and fuel  3.083291   15.042200
3      4                       Services less housing  1.707252    8.329029
4      5                              Transportation  1.641675    8.009107
5      6            Miscellaneous goods and services  0.809913    3.951258
6      7                                   Education  0.700026    3.415161
7      8                       Clothing and footwear  0.620197    3.025704
8      9  Alcoholic beverages, tobacco and narcotics  0.538276    2.626043
9     10                Household goods and services  0.451381    2.202115
10    11                                      Energy  0.330399    1.611892
11    12                      Recreation and culture  0.166620 

  X = X.fillna(method="ffill").fillna(method="bfill")


# 1차 최종 코드

In [6]:
# cpi_forecasting_pipeline.py (정규화 흐름 수정 반영)

# 📦 라이브러리 로드
import os
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

# ✅ 파라미터
PAST_STEPS = 310
FUTURE_STEPS = 31
BATCH_SIZE = 64
EPOCHS = 50

# ✅ Lasso 기반 변수 선택 & 저장
raw_df = pd.read_csv("../data/raw/fred_data_2010_2021.csv", parse_dates=["Date"]).set_index("Date")
df = raw_df.loc["2010-01-01":"2021-11-01"]
target_col = "Total CPI"
X = df.drop(columns=[target_col]).fillna(method="ffill").fillna(method="bfill")
y = df[target_col]

scaler_lasso = MinMaxScaler()
X_scaled = scaler_lasso.fit_transform(X)
lasso = Lasso(alpha=0.001, max_iter=10000)
lasso.fit(X_scaled, y)

coefs = pd.Series(lasso.coef_, index=X.columns)
non_zero = coefs[coefs != 0]
importance = np.abs(non_zero)
importance_pct = (importance / importance.sum()) * 100
selected_features = importance_pct.sort_values(ascending=False).index.tolist()

result_df = pd.DataFrame({
    "feature": importance_pct.index,
    "coef": non_zero.values,
    "importance": importance_pct.values
}).sort_values(by="importance", ascending=False).reset_index(drop=True)
result_df["rank"] = result_df.index + 1
result_df = result_df[["rank", "feature", "coef", "importance"]]

os.makedirs("../data/lasso_results", exist_ok=True)
result_df.to_csv("../data/lasso_results/lasso_importance2.csv", index=False)

# ✅ 선형 보간 및 정규화
selected_cols = ["Total CPI"] + selected_features
df = raw_df[selected_cols].copy()
df_daily = df.resample("D").interpolate(method="linear")
scaler = MinMaxScaler()
df_daily_scaled = pd.DataFrame(scaler.fit_transform(df_daily), columns=df_daily.columns, index=df_daily.index)
os.makedirs("../data/augmented", exist_ok=True)
df_daily_scaled.to_csv("../data/augmented/cpi_daily_interpolated.csv")

# ✅ 슬라이딩 윈도우 구성
data = df_daily_scaled.values
X_list, Y_list = [], []
for i in range(len(data) - PAST_STEPS - FUTURE_STEPS):
    X_list.append(data[i:i+PAST_STEPS])
    Y_list.append(data[i+PAST_STEPS:i+PAST_STEPS+FUTURE_STEPS, 0])
X_np = np.array(X_list)
Y_np = np.array(Y_list)

# ✅ Dataset & Dataloader 정의
class CPITimeSeriesDataset(torch.utils.data.Dataset):
    def __init__(self, X, Y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.Y = torch.tensor(Y, dtype=torch.float32)
    def __len__(self): return len(self.X)
    def __getitem__(self, idx): return self.X[idx], self.Y[idx]

dataset = CPITimeSeriesDataset(X_np, Y_np)
train_len = int(len(dataset)*0.7)
val_len = int(len(dataset)*0.15)
test_len = len(dataset) - train_len - val_len
train_set, val_set, test_set = torch.utils.data.random_split(dataset, [train_len, val_len, test_len])
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_set, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_set, batch_size=1)

# ✅ CNN-LSTM 모델 정의
class CNNLSTM(nn.Module):
    def __init__(self, input_features, past_steps=PAST_STEPS, future_steps=FUTURE_STEPS, hidden_dim=512, kernel_size=3, dropout=0.5):
        super().__init__()
        self.conv1 = nn.Conv1d(input_features, hidden_dim, kernel_size)
        self.conv2 = nn.Conv1d(hidden_dim, hidden_dim, kernel_size)
        self.pool = nn.MaxPool1d(2)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, future_steps)

    def forward(self, x):
        x = x.permute(0, 2, 1)
        x = self.pool(self.conv2(self.conv1(x)))
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        x = self.dropout(x[:, -1, :])
        return self.fc(x)

# ✅ 학습 및 테스트
model = CNNLSTM(input_features=X_np.shape[2]).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
device = next(model.parameters()).device
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=5e-4)

def train(model, train_loader, val_loader, epochs):
    for epoch in range(epochs):
        model.train()
        train_losses, val_losses = [], []
        for Xb, Yb in train_loader:
            Xb, Yb = Xb.to(device), Yb.to(device)
            optimizer.zero_grad()
            loss = criterion(model(Xb), Yb)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        model.eval()
        with torch.no_grad():
            for Xb, Yb in val_loader:
                Xb, Yb = Xb.to(device), Yb.to(device)
                val_losses.append(criterion(model(Xb), Yb).item())
        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {np.mean(train_losses):.4f} | Val Loss: {np.mean(val_losses):.4f}")

def test(model, test_loader):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for Xb, Yb in test_loader:
            Xb, Yb = Xb.to(device), Yb.to(device)
            preds.append(model(Xb).cpu().numpy())
            trues.append(Yb.cpu().numpy())
    return np.concatenate(preds), np.concatenate(trues)

train(model, train_loader, val_loader, epochs=EPOCHS)
preds, trues = test(model, test_loader)

# ✅ 정규화 상태 성능 평가
def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / (y_true + 1e-8))) * 100

def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-8))

def nrmse(y_true, y_pred):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return rmse / np.std(y_true)

y_true = trues.flatten()
y_pred = preds.flatten()

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
mae = mean_absolute_error(y_true, y_pred)
mape_val = mape(y_true, y_pred)
smape_val = smape(y_true, y_pred)
nrmse_val = nrmse(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
mse_val = mean_squared_error(y_true, y_pred)

print("\n📊 정규화 상태 성능 평가 결과:")
print(f"✅ RMSE   : {rmse:.4f}")
print(f"✅ MAE    : {mae:.4f}")
print(f"✅ MAPE   : {mape_val:.2f}%")
print(f"✅ SMAPE  : {smape_val:.2f}%")
print(f"✅ NRMSE  : {nrmse_val:.4f}")
print(f"✅ R²     : {r2:.4f}")
print(f"✅ MSE    : {mse_val:.6f}")

  X = df.drop(columns=[target_col]).fillna(method="ffill").fillna(method="bfill")


Epoch 1/50 | Train Loss: 0.0391 | Val Loss: 0.0017
Epoch 2/50 | Train Loss: 0.0083 | Val Loss: 0.0003
Epoch 3/50 | Train Loss: 0.0054 | Val Loss: 0.0002
Epoch 4/50 | Train Loss: 0.0046 | Val Loss: 0.0004
Epoch 5/50 | Train Loss: 0.0038 | Val Loss: 0.0002
Epoch 6/50 | Train Loss: 0.0035 | Val Loss: 0.0007
Epoch 7/50 | Train Loss: 0.0031 | Val Loss: 0.0004
Epoch 8/50 | Train Loss: 0.0027 | Val Loss: 0.0003
Epoch 9/50 | Train Loss: 0.0028 | Val Loss: 0.0004
Epoch 10/50 | Train Loss: 0.0025 | Val Loss: 0.0002
Epoch 11/50 | Train Loss: 0.0022 | Val Loss: 0.0009
Epoch 12/50 | Train Loss: 0.0024 | Val Loss: 0.0007
Epoch 13/50 | Train Loss: 0.0026 | Val Loss: 0.0008
Epoch 14/50 | Train Loss: 0.0021 | Val Loss: 0.0003
Epoch 15/50 | Train Loss: 0.0018 | Val Loss: 0.0006
Epoch 16/50 | Train Loss: 0.0019 | Val Loss: 0.0002
Epoch 17/50 | Train Loss: 0.0017 | Val Loss: 0.0001
Epoch 18/50 | Train Loss: 0.0017 | Val Loss: 0.0001
Epoch 19/50 | Train Loss: 0.0017 | Val Loss: 0.0004
Epoch 20/50 | Train L