In [5]:
import os
import re
import sys
import json
import time
import math
import requests
import pandas as pd
from glob import glob
from bs4 import BeautifulSoup
from rapidfuzz import process, fuzz
from tqdm import tqdm

# ===== 모듈 경로 추가 & 로거 설정 =====
sys.path.append(r"C:\ESG_Project1\util")
from logger import setup_logger
logger = setup_logger(__name__)

# ----------------------------------------------------
# 🔹 경로 설정
# ----------------------------------------------------
BASE_DIR = "C:/ESG_Project1/file/"
DATA_DIR = "C:/ESG_Project1/data/"
SOLAR_DIR = os.path.join(BASE_DIR, "solar_data_file")
KMA_DIR = os.path.join(BASE_DIR, "KMA_data_file")
OUT_CSV = os.path.join(BASE_DIR, "merge_data", ".csv")

WEATHER_META = os.path.join(KMA_DIR, "META_관측지점정보.csv")
CACHE_JSON = os.path.join(DATA_DIR, "json/mapping_cache.json")
REGION_FIX_JSON = os.path.join(DATA_DIR, "json/region_fix.json")
GEO_JSON = os.path.join(DATA_DIR, "json/namdong_geo.json")

TEST_YEARS = [2025]  # test용 연도 폴더 지정

# ----------------------------------------------------
# 🔹 CSV 유틸
# ----------------------------------------------------
def sniff_delimiter(path):
    with open(path,"rb") as f:
        raw=f.read(2048)
    text=raw.decode("utf-8",errors="ignore")
    return "," if text.count(",") >= text.count("\t") else "\t"

def read_csv_safe(path):
    delim = sniff_delimiter(path)
    try:
        df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
        logger.info(f"📄 '{os.path.basename(path)}' 읽기 성공 (utf-8)")
        return df
    except UnicodeDecodeError:
        df = pd.read_csv(path, encoding="cp949", delimiter=delim, index_col=False)
        logger.info(f"📄 '{os.path.basename(path)}' 읽기 성공 (cp949)")
        return df

def normalize_columns(df):
    df.columns=df.columns.str.strip()
    if "발전구분" not in df.columns:
        expected = ["발전구분","호기","일자"] + [f"{i}시 발전량(MWh)" for i in range(1,25)]
        df=df.iloc[:,:len(expected)]
        df.columns=expected
    df["발전구분"]=df["발전구분"].astype(str).str.strip()
    df["일자"]=pd.to_datetime(df["일자"],errors="coerce")
    return df

def get_hour_cols(df):
    pat = re.compile(r"^\s*(\d{1,2})시")
    return [c for c in df.columns if pat.search(c)]

# ----------------------------------------------------
# 🔹 CSV 파일 수집 (train/test 분리)
# ----------------------------------------------------
def collect_files_by_year():
    train_files, test_files = [], []
    all_csvs = glob(os.path.join(SOLAR_DIR,"20*/*.csv")) + glob(os.path.join(SOLAR_DIR,"20*/*.CSV"))
    for f in all_csvs:
        match = re.search(r"20\d{2}", f)
        year = int(match.group()) if match else None
        if year in TEST_YEARS:
            test_files.append(f)
        else:
            train_files.append(f)
    logger.info(f"🗂 train: {len(train_files)}개, test: {len(test_files)}개 CSV 파일 수집")
    return train_files, test_files

# ----------------------------------------------------
# 🔹 CSV 병합 함수
# ----------------------------------------------------
def merge_files(file_list, dataset_name="train"):
    frames=[]
    for f in tqdm(file_list, desc=f"{dataset_name} CSV 불러오기"):
        try:
            tmp = read_csv_safe(f)
            tmp = normalize_columns(tmp)
            frames.append(tmp)
        except Exception as e:
            logger.warning(f"⚠️ {os.path.basename(f)} 읽기 실패: {e}")
    if frames:
        df = pd.concat(frames, ignore_index=True).drop_duplicates()
        logger.info(f"🧹 {dataset_name} 완전 동일행 중복 제거: {len(df)}행")
        return df
    return pd.DataFrame()

def process_long(df):
    hour_cols = get_hour_cols(df)
    long_df = df.melt(id_vars=["발전구분","호기","일자"],
                      value_vars=hour_cols,
                      var_name="시간대",
                      value_name="발전량(MWh)")
    long_df["시간"]=long_df["시간대"].str.extract(r"(\d{1,2})").astype(int)
    long_df["일시"]=pd.to_datetime(long_df["일자"], errors="coerce") + pd.to_timedelta(long_df["시간"]-1, "h")
    hoqi_sum = long_df.groupby(["발전구분","일자","시간"], as_index=False)["발전량(MWh)"].sum(numeric_only=True)\
        .rename(columns={"발전량(MWh)":"합산발전량(MWh)"})
    hoqi_sum["일시"]=pd.to_datetime(hoqi_sum["일자"], errors="coerce") + pd.to_timedelta(hoqi_sum["시간"]-1,"h")
    hoqi_sum = hoqi_sum[["일시","발전구분","합산발전량(MWh)"]]
    return hoqi_sum

train_files, test_files = collect_files_by_year()
train_df = merge_files(train_files, "train")
test_df = merge_files(test_files, "test")
train_hoqi = process_long(train_df)
test_hoqi = process_long(test_df)

# ----------------------------------------------------
# 🔹 1. 남동발전 발전소 매핑 크롤러
# ----------------------------------------------------
def crawl_mapping():
    URL = "https://www.koenergy.kr/kosep/hw/fr/ov/ovhw25/main.do?menuCd=FN060202"
    logger.info(f"🌐 크롤링 시작: {URL}")
    headers = {"User-Agent": "Mozilla/5.0"}
    res = requests.get(URL, headers=headers, timeout=10)
    res.raise_for_status()
    res.encoding = "utf-8"
    soup = BeautifulSoup(res.text, "html.parser")
    tables = soup.find_all("table", class_="table_list2")
    if len(tables) < 2:
        raise RuntimeError("❌ class='table_list2' 테이블이 2개 이상 존재하지 않습니다.")
    table = tables[1]
    mapping = {}
    for tr in table.find_all("tr"):
        tds = [td.get_text(strip=True) for td in tr.find_all("td")]
        if len(tds) < 2: continue
        사업명, 지역 = tds[0], tds[1]
        if "태양광" not in 사업명: continue
        발전구분 = 사업명.replace("발전소","").replace(" ","").strip()
        mapping[발전구분] = 지역
    os.makedirs(os.path.dirname(CACHE_JSON), exist_ok=True)
    with open(CACHE_JSON,"w",encoding="utf-8") as f: json.dump(mapping,f,ensure_ascii=False,indent=2)
    logger.info(f"✅ {len(mapping)}개 항목 크롤링 완료 → {CACHE_JSON}")
    return mapping

# ----------------------------------------------------
# 🔹 2. 발전소명 → 지역 매핑 + 보정
# ----------------------------------------------------
if os.path.exists(CACHE_JSON):
    with open(CACHE_JSON,"r",encoding="utf-8") as f: mapping=json.load(f)
    logger.info(f"📦 캐시 사용 → {CACHE_JSON}")
else:
    mapping=crawl_mapping()

if not os.path.exists(REGION_FIX_JSON):
    default_region_fix = {"영흥":"인천광역시 옹진군 영흥면","삼천포":"경상남도 고성군","고성":"경상남도 고성군",
                         "예천":"경상북도 예천군","여수":"전라남도 여수시","영동":"강원특별자치도 강릉시",
                         "구미":"경상북도 구미시","장성":"전라남도 장성군","진주":"경상남도 진주시",
                         "광양":"전라남도 광양시","창원":"경상남도 창원시 마산합포구 해운동","고흥":"전라남도 고흥군",
                         "군산":"전라북도 군산시","밀양":"경상남도 밀양시","서산":"충청남도 서산시","영암":"전라남도 영암군",
                         "신안":"전라남도 신안군","강릉":"강원특별자치도 강릉시","전국":"대한민국","진주 외":"경상남도 진주시 외"}
    os.makedirs(os.path.dirname(REGION_FIX_JSON), exist_ok=True)
    with open(REGION_FIX_JSON,"w",encoding="utf-8") as f: json.dump(default_region_fix,f,ensure_ascii=False,indent=2)
    logger.info(f"🆕 지역 보정 테이블 생성 → {REGION_FIX_JSON}")

with open(REGION_FIX_JSON,"r",encoding="utf-8") as f: region_fix=json.load(f)

def normalize_name(name):
    name=re.sub(r"\s+","",str(name))
    name=name.replace("발전소","").replace("태양광","").replace("-","").replace("_","").strip()
    return name

normalized_mapping={normalize_name(k):v for k,v in mapping.items()}

def map_region(df):
    df["발전구분_정규화"]=df["발전구분"].apply(normalize_name)
    keys=list(normalized_mapping.keys())
    match_cache={}
    def fast_match(name):
        if name in match_cache: return match_cache[name]
        for key in keys:
            if key in name or name in key:
                result=normalized_mapping[key]
                if result in region_fix: result=region_fix[result]
                match_cache[name]=result
                return result
        match=process.extractOne(name, keys, scorer=fuzz.partial_ratio)
        if match and match[1]>=75:
            result=normalized_mapping[match[0]]
            if result in region_fix: result=region_fix[result]
            match_cache[name]=result
            return result
        match_cache[name]=None
        return None
    mapping_result={n:fast_match(n) for n in tqdm(df["발전구분_정규화"].unique(), desc="지역 매핑")}
    df["지역"]=df["발전구분_정규화"].map(mapping_result)
    return df

train_hoqi = map_region(train_hoqi)
test_hoqi = map_region(test_hoqi)

# ----------------------------------------------------
# 🔹 3. 카카오 API → 좌표 + 최근접 기상관측소
# ----------------------------------------------------
KAKAO_API_KEY = "93c089f75a2730af2f15c01838e892d3"
KAKAO_URL = "https://dapi.kakao.com/v2/local/search/address.json"

def get_latlng(address):
    headers = {"Authorization": f"KakaoAK {KAKAO_API_KEY}"}
    params = {"query": address}
    try:
        res = requests.get(KAKAO_URL, headers=headers, params=params, timeout=5)
        res.raise_for_status()
        docs = res.json().get("documents")
        if docs: 
            return float(docs[0]["y"]), float(docs[0]["x"])
        return None, None
    except Exception as e:
        logger.warning(f"⚠️ 주소 '{address}' 조회 실패: {e}")
        return None, None

# 기상 관측소 정보
weather_df = read_csv_safe(WEATHER_META)
weather_df = weather_df.rename(columns=str.strip)[["지점","지점명","위도","경도"]].dropna(subset=["위도","경도"])

def haversine(lat1, lon1, lat2, lon2):
    R = 6371
    dlat = math.radians(lat2 - lat1)
    dlon = math.radians(lon2 - lon1)
    a = math.sin(dlat/2)**2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dlon/2)**2
    return 2*R*math.asin(math.sqrt(a))

def find_nearest_station(lat, lon, meta_df):
    meta_df["거리(km)"] = meta_df.apply(lambda r: haversine(lat, lon, r["위도"], r["경도"]), axis=1)
    nearest = meta_df.loc[meta_df["거리(km)"].idxmin()]
    return {"지점명": nearest["지점명"], "지점번호": int(nearest["지점"]),
            "위도": nearest["위도"], "경도": nearest["경도"], "거리(km)": round(nearest["거리(km)"],2)}

# GEO_JSON 캐시 사용
if not os.path.exists(GEO_JSON):
    logger.info("🌍 좌표+최근접 관측소 매핑 시작")
    geo_cache = {}
    for region in tqdm(sorted(set(v for v in pd.concat([train_hoqi["지역"], test_hoqi["지역"]]) if v)), desc="좌표 조회"):
        lat, lng = get_latlng(region)
        if lat is None or lng is None:
            geo_cache[region] = {"위도": None, "경도": None, "최근접관측소": None}
            continue
        nearest = find_nearest_station(lat, lng, weather_df)
        geo_cache[region] = {"위도": lat, "경도": lng, "최근접관측소": nearest}
        time.sleep(0.2)
    os.makedirs(os.path.dirname(GEO_JSON), exist_ok=True)
    with open(GEO_JSON, "w", encoding="utf-8") as f:
        json.dump(geo_cache, f, ensure_ascii=False, indent=2)
else:
    with open(GEO_JSON, "r", encoding="utf-8") as f:
        geo_cache = json.load(f)
    logger.info(f"📦 좌표 캐시 사용 → {GEO_JSON}")

def get_station_num(region):
    info = geo_cache.get(region)
    if info and info.get("최근접관측소"):
        return info["최근접관측소"].get("지점번호")
    return None

train_hoqi["지점번호"] = train_hoqi["지역"].apply(get_station_num)
test_hoqi["지점번호"] = test_hoqi["지역"].apply(get_station_num)

# ----------------------------------------------------
# 🔹 4. KMA 기상 데이터 병합
# ----------------------------------------------------
weather_files = sorted(glob(os.path.join(KMA_DIR, "OBS_ASOS_TIM_*.csv")))
weather_frames = []
for wf in tqdm(weather_files, desc="기상데이터 병합"):
    try:
        tmp = read_csv_safe(wf)
        tmp["일시"] = pd.to_datetime(tmp["일시"], errors="coerce")
        tmp = tmp[["지점","일시","기온(°C)","강수량(mm)","일조(hr)","일사(MJ/m2)"]].rename(columns={"지점":"지점번호"})
        weather_frames.append(tmp)
    except Exception as e:
        logger.warning(f"⚠️ 기상파일 '{os.path.basename(wf)}' 처리 실패: {e}")
df_weather = pd.concat(weather_frames, ignore_index=True).drop_duplicates(subset=["지점번호","일시"])
logger.info(f"📊 기상 데이터 총 {len(df_weather)}행 병합 완료")

def merge_weather(df):
    final_df = pd.merge(df, df_weather, how="left", on=["지점번호","일시"])
    final_df[["합산발전량(MWh)","기온(°C)","강수량(mm)","일조(hr)","일사(MJ/m2)"]] = \
        final_df[["합산발전량(MWh)","기온(°C)","강수량(mm)","일조(hr)","일사(MJ/m2)"]].fillna(0)
    final_df = final_df[["일시","발전구분","지역","지점번호","합산발전량(MWh)","기온(°C)","강수량(mm)","일조(hr)","일사(MJ/m2)"]]
    final_df = final_df.sort_values(["지역","일시"]).reset_index(drop=True)
    return final_df

train_final = merge_weather(train_hoqi)
test_final = merge_weather(test_hoqi)

# ----------------------------------------------------
# 🔹 5. 최종 CSV 저장
# ----------------------------------------------------
os.makedirs(os.path.dirname(OUT_CSV), exist_ok=True)

train_out = OUT_CSV.replace("\.csv","/train_data.csv")
test_out = OUT_CSV.replace("\.csv","/test_data.csv")

train_final.to_csv(train_out, index=False, encoding="utf-8-sig")
logger.info(f"✅ train_data.csv 저장 → {train_out}")

test_final.to_csv(test_out, index=False, encoding="utf-8-sig")
logger.info(f"✅ test_data.csv 저장 → {test_out}")


[2025-10-22 13:23:49,053]✅ INFO - 🗂 train: 72개, test: 18개 CSV 파일 수집


train CSV 불러오기:   0%|          | 0/72 [00:00<?, ?it/s]

[2025-10-22 13:23:49,085]✅ INFO - 📄 '남동발전량_2022_01.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,111]✅ INFO - 📄 '남동발전량_2022_02.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,212]✅ INFO - 📄 '남동발전량_2022_03.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기:   4%|▍         | 3/72 [00:00<00:03, 17.74it/s]

[2025-10-22 13:23:49,304]✅ INFO - 📄 '남동발전량_2022_04.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,345]✅ INFO - 📄 '남동발전량_2022_05.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기:   7%|▋         | 5/72 [00:00<00:04, 16.38it/s]

[2025-10-22 13:23:49,399]✅ INFO - 📄 '남동발전량_2022_06.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,426]✅ INFO - 📄 '남동발전량_2022_07.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,462]✅ INFO - 📄 '남동발전량_2022_08.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기:  11%|█         | 8/72 [00:00<00:03, 20.83it/s]

[2025-10-22 13:23:49,489]✅ INFO - 📄 '남동발전량_2022_09.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,511]✅ INFO - 📄 '남동발전량_2022_10.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,531]✅ INFO - 📄 '남동발전량_2022_11.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:49,559]✅ INFO - 📄 '남동발전량_2022_12.csv' 읽기 성공 (utf-8)


train CSV 불러오기:  17%|█▋        | 12/72 [00:00<00:02, 27.20it/s]

[2025-10-22 13:23:49,587]✅ INFO - 📄 '남동발전량_2023_01.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,611]✅ INFO - 📄 '남동발전량_2023_02.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,632]✅ INFO - 📄 '남동발전량_2023_03.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,653]✅ INFO - 📄 '남동발전량_2023_04.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:49,682]✅ INFO - 📄 '남동발전량_2023_05.csv' 읽기 성공 (utf-8)


train CSV 불러오기:  24%|██▎       | 17/72 [00:00<00:01, 32.21it/s]

[2025-10-22 13:23:49,719]✅ INFO - 📄 '남동발전량_2023_06.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:49,757]✅ INFO - 📄 '남동발전량_2023_07.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,788]✅ INFO - 📄 '남동발전량_2023_08.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,822]✅ INFO - 📄 '남동발전량_2023_09.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:49,859]✅ INFO - 📄 '남동발전량_2023_10.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:49,932]✅ INFO - 📄 '남동발전량_2023_11.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:49,968]✅ INFO - 📄 '남동발전량_2023_12.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,004]✅ INFO - 📄 '남동발전량_2024_01.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기:  35%|███▍      | 25/72 [00:00<00:01, 27.08it/s]

[2025-10-22 13:23:50,038]✅ INFO - 📄 '남동발전량_2024_02.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:50,079]✅ INFO - 📄 '남동발전량_2024_03.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,106]✅ INFO - 📄 '남동발전량_2024_04.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기:  39%|███▉      | 28/72 [00:01<00:01, 27.74it/s]

[2025-10-22 13:23:50,137]✅ INFO - 📄 '남동발전량_2024_05.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,169]✅ INFO - 📄 '남동발전량_2024_06.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:50,205]✅ INFO - 📄 '남동발전량_2024_07.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:50,271]✅ INFO - 📄 '남동발전량_2024_08.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,339]✅ INFO - 📄 '남동발전량_2024_09.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,377]✅ INFO - 📄 '남동발전량_2024_10.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:50,431]✅ INFO - 📄 '남동발전량_2024_11.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,483]✅ INFO - 📄 '남동발전량_2024_12.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,528]✅ INFO - 📄 '남동발전량_2022_01.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기:  51%|█████▏    | 37/72 [00:01<00:01, 22.69it/s]

[2025-10-22 13:23:50,568]✅ INFO - 📄 '남동발전량_2022_02.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:50,602]✅ INFO - 📄 '남동발전량_2022_03.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,641]✅ INFO - 📄 '남동발전량_2022_04.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기:  56%|█████▌    | 40/72 [00:01<00:01, 23.68it/s]

[2025-10-22 13:23:50,680]✅ INFO - 📄 '남동발전량_2022_05.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,711]✅ INFO - 📄 '남동발전량_2022_06.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:50,745]✅ INFO - 📄 '남동발전량_2022_07.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:50,777]✅ INFO - 📄 '남동발전량_2022_08.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기:  61%|██████    | 44/72 [00:01<00:01, 25.84it/s]

[2025-10-22 13:23:50,807]✅ INFO - 📄 '남동발전량_2022_09.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,835]✅ INFO - 📄 '남동발전량_2022_10.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:50,859]✅ INFO - 📄 '남동발전량_2022_11.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,883]✅ INFO - 📄 '남동발전량_2022_12.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기:  67%|██████▋   | 48/72 [00:01<00:00, 28.89it/s]

[2025-10-22 13:23:50,909]✅ INFO - 📄 '남동발전량_2023_01.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:50,930]✅ INFO - 📄 '남동발전량_2023_02.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:50,950]✅ INFO - 📄 '남동발전량_2023_03.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:50,973]✅ INFO - 📄 '남동발전량_2023_04.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:50,996]✅ INFO - 📄 '남동발전량_2023_05.csv' 읽기 성공 (utf-8)


train CSV 불러오기:  74%|███████▎  | 53/72 [00:01<00:00, 33.43it/s]

[2025-10-22 13:23:51,019]✅ INFO - 📄 '남동발전량_2023_06.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,043]✅ INFO - 📄 '남동발전량_2023_07.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:51,068]✅ INFO - 📄 '남동발전량_2023_08.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:51,097]✅ INFO - 📄 '남동발전량_2023_09.csv' 읽기 성공 (utf-8)


train CSV 불러오기:  79%|███████▉  | 57/72 [00:02<00:00, 35.12it/s]

[2025-10-22 13:23:51,122]✅ INFO - 📄 '남동발전량_2023_10.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,146]✅ INFO - 📄 '남동발전량_2023_11.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:51,167]✅ INFO - 📄 '남동발전량_2023_12.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,188]✅ INFO - 📄 '남동발전량_2024_01.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:51,209]✅ INFO - 📄 '남동발전량_2024_02.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기:  86%|████████▌ | 62/72 [00:02<00:00, 37.79it/s]

[2025-10-22 13:23:51,235]✅ INFO - 📄 '남동발전량_2024_03.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,258]✅ INFO - 📄 '남동발전량_2024_04.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:51,283]✅ INFO - 📄 '남동발전량_2024_05.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,306]✅ INFO - 📄 '남동발전량_2024_06.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:51,328]✅ INFO - 📄 '남동발전량_2024_07.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기:  93%|█████████▎| 67/72 [00:02<00:00, 39.17it/s]

[2025-10-22 13:23:51,348]✅ INFO - 📄 '남동발전량_2024_08.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:51,372]✅ INFO - 📄 '남동발전량_2024_09.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,392]✅ INFO - 📄 '남동발전량_2024_10.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:51,414]✅ INFO - 📄 '남동발전량_2024_11.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,436]✅ INFO - 📄 '남동발전량_2024_12.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
train CSV 불러오기: 100%|██████████| 72/72 [00:02<00:00, 30.17it/s]


[2025-10-22 13:23:51,619]✅ INFO - 🧹 train 완전 동일행 중복 제거: 24450행


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:51,644]✅ INFO - 📄 '남동발전량_2025_01.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,664]✅ INFO - 📄 '남동발전량_2025_02.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,684]✅ INFO - 📄 '남동발전량_2025_03.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,706]✅ INFO - 📄 '남동발전량_2025_04.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,726]✅ INFO - 📄 '남동발전량_2025_05.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
test CSV 불러오기:  28%|██▊       | 5/18 [00:00<00:00, 47.85it/s]

[2025-10-22 13:23:51,749]✅ INFO - 📄 '남동발전량_2025_06.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,774]✅ INFO - 📄 '남동발전량_2025_07.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,795]✅ INFO - 📄 '남동발전량_2025_08.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,813]✅ INFO - 📄 '남동발전량_2025_09.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:51,837]✅ INFO - 📄 '남동발전량_2025_01.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
test CSV 불러오기:  56%|█████▌    | 10/18 [00:00<00:00, 45.91it/s]

[2025-10-22 13:23:51,860]✅ INFO - 📄 '남동발전량_2025_02.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,881]✅ INFO - 📄 '남동발전량_2025_03.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,905]✅ INFO - 📄 '남동발전량_2025_04.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:51,927]✅ INFO - 📄 '남동발전량_2025_05.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:51,948]✅ INFO - 📄 '남동발전량_2025_06.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
test CSV 불러오기:  83%|████████▎ | 15/18 [00:00<00:00, 45.49it/s]

[2025-10-22 13:23:51,975]✅ INFO - 📄 '남동발전량_2025_07.csv' 읽기 성공 (utf-8)
[2025-10-22 13:23:52,000]✅ INFO - 📄 '남동발전량_2025_08.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)


[2025-10-22 13:23:52,024]✅ INFO - 📄 '남동발전량_2025_09.csv' 읽기 성공 (utf-8)


  df = pd.read_csv(path, encoding="utf-8", delimiter=delim, index_col=False)
test CSV 불러오기: 100%|██████████| 18/18 [00:00<00:00, 44.49it/s]


[2025-10-22 13:23:52,095]✅ INFO - 🧹 test 완전 동일행 중복 제거: 6279행
[2025-10-22 13:23:55,119]✅ INFO - 📦 캐시 사용 → C:/ESG_Project1/data/json/mapping_cache.json


지역 매핑: 100%|██████████| 15/15 [00:00<00:00, 29440.60it/s]
지역 매핑: 100%|██████████| 15/15 [00:00<?, ?it/s]

[2025-10-22 13:23:56,800]✅ INFO - 📄 'META_관측지점정보.csv' 읽기 성공 (cp949)
[2025-10-22 13:23:56,809]✅ INFO - 📦 좌표 캐시 사용 → C:/ESG_Project1/data/json/namdong_geo.json



기상데이터 병합:   0%|          | 0/4 [00:00<?, ?it/s]

[2025-10-22 13:23:58,133]✅ INFO - 📄 'OBS_ASOS_TIM_2022.csv' 읽기 성공 (cp949)


기상데이터 병합:  25%|██▌       | 1/4 [00:01<00:05,  1.98s/it]

[2025-10-22 13:24:00,507]✅ INFO - 📄 'OBS_ASOS_TIM_2023.csv' 읽기 성공 (cp949)


기상데이터 병합:  50%|█████     | 2/4 [00:04<00:04,  2.19s/it]

[2025-10-22 13:24:02,949]✅ INFO - 📄 'OBS_ASOS_TIM_2024.csv' 읽기 성공 (cp949)


기상데이터 병합:  75%|███████▌  | 3/4 [00:06<00:02,  2.25s/it]

[2025-10-22 13:24:04,774]✅ INFO - 📄 'OBS_ASOS_TIM_2025.csv' 읽기 성공 (cp949)


기상데이터 병합: 100%|██████████| 4/4 [00:08<00:00,  2.05s/it]


[2025-10-22 13:24:06,139]✅ INFO - 📊 기상 데이터 총 3163359행 병합 완료
[2025-10-22 13:24:13,430]✅ INFO - ✅ train_data.csv 저장 → C:/ESG_Project1/file/merge_data/train_data.csv
[2025-10-22 13:24:14,805]✅ INFO - ✅ test_data.csv 저장 → C:/ESG_Project1/file/merge_data/test_data.csv
