In [76]:
import pandas as pd
from sqlalchemy import create_engine
import json

# 접속 정보 로딩 (옵션: db-config.json 파일이 있을 경우)
with open('db-config.json') as f:
    config = json.load(f)

user = config['user']
password = config['password']
host = config['host']
port = config['port']
database = config['database']

# SQLAlchemy 엔진 생성
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}?charset=utf8mb4")

# air_quality 테이블 데이터 읽어오기
query = "SELECT * FROM air_quality"
air_quality = pd.read_sql(query, con=engine)

# ▶ 결과 확인
air_quality.head()

Unnamed: 0,id,datetime,region,pm10,pm25
0,1,2015-12-31 23:00:00,평균,93.0,62.0
1,2,2015-12-31 23:00:00,강남구,107.0,70.0
2,3,2015-12-31 23:00:00,강동구,91.0,51.0
3,4,2015-12-31 23:00:00,강북구,92.0,64.0
4,5,2015-12-31 23:00:00,강서구,96.0,51.0


In [77]:
air_quality.isnull().sum()

id              0
datetime        0
region          0
pm10        73631
pm25        70223
dtype: int64

In [78]:
#월별 & 지역별 평균으로 결측치 대체

air_quality['datetime'] = pd.to_datetime(air_quality['datetime'])

air_quality['year_month'] = air_quality['datetime'].dt.to_period('M')

cols_to_fill = ['pm10', 'pm25']

for col in cols_to_fill:
    air_quality[col] = air_quality.groupby(['year_month', 'region'])[col].transform(
        lambda x: x.fillna(x.mean())
    )

In [79]:
air_quality.isnull().sum()

id              0
datetime        0
region          0
pm10          720
pm25          720
year_month      0
dtype: int64

In [80]:
# 연월 평균으로 결측치 보완

for col in ['pm10', 'pm25']:
    air_quality[col] = air_quality.groupby('year_month')[col].transform(
        lambda x: x.fillna(x.mean())
    )

In [81]:
air_quality.isnull().sum()

id            0
datetime      0
region        0
pm10          0
pm25          0
year_month    0
dtype: int64

In [82]:
air_quality.drop(columns='id', inplace=True)

In [83]:
air_quality.drop(columns='year_month', inplace=True)

In [84]:
air_quality['pm10'] = air_quality['pm10'].astype(int)
air_quality['pm25'] = air_quality['pm25'].astype(int)

In [85]:
# 평균 행 제거
air_quality = air_quality[air_quality['region'] != '평균'].reset_index(drop=True)

In [86]:
air_quality.head()

Unnamed: 0,datetime,region,pm10,pm25
0,2015-12-31 23:00:00,강남구,107,70
1,2015-12-31 23:00:00,강동구,91,51
2,2015-12-31 23:00:00,강북구,92,64
3,2015-12-31 23:00:00,강서구,96,51
4,2015-12-31 23:00:00,관악구,98,68


In [87]:
def get_missing_datetimes_by_region(data, datetime_col='datetime', region_col='region', freq='h'):
    data = data.copy()
    data[datetime_col] = pd.to_datetime(data[datetime_col])

    # 전체 시간 범위 기준
    full_range = pd.date_range(start=data[datetime_col].min(), end=data[datetime_col].max(), freq=freq)

    all_missing = []

    for region in data[region_col].unique():
        region_df = data[data[region_col] == region]
        region_times = pd.to_datetime(region_df[datetime_col].drop_duplicates())
        missing_times = full_range.difference(region_times)

        for ts in missing_times:
            all_missing.append({'region': region, 'datetime': ts})

    missing_df = pd.DataFrame(all_missing)
    print(f"📌 총 누락된 시간 수: {len(missing_df):,}개")
    return missing_df

In [88]:
missing_df = get_missing_datetimes_by_region(air_quality)
print(missing_df)

📌 총 누락된 시간 수: 4,448개
     region            datetime
0       강남구 2012-01-16 12:00:00
1       강남구 2012-01-16 13:00:00
2       강남구 2012-01-16 14:00:00
3       강남구 2012-01-16 15:00:00
4       강남구 2012-01-16 16:00:00
...     ...                 ...
4443    중랑구 2015-11-16 13:00:00
4444    중랑구 2015-11-16 14:00:00
4445    중랑구 2015-11-16 15:00:00
4446    중랑구 2015-11-16 16:00:00
4447    중랑구 2015-12-18 20:00:00

[4448 rows x 2 columns]


In [89]:
# 원본 데이터에서 '월' 컬럼 추가
air_quality['datetime'] = pd.to_datetime(air_quality['datetime'])
air_quality['month'] = air_quality['datetime'].dt.to_period('M')

# 지역별-월별 평균 계산
monthly_avg = air_quality.groupby(['region', 'month'])[['pm10', 'pm25']].mean().reset_index()

In [90]:
# 누락 시간에 'month' 정보 붙이기
missing_df['month'] = missing_df['datetime'].dt.to_period('M')

# 월별 평균과 병합 (pm10, pm25 채워넣기)
missing_filled = pd.merge(
    missing_df,
    monthly_avg,
    how='left',
    on=['region', 'month']
)

In [91]:
# datetime 정렬
air_quality['datetime'] = pd.to_datetime(air_quality['datetime'])
missing_filled['datetime'] = pd.to_datetime(missing_filled['datetime'])

# 누락된 row에는 'month', 'pm10', 'pm25'가 채워져 있음
data_filled = pd.concat([
    air_quality[['datetime', 'region', 'pm10', 'pm25']],
    missing_filled[['datetime', 'region', 'pm10', 'pm25']]
], ignore_index=True)

# 정렬
data_filled = data_filled.sort_values(by=['region', 'datetime']).reset_index(drop=True)

In [92]:
missing_df2 = get_missing_datetimes_by_region(data_filled)
print(missing_df2)

📌 총 누락된 시간 수: 0개
Empty DataFrame
Columns: []
Index: []


In [93]:
air_quality = data_filled.copy()

In [94]:
# weather 테이블 데이터 읽어오기
query2 = "SELECT * FROM weather"
weather = pd.read_sql(query2, con=engine)

# ▶ 결과 확인
weather.head()

Unnamed: 0,id,station_id,station_name,datetime,temperature,wind_direction,wind_speed,precipitation,humidity
0,1,400,강남,2015-01-01 01:00:00,-6.3,314.7,2.7,0.0,30.0
1,2,400,강남,2015-01-01 02:00:00,-6.9,295.6,3.0,0.0,33.0
2,3,400,강남,2015-01-01 03:00:00,-7.3,334.4,2.7,0.0,34.0
3,4,400,강남,2015-01-01 04:00:00,-7.7,320.1,2.9,0.0,33.0
4,5,400,강남,2015-01-01 05:00:00,-7.9,294.6,2.3,0.0,33.0


In [95]:
weather.drop(['station_id'], axis=1, inplace=True)
weather.rename(columns={"station_name": "region"}, inplace=True)

In [96]:
weather.isnull().sum()

id                     0
region                 0
datetime               0
temperature         5696
wind_direction      6460
wind_speed          5936
precipitation      17412
humidity          633811
dtype: int64

In [97]:
#월별 & 지역별 평균으로 결측치 대체

weather['datetime'] = pd.to_datetime(weather['datetime'])

weather['year_month'] = weather['datetime'].dt.to_period('M')

cols_to_fill = ['temperature', 'wind_direction', 'wind_speed', 'precipitation', 'humidity']

for col in cols_to_fill:
    weather[col] = weather.groupby(['year_month', 'region'])[col].transform(
        lambda x: x.fillna(round(x.mean(), 1))
    )

In [98]:
weather.isnull().sum()

id                     0
region                 0
datetime               0
temperature           15
wind_direction         0
wind_speed             0
precipitation          0
humidity          561876
year_month             0
dtype: int64

In [99]:
# 연월 평균으로 결측치 보완

for col in ['temperature', 'wind_direction', 'wind_speed', 'precipitation', 'humidity']:
    weather[col] = weather.groupby('year_month')[col].transform(
        lambda x: x.fillna(round(x.mean(), 1))
    )

In [100]:
weather.isnull().sum()

id                0
region            0
datetime          0
temperature       0
wind_direction    0
wind_speed        0
precipitation     0
humidity          0
year_month        0
dtype: int64

In [101]:
weather.drop(columns=['id', 'year_month'], axis=1, inplace=True)

In [102]:
station_to_gu = {
    '강남': '강남구',
    '서초': '서초구',
    '강동': '강동구',
    '송파': '송파구',
    '강서': '강서구',
    '양천': '양천구',
    '도봉': '도봉구',
    '노원': '노원구',
    '동대문': '동대문구',
    '중랑': '중랑구',
    '기상청': '동작구',
    '마포': '마포구',
    '서대문': '서대문구',
    '광진': '광진구',
    '성북': '성북구',
    '용산': '용산구',
    '은평': '은평구',
    '금천': '금천구',
    '한강': '영등포구',
    '중구': '중구',
    '성동': '성동구',
    '북악산': '종로구',
    '구로': '구로구',
    '강북*': '강북구',
    '남현': '관악구',
    '관악': '관악구',
    '영등포': '영등포구',
    '현충원': '동작구'
}

In [103]:
weather['region'] = weather['region'].map(station_to_gu)

In [104]:
weather.groupby(['datetime', 'region']).ngroups

2152241

In [105]:
weather_cleaned = weather.groupby(['datetime', 'region'], as_index=False).max().round(1)

In [106]:
weather_cleaned.head()

Unnamed: 0,datetime,region,temperature,wind_direction,wind_speed,precipitation,humidity
0,2015-01-01 01:00:00,강남구,-6.3,314.7,2.7,0.0,30.0
1,2015-01-01 01:00:00,강동구,-6.5,58.4,1.0,0.0,32.0
2,2015-01-01 01:00:00,강북구,-6.8,308.6,5.2,0.0,36.0
3,2015-01-01 01:00:00,강서구,-6.8,270.2,1.8,0.0,48.8
4,2015-01-01 01:00:00,관악구,-6.7,346.8,3.8,0.0,38.0


In [107]:
missing_df3 = get_missing_datetimes_by_region(weather_cleaned)
print(missing_df3)

📌 총 누락된 시간 수: 129,559개
       region            datetime
0         강남구 2015-08-14 17:00:00
1         강남구 2015-08-14 18:00:00
2         강남구 2015-08-14 19:00:00
3         강남구 2015-08-14 20:00:00
4         강남구 2015-08-14 21:00:00
...       ...                 ...
129554    중랑구 2024-12-31 20:00:00
129555    중랑구 2024-12-31 21:00:00
129556    중랑구 2024-12-31 22:00:00
129557    중랑구 2024-12-31 23:00:00
129558    중랑구 2025-01-01 00:00:00

[129559 rows x 2 columns]


In [108]:
# 2. 전체 시간 생성
full_time = pd.date_range(weather_cleaned['datetime'].min(), weather_cleaned['datetime'].max(), freq='h')
full_index = pd.MultiIndex.from_product(
    [weather_cleaned['region'].unique(), full_time],
    names=['region', 'datetime']
)

# 3. 기존 데이터 재배치 (누락 시간 포함)
weather_full = weather_cleaned.set_index(['region', 'datetime']).reindex(full_index).reset_index()

# 4. 월 평균 계산
weather_full['month'] = weather_full['datetime'].dt.to_period('M')
monthly_avg = weather_cleaned.copy()
monthly_avg['month'] = monthly_avg['datetime'].dt.to_period('M')

weather_cols = ['temperature', 'wind_direction', 'wind_speed', 'precipitation', 'humidity']

monthly_avg = monthly_avg.groupby(['region', 'month'])[weather_cols].mean().reset_index()

# 5. 평균값 병합 및 보간
weather_full = weather_full.merge(monthly_avg, on=['region', 'month'], how='left', suffixes=('', '_mean'))

for col in weather_cols:
    weather_full[col] = weather_full[col].fillna(weather_full[f"{col}_mean"])

# 6. 정리
weather_filled = weather_full.drop(columns=['month'] + [f"{col}_mean" for col in weather_cols])

In [109]:
missing_df4 = get_missing_datetimes_by_region(weather_filled)
print(missing_df4)

📌 총 누락된 시간 수: 0개
Empty DataFrame
Columns: []
Index: []


In [110]:
weather_cleaned = weather_filled.copy()

In [111]:
# china_yellow_dust 테이블 데이터 읽어오기
query3 = "SELECT * FROM china_yellow_dust"
yellow_dust = pd.read_sql(query3, con=engine)

# ▶ 결과 확인
yellow_dust.head()

Unnamed: 0,id,datetime,aod_avg,aod_max
0,1,2015-01-01,207.214,4000.0
1,2,2015-01-02,205.827,4000.0
2,3,2015-01-03,248.65,3727.0
3,4,2015-01-04,330.154,4000.0
4,5,2015-01-05,237.306,4000.0


In [112]:
yellow_dust.isnull().sum()

id          0
datetime    0
aod_avg     2
aod_max     1
dtype: int64

In [113]:
#월별 & 지역별 평균으로 결측치 대체

yellow_dust['datetime'] = pd.to_datetime(yellow_dust['datetime'])

yellow_dust['year_month'] = yellow_dust['datetime'].dt.to_period('M')

cols_to_fill = ['aod_avg', 'aod_max']

for col in cols_to_fill:
    yellow_dust[col] = yellow_dust.groupby(['year_month'])[col].transform(
        lambda x: x.fillna((x.mean()))
    )

In [114]:
yellow_dust.isnull().sum()

id            0
datetime      0
aod_avg       0
aod_max       0
year_month    0
dtype: int64

In [115]:
yellow_dust.drop(columns='id', inplace=True)
yellow_dust.drop(columns='year_month', inplace=True)

In [116]:
yellow_dust.head()

Unnamed: 0,datetime,aod_avg,aod_max
0,2015-01-01,207.214,4000.0
1,2015-01-02,205.827,4000.0
2,2015-01-03,248.65,3727.0
3,2015-01-04,330.154,4000.0
4,2015-01-05,237.306,4000.0


In [117]:
hours = pd.date_range("00:00", "23:00", freq="h").time

In [118]:
# 1. 날짜만 분리
yellow_dust['date'] = pd.to_datetime(yellow_dust['datetime']).dt.date

# 2. 00시~23시까지 시간 리스트 생성
hours = pd.date_range("00:00", "23:00", freq="h").time

# 3. 시간 데이터프레임
hour_df = pd.DataFrame({'time': hours})

# 4. 날짜 × 시간 조합 (카르테시안 곱)
expanded = yellow_dust.merge(hour_df, how='cross')

# 5. datetime 컬럼 재생성
expanded['datetime'] = pd.to_datetime(expanded['date'].astype(str) + ' ' + expanded['time'].astype(str))

# 6. 최종 컬럼 정리
expanded = expanded[['datetime', 'aod_avg', 'aod_max']].sort_values('datetime').reset_index(drop=True)

In [119]:
expanded
dust = expanded
dust.head()

Unnamed: 0,datetime,aod_avg,aod_max
0,2015-01-01 00:00:00,207.214,4000.0
1,2015-01-01 01:00:00,207.214,4000.0
2,2015-01-01 02:00:00,207.214,4000.0
3,2015-01-01 03:00:00,207.214,4000.0
4,2015-01-01 04:00:00,207.214,4000.0


In [217]:
merged = pd.merge(air_quality, weather_cleaned, on=['datetime', 'region'], how='inner')
merged = pd.merge(merged, dust, on='datetime', how='left')

In [222]:
merged

Unnamed: 0,datetime,region,pm10,pm25,temperature,wind_direction,wind_speed,precipitation,humidity,aod_avg,aod_max
0,2017-01-01 00:00:00,강남구,65,48,-0.4,195.2,1.8,0.0,54.1,343.393,4875.000
1,2017-01-01 01:00:00,강남구,63,48,1.4,69.1,1.3,0.0,78.0,343.393,4875.000
2,2017-01-01 02:00:00,강남구,61,44,1.2,66.2,1.6,0.0,80.0,343.393,4875.000
3,2017-01-01 03:00:00,강남구,71,44,0.5,66.0,1.5,0.0,84.0,343.393,4875.000
4,2017-01-01 04:00:00,강남구,79,44,0.6,63.4,0.5,0.0,86.0,343.393,4875.000
...,...,...,...,...,...,...,...,...,...,...,...
1533020,2023-12-30 20:00:00,중랑구,28,24,1.9,336.0,1.3,0.0,90.0,332.715,4023.000
1533021,2023-12-30 21:00:00,중랑구,29,25,2.2,24.2,2.2,0.0,89.0,332.715,4023.000
1533022,2023-12-30 22:00:00,중랑구,29,23,2.2,24.5,1.1,0.5,91.0,332.715,4023.000
1533023,2023-12-30 23:00:00,중랑구,32,23,2.3,350.8,1.7,0.0,93.0,332.715,4023.000


In [221]:
merged['pm10'] = merged['pm10'].astype(int)
merged['pm25'] = merged['pm25'].astype(int)

merged['temperature'] = merged['temperature'].round(1)
merged['wind_direction'] = merged['wind_direction'].round(1)
merged['wind_speed'] = merged['wind_speed'].round(1)
merged['precipitation'] = merged['precipitation'].round(1)
merged['humidity'] = merged['humidity'].round(1)

merged['aod_avg'] = merged['aod_avg'].round(3)
merged['aod_max'] = merged['aod_max'].round(3)

In [223]:
merged.to_csv('미세먼지용_데이터셋_전기차제외.csv', index=False, encoding='utf-8')

In [224]:
# vehicle_reg_stats 테이블 데이터 읽어오기
query4 = "SELECT * FROM vehicle_reg_stats"
vehicle = pd.read_sql(query4, con=engine)

# ▶ 결과 확인
vehicle.head()

Unnamed: 0,id,datetime,region,fuel_type,use_type,reg_count
0,1,2017-01-01,강원,경유,사업용,12362.0
1,2,2017-01-01,광주,전기,비사업용,240.0
2,3,2017-01-01,인천,전기,사업용,6.0
3,4,2017-01-01,인천,전기,비사업용,210.0
4,5,2017-01-01,대구,전기,사업용,53.0


In [225]:
vehicle['datetime'] = pd.to_datetime(vehicle['datetime'], errors='coerce')

In [226]:
# 1. 서울만 필터링 (2018년까지)
vehicle_pre_2019 = vehicle[(vehicle['datetime'] <= '2018-12-31') & (vehicle['region'] == '서울')].copy()

In [227]:
# 2. 2019년 1월 데이터 중 구별 연료 비율 구하기
vehicle_over_2019 = vehicle[vehicle['datetime'] >= '2019-01-01'].copy()
base = vehicle_over_2019[vehicle_over_2019['datetime'] == '2019-01-01']
base_ratio = base.groupby(['region', 'fuel_type'])['reg_count'].sum()
base_ratio = base_ratio / base_ratio.groupby('fuel_type').transform('sum')
base_ratio = base_ratio.reset_index().rename(columns={'reg_count': 'ratio'})

In [228]:
# 3. 서울 데이터를 각 구별로 비율 분배
vehicle_expanded = pd.merge(vehicle_pre_2019, base_ratio, on='fuel_type', how='left')
vehicle_expanded['reg_count'] = vehicle_expanded['reg_count'] * vehicle_expanded['ratio']

In [230]:
vehicle_expanded = vehicle_expanded.drop(columns=['region_x'])  # 원래 '서울' 제거
vehicle_expanded['region'] = vehicle_expanded['region_y']     # 새로운 구 적용
vehicle_expanded = vehicle_expanded[['datetime', 'region', 'fuel_type', 'reg_count', 'use_type']]

In [233]:
vehicle = pd.concat([vehicle_expanded, vehicle_over_2019], ignore_index=True)

In [235]:
vehicle.head()

Unnamed: 0,datetime,region,fuel_type,reg_count,use_type
0,2017-01-01,강남구,전기,314.162992,사업용
1,2017-01-01,강동구,전기,13.351374,사업용
2,2017-01-01,강북구,전기,6.343747,사업용
3,2017-01-01,강서구,전기,20.211472,사업용
4,2017-01-01,관악구,전기,10.843381,사업용


In [237]:
from pandas.tseries.offsets import MonthBegin

# 복제 대상 추출 (2020, 2021년 12월)
mask = vehicle['datetime'].isin(['2020-12-01', '2021-12-01'])
base_df = vehicle[mask].copy()

# 확장할 월 목록 생성
expand_months = pd.date_range('2020-01-01', '2021-11-01', freq='MS')
expand_months = expand_months[~expand_months.isin(['2020-12-01', '2021-12-01'])]

# 월별로 복제
expanded_list = []
for month in expand_months:
    year = month.year
    base_month = pd.Timestamp(f"{year}-12-01")
    monthly_base = vehicle[vehicle['datetime'] == base_month].copy()
    monthly_base['datetime'] = pd.Timestamp(month)
    expanded_list.append(monthly_base)

# 병합
vehicle_augmented = pd.concat([vehicle] + expanded_list, ignore_index=True)
vehicle_augmented = vehicle_augmented.sort_values(['datetime', 'region']).reset_index(drop=True)

  mask = vehicle['datetime'].isin(['2020-12-01', '2021-12-01'])
  expand_months = expand_months[~expand_months.isin(['2020-12-01', '2021-12-01'])]


In [240]:
vehicle = vehicle_augmented.copy()

In [241]:
vehicle.isnull().sum()

datetime     0
region       0
fuel_type    0
reg_count    0
use_type     0
month        0
dtype: int64

In [243]:
vehicle.drop(['month'], axis=1, inplace=True)

In [245]:
#사업용 비사업용 합계

vehicle = vehicle.groupby(['datetime', 'region', 'fuel_type'], as_index=False)['reg_count'].sum()
vehicle.head()

Unnamed: 0,datetime,region,fuel_type,reg_count
0,2017-01-01,강남구,경유,70907.135412
1,2017-01-01,강남구,전기,707.089228
2,2017-01-01,강남구,휘발유,158177.611443
3,2017-01-01,강동구,경유,48147.990482
4,2017-01-01,강동구,전기,30.050047


In [246]:
vehicle_grouped = vehicle.groupby(['datetime', 'region', 'fuel_type'])['reg_count'].sum().reset_index()

vehicle_pivoted = vehicle_grouped.pivot(
    index=['datetime', 'region'], 
    columns='fuel_type', 
    values='reg_count'
).reset_index()

vehicle_pivoted.columns.name = None

cols_to_convert = ['경유', '전기', '휘발유']
vehicle_pivoted[cols_to_convert] = vehicle_pivoted[cols_to_convert].astype(int)

In [247]:
vehicle_pivoted.head()

Unnamed: 0,datetime,region,경유,전기,휘발유
0,2017-01-01,강남구,70907,707,158177
1,2017-01-01,강동구,48147,30,64751
2,2017-01-01,강북구,28800,14,32384
3,2017-01-01,강서구,71883,45,94317
4,2017-01-01,관악구,43216,24,57325


In [248]:
vehicle = vehicle_pivoted.rename(columns={
    '경유': 'diesel',
    '전기': 'electric',
    '휘발유': 'gasoline'
})

vehicle.head()

Unnamed: 0,datetime,region,diesel,electric,gasoline
0,2017-01-01,강남구,70907,707,158177
1,2017-01-01,강동구,48147,30,64751
2,2017-01-01,강북구,28800,14,32384
3,2017-01-01,강서구,71883,45,94317
4,2017-01-01,관악구,43216,24,57325


In [249]:
# 원래 서울 전체 차량 수 합계 (2019년 이전)
original_total = vehicle_pre_2019.groupby('fuel_type')['reg_count'].sum().reset_index(name='original_total')

# 구별로 분배된 차량 수 합계
expanded_total = vehicle_expanded.groupby('fuel_type')['reg_count'].sum().reset_index(name='expanded_total')

# 두 결과 병합해서 비교
compare_df = pd.merge(original_total, expanded_total, on='fuel_type')
compare_df['difference'] = compare_df['original_total'] - compare_df['expanded_total']
compare_df['relative_error (%)'] = (compare_df['difference'] / compare_df['original_total']) * 100

compare_df

Unnamed: 0,fuel_type,original_total,expanded_total,difference,relative_error (%)
0,경유,27224870.0,27224870.0,0.0,0.0
1,전기,114732.0,114732.0,0.0,0.0
2,휘발유,38384823.0,38384823.0,0.0,0.0


In [250]:
vehicle['year'] = pd.to_datetime(vehicle['datetime']).dt.year
vehicle['month'] = pd.to_datetime(vehicle['datetime']).dt.month

In [251]:
from pandas.tseries.offsets import MonthEnd

vehicle['start_date'] = pd.to_datetime(vehicle['datetime'])  # 1일
vehicle['end_date'] = vehicle['start_date'] + MonthEnd(0)   # 말일

# 일자 리스트 생성
date_rows = []

for _, row in vehicle.iterrows():
    days = pd.date_range(start=row['start_date'], end=row['end_date'], freq='D')
    for day in days:
        date_rows.append({
            'datetime': day,
            'diesel': row['diesel'],
            'region': row['region'],
            'electric': row['electric'],
            'gasoline': row['gasoline']
        })

vehicle = pd.DataFrame(date_rows)

In [252]:
vehicle.head()

Unnamed: 0,datetime,diesel,region,electric,gasoline
0,2017-01-01,70907,강남구,707,158177
1,2017-01-02,70907,강남구,707,158177
2,2017-01-03,70907,강남구,707,158177
3,2017-01-04,70907,강남구,707,158177
4,2017-01-05,70907,강남구,707,158177


In [253]:
# 1. 날짜만 추출하고 datetime64 유지
vehicle['date_only'] = pd.to_datetime(vehicle['datetime']).dt.normalize()

# 2. 시간 리스트 생성 (00:00 ~ 23:00)
hour_list = pd.date_range("00:00", "23:00", freq="h").time
hour_df = pd.DataFrame({'hour': hour_list})

# 3. 날짜 x 시간 cross merge
vehicle_expanded = vehicle.merge(hour_df, how='cross')

# 4. datetime 결합
vehicle_expanded['datetime'] = pd.to_datetime(
    vehicle_expanded['date_only'].astype(str) + ' ' + vehicle_expanded['hour'].astype(str)
)

# 5. 불필요한 컬럼 제거
vehicle_expanded = vehicle_expanded.drop(columns=['date_only', 'hour'])

# 6. 정렬
vehicle_expanded = vehicle_expanded.sort_values(['region', 'datetime']).reset_index(drop=True)

# ✅ 확인
print(vehicle_expanded[['region', 'datetime']].head(30))
print(vehicle_expanded['datetime'].dt.hour.unique())  # 0~23이 다 나와야 정상

   region            datetime
0     강남구 2017-01-01 00:00:00
1     강남구 2017-01-01 01:00:00
2     강남구 2017-01-01 02:00:00
3     강남구 2017-01-01 03:00:00
4     강남구 2017-01-01 04:00:00
5     강남구 2017-01-01 05:00:00
6     강남구 2017-01-01 06:00:00
7     강남구 2017-01-01 07:00:00
8     강남구 2017-01-01 08:00:00
9     강남구 2017-01-01 09:00:00
10    강남구 2017-01-01 10:00:00
11    강남구 2017-01-01 11:00:00
12    강남구 2017-01-01 12:00:00
13    강남구 2017-01-01 13:00:00
14    강남구 2017-01-01 14:00:00
15    강남구 2017-01-01 15:00:00
16    강남구 2017-01-01 16:00:00
17    강남구 2017-01-01 17:00:00
18    강남구 2017-01-01 18:00:00
19    강남구 2017-01-01 19:00:00
20    강남구 2017-01-01 20:00:00
21    강남구 2017-01-01 21:00:00
22    강남구 2017-01-01 22:00:00
23    강남구 2017-01-01 23:00:00
24    강남구 2017-01-02 00:00:00
25    강남구 2017-01-02 01:00:00
26    강남구 2017-01-02 02:00:00
27    강남구 2017-01-02 03:00:00
28    강남구 2017-01-02 04:00:00
29    강남구 2017-01-02 05:00:00
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 

In [254]:
vehicle_expanded.head()

Unnamed: 0,datetime,diesel,region,electric,gasoline
0,2017-01-01 00:00:00,70907,강남구,707,158177
1,2017-01-01 01:00:00,70907,강남구,707,158177
2,2017-01-01 02:00:00,70907,강남구,707,158177
3,2017-01-01 03:00:00,70907,강남구,707,158177
4,2017-01-01 04:00:00,70907,강남구,707,158177


In [255]:
start_date = '2017-01-01'
end_date = '2023-12-31'

air_quality = air_quality[(air_quality['datetime'] >= start_date) & (air_quality['datetime'] <= end_date)]
weather_cleaned = weather_cleaned[(weather_cleaned['datetime'] >= start_date) & (weather_cleaned['datetime'] <= end_date)]
dust = dust[(dust['datetime'] >= start_date) & (dust['datetime'] <= end_date)]
vehicle = vehicle_expanded[(vehicle_expanded['datetime'] >= start_date) & (vehicle_expanded['datetime'] <= end_date)]

In [256]:
#merged = pd.merge(air_quality, weather_cleaned, on=['datetime', 'region'], how='inner')
#merged = pd.merge(merged, dust, on='datetime', how='left')
final_df = pd.merge(merged, vehicle, on=['datetime', 'region'], how='left')

In [257]:
final_df.head()

Unnamed: 0,datetime,region,pm10,pm25,temperature,wind_direction,wind_speed,precipitation,humidity,aod_avg,aod_max,diesel,electric,gasoline
0,2017-01-01 00:00:00,강남구,65,48,-0.4,195.2,1.8,0.0,54.1,343.393,4875.0,70907,707,158177
1,2017-01-01 01:00:00,강남구,63,48,1.4,69.1,1.3,0.0,78.0,343.393,4875.0,70907,707,158177
2,2017-01-01 02:00:00,강남구,61,44,1.2,66.2,1.6,0.0,80.0,343.393,4875.0,70907,707,158177
3,2017-01-01 03:00:00,강남구,71,44,0.5,66.0,1.5,0.0,84.0,343.393,4875.0,70907,707,158177
4,2017-01-01 04:00:00,강남구,79,44,0.6,63.4,0.5,0.0,86.0,343.393,4875.0,70907,707,158177


In [258]:
final_df['diesel'] = final_df['diesel'].astype(int)
final_df['electric'] = final_df['electric'].astype(int)
final_df['gasoline'] = final_df['gasoline'].astype(int)

In [259]:
final_df.to_csv('미세먼지용_데이터셋_ver5.csv', index=False, encoding='utf-8')