In [17]:
import pandas as pd
from sqlalchemy import create_engine
import json

# 접속 정보 로딩 (옵션: db-config.json 파일이 있을 경우)
with open('db-config.json') as f:
    config = json.load(f)

user = config['user']
password = config['password']
host = config['host']
port = config['port']
database = config['database']

# SQLAlchemy 엔진 생성
engine = create_engine(f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}?charset=utf8mb4")

# air_quality 테이블 데이터 읽어오기
query = "SELECT * FROM air_quality"
air_quality = pd.read_sql(query, con=engine)

# ▶ 결과 확인
air_quality.head()

Unnamed: 0,id,datetime,region,pm10,pm25
0,1,2015-12-31 23:00:00,평균,93.0,62.0
1,2,2015-12-31 23:00:00,강남구,107.0,70.0
2,3,2015-12-31 23:00:00,강동구,91.0,51.0
3,4,2015-12-31 23:00:00,강북구,92.0,64.0
4,5,2015-12-31 23:00:00,강서구,96.0,51.0


In [18]:
air_quality.isnull().sum()

id              0
datetime        0
region          0
pm10        73631
pm25        70223
dtype: int64

In [19]:
#월별 & 지역별 평균으로 결측치 대체

air_quality['datetime'] = pd.to_datetime(air_quality['datetime'])

air_quality['year_month'] = air_quality['datetime'].dt.to_period('M')

cols_to_fill = ['pm10', 'pm25']

for col in cols_to_fill:
    air_quality[col] = air_quality.groupby(['year_month', 'region'])[col].transform(
        lambda x: x.fillna(x.mean())
    )

In [20]:
air_quality.isnull().sum()

id              0
datetime        0
region          0
pm10          720
pm25          720
year_month      0
dtype: int64

In [21]:
# 연월 평균으로 결측치 보완

for col in ['pm10', 'pm25']:
    air_quality[col] = air_quality.groupby('year_month')[col].transform(
        lambda x: x.fillna(x.mean())
    )

In [22]:
air_quality.isnull().sum()

id            0
datetime      0
region        0
pm10          0
pm25          0
year_month    0
dtype: int64

In [23]:
air_quality.drop(columns='id', inplace=True)

In [24]:
air_quality.drop(columns='year_month', inplace=True)

In [25]:
air_quality['pm10'] = air_quality['pm10'].astype(int)
air_quality['pm25'] = air_quality['pm25'].astype(int)

In [26]:
air_quality.head()

Unnamed: 0,datetime,region,pm10,pm25
0,2015-12-31 23:00:00,평균,93,62
1,2015-12-31 23:00:00,강남구,107,70
2,2015-12-31 23:00:00,강동구,91,51
3,2015-12-31 23:00:00,강북구,92,64
4,2015-12-31 23:00:00,강서구,96,51


In [27]:
# weather 테이블 데이터 읽어오기
query2 = "SELECT * FROM weather"
weather = pd.read_sql(query2, con=engine)

# ▶ 결과 확인
weather.head()

Unnamed: 0,id,station_id,station_name,datetime,temperature,wind_direction,wind_speed,precipitation,humidity
0,1,400,강남,2015-01-01 01:00:00,-6.3,314.7,2.7,0.0,30.0
1,2,400,강남,2015-01-01 02:00:00,-6.9,295.6,3.0,0.0,33.0
2,3,400,강남,2015-01-01 03:00:00,-7.3,334.4,2.7,0.0,34.0
3,4,400,강남,2015-01-01 04:00:00,-7.7,320.1,2.9,0.0,33.0
4,5,400,강남,2015-01-01 05:00:00,-7.9,294.6,2.3,0.0,33.0


In [29]:
weather.drop(['station_id'], axis=1, inplace=True)
weather.rename(columns={"station_name": "region"}, inplace=True)

In [30]:
weather.isnull().sum()

id                     0
region                 0
datetime               0
temperature         5696
wind_direction      6460
wind_speed          5936
precipitation      17412
humidity          633811
dtype: int64

In [31]:
#월별 & 지역별 평균으로 결측치 대체

weather['datetime'] = pd.to_datetime(weather['datetime'])

weather['year_month'] = weather['datetime'].dt.to_period('M')

cols_to_fill = ['temperature', 'wind_direction', 'wind_speed', 'precipitation', 'humidity']

for col in cols_to_fill:
    weather[col] = weather.groupby(['year_month', 'region'])[col].transform(
        lambda x: x.fillna(round(x.mean(), 1))
    )

In [32]:
weather.isnull().sum()

id                     0
region                 0
datetime               0
temperature           15
wind_direction         0
wind_speed             0
precipitation          0
humidity          561876
year_month             0
dtype: int64

In [33]:
# 연월 평균으로 결측치 보완

for col in ['temperature', 'wind_direction', 'wind_speed', 'precipitation', 'humidity']:
    weather[col] = weather.groupby('year_month')[col].transform(
        lambda x: x.fillna(round(x.mean(), 1))
    )

In [34]:
weather.isnull().sum()

id                0
region            0
datetime          0
temperature       0
wind_direction    0
wind_speed        0
precipitation     0
humidity          0
year_month        0
dtype: int64

In [35]:
weather.drop(columns=['id', 'year_month'], axis=1, inplace=True)

In [36]:
station_to_gu = {
    '강남': '강남구',
    '서초': '서초구',
    '강동': '강동구',
    '송파': '송파구',
    '강서': '강서구',
    '양천': '양천구',
    '도봉': '도봉구',
    '노원': '노원구',
    '동대문': '동대문구',
    '중랑': '중랑구',
    '기상청': '동작구',
    '마포': '마포구',
    '서대문': '서대문구',
    '광진': '광진구',
    '성북': '성북구',
    '용산': '용산구',
    '은평': '은평구',
    '금천': '금천구',
    '한강': '영등포구',
    '중구': '중구',
    '성동': '성동구',
    '북악산': '종로구',
    '구로': '구로구',
    '강북*': '강북구',
    '남현': '관악구',
    '관악': '관악구',
    '영등포': '영등포구',
    '현충원': '동작구'
}

In [37]:
weather['region'] = weather['region'].map(station_to_gu)

In [38]:
weather.groupby(['datetime', 'region']).ngroups

2152241

In [39]:
weather_cleaned = weather.groupby(['datetime', 'region'], as_index=False).mean().round(1)

In [40]:
weather_cleaned.head()

Unnamed: 0,datetime,region,temperature,wind_direction,wind_speed,precipitation,humidity
0,2015-01-01 01:00:00,강남구,-6.3,314.7,2.7,0.0,30.0
1,2015-01-01 01:00:00,강동구,-6.5,58.4,1.0,0.0,32.0
2,2015-01-01 01:00:00,강북구,-6.8,308.6,5.2,0.0,36.0
3,2015-01-01 01:00:00,강서구,-6.8,270.2,1.8,0.0,48.8
4,2015-01-01 01:00:00,관악구,-6.9,179.4,3.6,0.0,19.0


In [41]:
# china_yellow_dust 테이블 데이터 읽어오기
query3 = "SELECT * FROM china_yellow_dust"
yellow_dust = pd.read_sql(query3, con=engine)

# ▶ 결과 확인
yellow_dust.head()

Unnamed: 0,id,datetime,aod_avg,aod_max
0,1,2015-01-01,207.214,4000.0
1,2,2015-01-02,205.827,4000.0
2,3,2015-01-03,248.65,3727.0
3,4,2015-01-04,330.154,4000.0
4,5,2015-01-05,237.306,4000.0


In [42]:
yellow_dust.isnull().sum()

id          0
datetime    0
aod_avg     2
aod_max     1
dtype: int64

In [43]:
#월별 & 지역별 평균으로 결측치 대체

yellow_dust['datetime'] = pd.to_datetime(yellow_dust['datetime'])

yellow_dust['year_month'] = yellow_dust['datetime'].dt.to_period('M')

cols_to_fill = ['aod_avg', 'aod_max']

for col in cols_to_fill:
    yellow_dust[col] = yellow_dust.groupby(['year_month'])[col].transform(
        lambda x: x.fillna((x.mean()))
    )

In [44]:
yellow_dust.isnull().sum()

id            0
datetime      0
aod_avg       0
aod_max       0
year_month    0
dtype: int64

In [45]:
yellow_dust.drop(columns='id', inplace=True)
yellow_dust.drop(columns='year_month', inplace=True)

In [46]:
yellow_dust.head()

Unnamed: 0,datetime,aod_avg,aod_max
0,2015-01-01,207.214,4000.0
1,2015-01-02,205.827,4000.0
2,2015-01-03,248.65,3727.0
3,2015-01-04,330.154,4000.0
4,2015-01-05,237.306,4000.0


In [47]:
hours = pd.date_range("00:00", "23:00", freq="h").time

In [48]:
# 1. 날짜만 분리
yellow_dust['date'] = pd.to_datetime(yellow_dust['datetime']).dt.date

# 2. 00시~23시까지 시간 리스트 생성
hours = pd.date_range("00:00", "23:00", freq="h").time

# 3. 시간 데이터프레임
hour_df = pd.DataFrame({'time': hours})

# 4. 날짜 × 시간 조합 (카르테시안 곱)
expanded = yellow_dust.merge(hour_df, how='cross')

# 5. datetime 컬럼 재생성
expanded['datetime'] = pd.to_datetime(expanded['date'].astype(str) + ' ' + expanded['time'].astype(str))

# 6. 최종 컬럼 정리
expanded = expanded[['datetime', 'aod_avg', 'aod_max']].sort_values('datetime').reset_index(drop=True)

In [49]:
expanded
dust = expanded
dust.head()

Unnamed: 0,datetime,aod_avg,aod_max
0,2015-01-01 00:00:00,207.214,4000.0
1,2015-01-01 01:00:00,207.214,4000.0
2,2015-01-01 02:00:00,207.214,4000.0
3,2015-01-01 03:00:00,207.214,4000.0
4,2015-01-01 04:00:00,207.214,4000.0


In [50]:
# vehicle_reg_stats 테이블 데이터 읽어오기
query4 = "SELECT * FROM vehicle_reg_stats"
vehicle = pd.read_sql(query4, con=engine)

# ▶ 결과 확인
vehicle.head()

Unnamed: 0,id,datetime,region,fuel_type,use_type,reg_count
0,136,2017-01-01,강원,경유,사업용,12362.0
1,137,2017-01-01,광주,전기,비사업용,240.0
2,138,2017-01-01,인천,전기,사업용,6.0
3,139,2017-01-01,인천,전기,비사업용,210.0
4,140,2017-01-01,대구,전기,사업용,53.0


In [51]:
seoul = vehicle[vehicle['region'] == '서울']
seoul.head()

Unnamed: 0,id,datetime,region,fuel_type,use_type,reg_count
8,144,2017-01-01,서울,전기,사업용,706.0
9,145,2017-01-01,서울,전기,비사업용,883.0
97,233,2017-01-01,서울,경유,사업용,70175.0
98,234,2017-01-01,서울,경유,비사업용,1042523.0
104,240,2017-01-01,서울,휘발유,사업용,18396.0


In [52]:
seoul.isnull().sum()

id           0
datetime     0
region       0
fuel_type    0
use_type     0
reg_count    0
dtype: int64

In [53]:
#사업용 비사업용 합계

summary = seoul.groupby(['datetime', 'region', 'fuel_type'], as_index=False)['reg_count'].sum()
summary.head()

Unnamed: 0,datetime,region,fuel_type,reg_count
0,2017-01-01,서울,경유,1112698.0
1,2017-01-01,서울,전기,1589.0
2,2017-01-01,서울,휘발유,1600398.0
3,2017-02-01,서울,경유,1115138.0
4,2017-02-01,서울,전기,1649.0


In [54]:
seoul_wide = summary.pivot(index='datetime', columns='fuel_type', values='reg_count')

seoul_wide = seoul_wide.reset_index().rename_axis(columns=None)

seoul_wide.head()

Unnamed: 0,datetime,경유,전기,휘발유
0,2017-01-01,1112698.0,1589.0,1600398.0
1,2017-02-01,1115138.0,1649.0,1600615.0
2,2017-03-01,1116960.0,1692.0,1601053.0
3,2017-04-01,1119341.0,1714.0,1602253.0
4,2017-05-01,1122534.0,1932.0,1602947.0


In [55]:
seoul_wide = seoul_wide.rename(columns={
    '경유': 'diesel',
    '전기': 'electric',
    '휘발유': 'gasoline'
})

seoul_wide.head()

Unnamed: 0,datetime,diesel,electric,gasoline
0,2017-01-01,1112698.0,1589.0,1600398.0
1,2017-02-01,1115138.0,1649.0,1600615.0
2,2017-03-01,1116960.0,1692.0,1601053.0
3,2017-04-01,1119341.0,1714.0,1602253.0
4,2017-05-01,1122534.0,1932.0,1602947.0


In [56]:
seoul_wide['diesel'] = seoul_wide['diesel'].astype(int)
seoul_wide['electric'] = seoul_wide['electric'].astype(int)
seoul_wide['gasoline'] = seoul_wide['gasoline'].astype(int)

seoul_wide.head()

Unnamed: 0,datetime,diesel,electric,gasoline
0,2017-01-01,1112698,1589,1600398
1,2017-02-01,1115138,1649,1600615
2,2017-03-01,1116960,1692,1601053
3,2017-04-01,1119341,1714,1602253
4,2017-05-01,1122534,1932,1602947


In [57]:
seoul_wide['year'] = pd.to_datetime(seoul_wide['datetime']).dt.year
seoul_wide['month'] = pd.to_datetime(seoul_wide['datetime']).dt.month

In [58]:
from pandas.tseries.offsets import MonthEnd

seoul_wide['start_date'] = pd.to_datetime(seoul_wide['datetime'])  # 1일
seoul_wide['end_date'] = seoul_wide['start_date'] + MonthEnd(0)   # 말일

# 일자 리스트 생성
date_rows = []

for _, row in seoul_wide.iterrows():
    days = pd.date_range(start=row['start_date'], end=row['end_date'], freq='D')
    for day in days:
        date_rows.append({
            'datetime': day,
            'diesel': row['diesel'],
            'electric': row['electric'],
            'gasoline': row['gasoline']
        })

seoul_vehicle = pd.DataFrame(date_rows)

In [59]:
seoul_vehicle.head()

Unnamed: 0,datetime,diesel,electric,gasoline
0,2017-01-01,1112698,1589,1600398
1,2017-01-02,1112698,1589,1600398
2,2017-01-03,1112698,1589,1600398
3,2017-01-04,1112698,1589,1600398
4,2017-01-05,1112698,1589,1600398


In [60]:
# 00:00 ~ 23:00 시간대 생성
hours = pd.date_range("00:00", "23:00", freq="H").time
hour_df = pd.DataFrame({'time': hours})

# 일 × 시 조합
vehicle = seoul_vehicle.merge(hour_df, how='cross')

# datetime 결합
vehicle['datetime'] = pd.to_datetime(vehicle['datetime'].astype(str) + ' ' + vehicle['time'].astype(str))

# 컬럼 정리
vehicle = vehicle.drop(columns='time').sort_values('datetime').reset_index(drop=True)

  hours = pd.date_range("00:00", "23:00", freq="H").time


In [61]:
vehicle.head()

Unnamed: 0,datetime,diesel,electric,gasoline
0,2017-01-01 00:00:00,1112698,1589,1600398
1,2017-01-01 01:00:00,1112698,1589,1600398
2,2017-01-01 02:00:00,1112698,1589,1600398
3,2017-01-01 03:00:00,1112698,1589,1600398
4,2017-01-01 04:00:00,1112698,1589,1600398


In [62]:
start_date = '2017-01-01'
end_date = '2023-12-31'

air_quality = air_quality[(air_quality['datetime'] >= start_date) & (air_quality['datetime'] <= end_date)]
weather_cleaned = weather_cleaned[(weather_cleaned['datetime'] >= start_date) & (weather_cleaned['datetime'] <= end_date)]
dust = dust[(dust['datetime'] >= start_date) & (dust['datetime'] <= end_date)]
vehicle = vehicle[(vehicle['datetime'] >= start_date) & (vehicle['datetime'] <= end_date)]

In [63]:
for df in [air_quality, weather_cleaned, dust, vehicle]:
    df['datetime'] = pd.to_datetime(df['datetime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['datetime'] = pd.to_datetime(df['datetime'])


In [64]:
# 1. air_quality + weather
merged = pd.merge(air_quality, weather_cleaned, on=['datetime', 'region'], how='inner')

merged = pd.merge(merged, dust, on='datetime', how='left')

final_df = merged.copy()

In [70]:
region_num =25

import numpy as np

vehicle_scaled = vehicle.copy()
vehicle_scaled[['diesel', 'gasoline', 'electric']] = (np.round(vehicle_scaled[['diesel', 'gasoline', 'electric']] / 25)).astype(int)

In [71]:
vehicle_scaled.head()

Unnamed: 0,datetime,diesel,electric,gasoline
0,2017-01-01 00:00:00,44508,64,64016
1,2017-01-01 01:00:00,44508,64,64016
2,2017-01-01 02:00:00,44508,64,64016
3,2017-01-01 03:00:00,44508,64,64016
4,2017-01-01 04:00:00,44508,64,64016


In [72]:
final_df = pd.merge(final_df, vehicle_scaled, on='datetime', how='left')
final_df

Unnamed: 0,datetime,region,pm10,pm25,temperature,wind_direction,wind_speed,precipitation,humidity,aod_avg,aod_max,diesel,electric,gasoline
0,2019-12-31 00:00:00,강남구,22,14,-4.1,286.3,3.5,0.0,50.0,293.569,4000.0,44877,598,64317
1,2019-12-31 00:00:00,강동구,30,17,-4.0,264.9,2.1,0.0,54.0,293.569,4000.0,44877,598,64317
2,2019-12-31 00:00:00,강북구,46,20,-5.6,328.0,5.9,0.0,65.0,293.569,4000.0,44877,598,64317
3,2019-12-31 00:00:00,강서구,32,17,-5.4,324.2,5.1,0.0,65.0,293.569,4000.0,44877,598,64317
4,2019-12-31 00:00:00,관악구,26,13,-5.2,331.4,3.4,0.0,62.0,293.569,4000.0,44877,598,64317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1419566,2023-01-01 01:00:00,영등포구,53,36,2.7,189.2,1.1,0.0,61.0,372.609,8034.5,42181,2385,66247
1419567,2023-01-01 01:00:00,용산구,51,38,1.8,184.8,1.1,0.0,62.0,372.609,8034.5,42181,2385,66247
1419568,2023-01-01 01:00:00,은평구,62,39,-1.5,0.0,0.0,0.0,82.0,372.609,8034.5,42181,2385,66247
1419569,2023-01-01 01:00:00,중구,57,52,0.6,294.4,3.4,0.0,68.0,372.609,8034.5,42181,2385,66247


In [73]:
final_df.to_csv('미세먼지용_데이터셋_ver3.csv', index=False, encoding='utf-8')