In [101]:
import json

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Load the connection settings. db-config.json is required: open() raises
# FileNotFoundError when it is missing.
with open('db-config.json', encoding='utf-8') as f:
    config = json.load(f)

user = config['user']
password = config['password']
host = config['host']
port = config['port']
database = config['database']

# Build the URL from its components instead of formatting them into a string.
# URL.create() takes the raw values, so a password containing reserved
# characters such as '@', ':' or '/' no longer corrupts the connection URL.
engine = create_engine(
    URL.create(
        "mysql+pymysql",
        username=str(user),
        password=str(password),
        host=str(host),
        port=int(port),
        database=str(database),
    )
)

# One query per source table, all restricted to 2017-01-01 .. 2023-12-31.
query_pollution = "SELECT * FROM air_pollution WHERE DATE(datetime) BETWEEN '2017-01-01' AND '2023-12-31'"
query_weather = "SELECT * FROM weather WHERE DATE(datetime) BETWEEN '2017-01-01' AND '2023-12-31'"
query_quality = "SELECT * FROM air_quality WHERE DATE(datetime) BETWEEN '2017-01-01' AND '2023-12-31'"
query_vehicle = "SELECT * FROM vehicle_reg_stats WHERE DATE(datetime) BETWEEN '2017-01-01' AND '2023-12-31'"

# Pull each table into its own DataFrame.
df_pollution = pd.read_sql(query_pollution, con=engine)
df_weather = pd.read_sql(query_weather, con=engine)
df_quality = pd.read_sql(query_quality, con=engine)
df_vehicle = pd.read_sql(query_vehicle, con=engine)

In [102]:
# Count missing values per column of the air_quality frame.
df_quality.isna().sum()

id              0
datetime        0
region          0
pm10        44421
pm25        36312
dtype: int64

In [103]:
# Replace missing pm10 / pm25 values with the overall mean of the respective
# column (one mean per column, computed over the whole frame).
pm_columns = ['pm10', 'pm25']
df_quality[pm_columns] = df_quality[pm_columns].fillna(df_quality[pm_columns].mean())

In [104]:
# Re-count missing values per column to check the result of the fill.
df_quality.isna().sum()

id          0
datetime    0
region      0
pm10        0
pm25        0
dtype: int64

In [105]:
# Display the air_quality frame.
df_quality

Unnamed: 0,id,datetime,region,pm10,pm25
0,907342,2019-12-31 23:00:00,평균,23.0,15.0
1,907343,2019-12-31 23:00:00,강남구,19.0,15.0
2,907344,2019-12-31 23:00:00,강동구,25.0,18.0
3,907345,2019-12-31 23:00:00,강북구,25.0,16.0
4,907346,2019-12-31 23:00:00,강서구,29.0,17.0
...,...,...,...,...,...
1594937,2730539,2023-01-01 00:00:00,용산구,51.0,43.0
1594938,2730540,2023-01-01 00:00:00,은평구,54.0,37.0
1594939,2730541,2023-01-01 00:00:00,종로구,76.0,67.0
1594940,2730542,2023-01-01 00:00:00,중구,63.0,61.0


In [106]:
# Display the weather frame as loaded (one row per station and timestamp).
df_weather

Unnamed: 0,id,station_id,station_name,datetime,temperature,wind_direction,wind_speed,precipitation,humidity
0,2076728,424,강북*,2017-01-01 01:00:00,1.4,338.8,0.5,0.0,81.0000
1,2076729,422,북악산,2017-01-01 01:00:00,0.9,129.6,1.2,0.0,92.0000
2,2076730,889,현충원,2017-01-01 01:00:00,-1.4,229.9,0.6,0.0,95.0000
3,2076731,402,강동,2017-01-01 01:00:00,-0.2,358.9,1.3,0.0,88.0000
4,2076732,415,용산,2017-01-01 01:00:00,1.8,32.5,0.8,0.0,63.5432
...,...,...,...,...,...,...,...,...,...
1587852,3664580,400,강남,2023-12-31 00:00:00,2.5,82.2,1.0,0.5,95.0000
1587853,3664581,416,은평,2023-12-31 00:00:00,2.8,61.0,1.1,0.0,100.0000
1587854,3664582,405,양천,2023-12-31 00:00:00,2.9,128.5,1.5,0.5,95.0000
1587855,3664583,413,광진,2023-12-31 00:00:00,2.4,132.5,0.8,0.0,94.0000


In [107]:
# Drop the numeric station id and expose the station name under the 'region'
# column name used by the other frames.
# The frame is rebound instead of mutated in place, and a missing 'station_id'
# is ignored, so re-running this cell is a no-op rather than a KeyError.
df_weather = (
    df_weather
    .drop(columns=['station_id'], errors='ignore')
    .rename(columns={"station_name": "region"})
)

In [108]:
# List the distinct values of 'region' (station names at this point).
df_weather['region'].unique()

array(['강북*', '북악산', '현충원', '강동', '용산', '중랑', '강남', '구로', '마포', '성동',
       '남현', '관악', '송파', '기상청', '강서', '성북', '금천', '중구', '서대문', '한강', '광진',
       '양천', '은평', '노원', '서초', '영등포', '동대문', '도봉'], dtype=object)

In [109]:
# Count missing values per column of the weather frame.
df_weather.isna().sum()

id                0
region            0
datetime          0
temperature       0
wind_direction    0
wind_speed        0
precipitation     0
humidity          0
dtype: int64

In [110]:
# Remove the database row id. `columns=` already selects the axis, so the
# redundant `axis=1` is gone; errors='ignore' plus rebinding keeps the cell
# safe to run more than once.
df_weather = df_weather.drop(columns=['id'], errors='ignore')

In [111]:
# Weather-station name -> district ('gu') name.
# Most stations are named after their district minus the trailing '구', so those
# entries are generated; every other station is listed explicitly.
station_to_gu = {
    **{
        station: station + '구'
        for station in (
            '강남', '서초', '강동', '송파', '강서', '양천', '도봉',
            '노원', '동대문', '중랑', '마포', '서대문', '광진', '성북',
            '용산', '은평', '금천', '성동', '구로', '관악', '영등포',
        )
    },
    # Stations whose name is not "<district> minus 구".
    '기상청': '동작구',
    '현충원': '동작구',
    '한강': '영등포구',
    '북악산': '종로구',
    '남현': '관악구',
    '강북*': '강북구',  # the station name carries a trailing '*'
    '중구': '중구',  # key and value are identical
}

In [112]:
# Translate station names into district names.
# Values that are not station keys are kept unchanged, so running the cell a
# second time leaves the already-mapped districts alone (a bare .map() would
# turn all of them into NaN).
station_names = df_weather['region']
mapped_regions = station_names.map(station_to_gu).fillna(station_names)

# Fail before touching the frame if a station has no district mapping; with a
# bare .map() such rows would silently become NaN.
unknown_regions = set(mapped_regions.dropna().unique()) - set(station_to_gu.values())
if unknown_regions:
    raise ValueError(f"No district mapping for weather stations: {sorted(unknown_regions)}")

df_weather['region'] = mapped_regions

In [114]:
# Collapse stations that share a district into one row per (datetime, region).
# All measurements are averaged, but wind direction is an angle in degrees and
# must be averaged on the unit circle: the arithmetic mean of 350 and 10 is 180,
# whereas the mean direction is 0.
group_keys = ['datetime', 'region']
column_order = list(df_weather.columns)

wind_rad = np.deg2rad(df_weather['wind_direction'])
df_weather = (
    df_weather
    .drop(columns='wind_direction')
    .assign(_wind_sin=np.sin(wind_rad), _wind_cos=np.cos(wind_rad))
    .groupby(group_keys, as_index=False)
    .mean()
)

# Angle of the mean unit vector, mapped back to [0, 360). Rounding to 4 decimals
# hides floating-point noise from the sin/cos round trip (e.g. 338.8 coming back
# as 338.80000000000007). If the directions in a group cancel exactly, the mean
# direction is undefined and the value returned here is arbitrary.
mean_direction = np.rad2deg(np.arctan2(df_weather['_wind_sin'], df_weather['_wind_cos']))
df_weather['wind_direction'] = mean_direction.round(4) % 360

# Keys first, then the measurements in their previous order; this also drops
# the temporary sin/cos helper columns.
df_weather = df_weather[group_keys + [c for c in column_order if c not in group_keys]]

In [115]:
# Display the weather frame after aggregation to one row per (datetime, region).
df_weather

Unnamed: 0,datetime,region,temperature,wind_direction,wind_speed,precipitation,humidity
0,2017-01-01 01:00:00,강남구,1.4,69.10,1.30,0.0,78.0000
1,2017-01-01 01:00:00,강동구,-0.2,358.90,1.30,0.0,88.0000
2,2017-01-01 01:00:00,강북구,1.4,338.80,0.50,0.0,81.0000
3,2017-01-01 01:00:00,강서구,2.9,116.90,1.80,0.0,63.5432
4,2017-01-01 01:00:00,관악구,1.1,178.25,1.75,0.0,78.2716
...,...,...,...,...,...,...,...
1419568,2023-12-31 00:00:00,영등포구,2.9,161.70,2.10,0.0,93.5000
1419569,2023-12-31 00:00:00,용산구,2.1,38.10,2.40,0.0,94.0000
1419570,2023-12-31 00:00:00,은평구,2.8,61.00,1.10,0.0,100.0000
1419571,2023-12-31 00:00:00,중구,0.4,176.40,4.70,0.0,100.0000


In [117]:
# Display the air_pollution frame as loaded.
df_pollution

Unnamed: 0,id,datetime,region,no2,co,so2,o3
0,31704,2017-01-01 00:00:00,평균,0.0536,1.21,0.0054,0.0025
1,31705,2017-01-01 01:00:00,평균,0.0508,1.15,0.0053,0.0025
2,31706,2017-01-01 02:00:00,평균,0.0493,1.15,0.0051,0.0026
3,31707,2017-01-01 03:00:00,평균,0.0475,1.13,0.0049,0.0024
4,31708,2017-01-01 04:00:00,평균,0.0444,1.08,0.0047,0.0024
...,...,...,...,...,...,...,...
1576219,2352898,2023-12-31 19:00:00,중랑구,0.0310,0.67,0.0023,0.0142
1576220,2352899,2023-12-31 20:00:00,중랑구,0.0326,0.72,0.0022,0.0106
1576221,2352900,2023-12-31 21:00:00,중랑구,0.0352,0.76,0.0027,0.0087
1576222,2352901,2023-12-31 22:00:00,중랑구,0.0465,0.99,0.0026,0.0021


In [118]:
# Count missing values per column of the air_pollution frame.
df_pollution.isna().sum()

id          0
datetime    0
region      0
no2         0
co          0
so2         0
o3          0
dtype: int64

In [119]:
# Remove the database row id. Rebinding with errors='ignore' (instead of an
# in-place drop) means a second run of this cell does not raise KeyError.
df_pollution = df_pollution.drop(columns='id', errors='ignore')

In [120]:
# Display the air_pollution frame without the 'id' column.
df_pollution

Unnamed: 0,datetime,region,no2,co,so2,o3
0,2017-01-01 00:00:00,평균,0.0536,1.21,0.0054,0.0025
1,2017-01-01 01:00:00,평균,0.0508,1.15,0.0053,0.0025
2,2017-01-01 02:00:00,평균,0.0493,1.15,0.0051,0.0026
3,2017-01-01 03:00:00,평균,0.0475,1.13,0.0049,0.0024
4,2017-01-01 04:00:00,평균,0.0444,1.08,0.0047,0.0024
...,...,...,...,...,...,...
1576219,2023-12-31 19:00:00,중랑구,0.0310,0.67,0.0023,0.0142
1576220,2023-12-31 20:00:00,중랑구,0.0326,0.72,0.0022,0.0106
1576221,2023-12-31 21:00:00,중랑구,0.0352,0.76,0.0027,0.0087
1576222,2023-12-31 22:00:00,중랑구,0.0465,0.99,0.0026,0.0021


In [121]:
# Display the vehicle_reg_stats frame as loaded.
df_vehicle

Unnamed: 0,id,datetime,region,fuel_type,use_type,reg_count
0,136,2017-01-01,강원,경유,사업용,12362.0
1,137,2017-01-01,광주,전기,비사업용,240.0
2,138,2017-01-01,인천,전기,사업용,6.0
3,139,2017-01-01,인천,전기,비사업용,210.0
4,140,2017-01-01,대구,전기,사업용,53.0
...,...,...,...,...,...,...
9931,10067,2023-12-01,경남,휘발유,사업용,52259.0
9932,10068,2023-12-01,경북,휘발유,사업용,1802.0
9933,10069,2023-12-01,대구,경유,사업용,24774.0
9934,10070,2023-12-01,부산,휘발유,사업용,53741.0


In [122]:
# Total reg_count per (datetime, region, fuel_type). Columns that are not
# grouping keys (id, use_type) are not carried over, so rows that differ only
# in those columns are summed together.
df_vehicle = (
    df_vehicle
    .groupby(['datetime', 'region', 'fuel_type'], as_index=False)
    .agg(reg_count=('reg_count', 'sum'))
)

In [123]:
# Display the vehicle frame after summing reg_count per (datetime, region, fuel_type).
df_vehicle

Unnamed: 0,datetime,region,fuel_type,reg_count
0,2017-01-01,강원,경유,342173.0
1,2017-01-01,강원,전기,157.0
2,2017-01-01,강원,휘발유,305859.0
3,2017-01-01,경기,경유,2190457.0
4,2017-01-01,경기,전기,707.0
...,...,...,...,...
4531,2023-12-01,충남,전기,24130.0
4532,2023-12-01,충남,휘발유,555127.0
4533,2023-12-01,충북,경유,372795.0
4534,2023-12-01,충북,전기,19972.0


In [124]:
# Add calendar year and month columns derived from the 'datetime' column.
df_vehicle = df_vehicle.assign(
    year=pd.to_datetime(df_vehicle['datetime']).dt.year,
    month=pd.to_datetime(df_vehicle['datetime']).dt.month,
)

In [125]:
from pandas.tseries.offsets import MonthEnd

# Span covered by each row: from its own date to the last day of that month.
# MonthEnd(0) rolls a date forward to the month end and leaves a date that is
# already a month end unchanged.
df_vehicle['start_date'] = pd.to_datetime(df_vehicle['datetime'])
df_vehicle['end_date'] = df_vehicle['start_date'] + MonthEnd(0)

# Repeat every row once per calendar day of its span.
# zip() over the columns avoids iterrows(), which builds a Series per row.
daily_columns = ['date', 'region', 'fuel_type', 'reg_count']
date_rows = [
    {'date': day, 'region': region, 'fuel_type': fuel_type, 'reg_count': reg_count}
    for start, end, region, fuel_type, reg_count in zip(
        df_vehicle['start_date'],
        df_vehicle['end_date'],
        df_vehicle['region'],
        df_vehicle['fuel_type'],
        df_vehicle['reg_count'],
    )
    for day in pd.date_range(start=start, end=end, freq='D')
]

# Passing `columns` keeps the expected schema even when there are no rows, so a
# later `daily_df['region']` lookup does not raise KeyError on empty input.
daily_df = pd.DataFrame(date_rows, columns=daily_columns)

In [126]:
# Keep only the rows whose region is '서울'; .copy() makes the result an
# independent frame rather than a view-like slice of daily_df.
is_seoul = daily_df['region'].eq('서울')
vehicle_seoul = daily_df.loc[is_seoul].copy()
vehicle_seoul

Unnamed: 0,date,region,fuel_type,reg_count
837,2017-01-01,서울,경유,1112698.0
838,2017-01-02,서울,경유,1112698.0
839,2017-01-03,서울,경유,1112698.0
840,2017-01-04,서울,경유,1112698.0
841,2017-01-05,서울,경유,1112698.0
...,...,...,...,...
137275,2023-12-27,서울,휘발유,1658706.0
137276,2023-12-28,서울,휘발유,1658706.0
137277,2023-12-29,서울,휘발유,1658706.0
137278,2023-12-30,서울,휘발유,1658706.0


In [None]:
# The 24 times of day 00:00 .. 23:00 as datetime.time objects.
hours = pd.date_range("00:00", "23:00", freq="h").time
hour_df = pd.DataFrame({'time': hours})

# Day x hour combinations: every daily row is paired with every hour.
expanded = vehicle_seoul.merge(hour_df, how='cross')

# Glue "<date> <time>" together as text and parse it into a timestamp.
timestamp_text = expanded['date'].astype(str).str.cat(expanded['time'].astype(str), sep=' ')
expanded['datetime'] = pd.to_datetime(timestamp_text)

# Keep the final columns, order by timestamp and renumber the rows from 0.
expanded = (
    expanded
    .loc[:, ['datetime', 'region', 'fuel_type', 'reg_count']]
    .sort_values(by='datetime', ignore_index=True)
)

In [128]:
# Display the hourly vehicle frame.
expanded

Unnamed: 0,datetime,region,fuel_type,reg_count
0,2017-01-01 00:00:00,서울,경유,1112698.0
1,2017-01-01 00:00:00,서울,전기,1589.0
2,2017-01-01 00:00:00,서울,휘발유,1600398.0
3,2017-01-01 01:00:00,서울,경유,1112698.0
4,2017-01-01 01:00:00,서울,전기,1589.0
...,...,...,...,...
184027,2023-12-31 22:00:00,서울,휘발유,1658706.0
184028,2023-12-31 22:00:00,서울,경유,1008457.0
184029,2023-12-31 23:00:00,서울,전기,72937.0
184030,2023-12-31 23:00:00,서울,경유,1008457.0


In [130]:
# Hourly vehicle frame without 'region'.
# drop() returns a new frame, so `expanded` keeps its 'region' column and this
# cell can be re-run. The previous alias + in-place drop also removed the column
# from `expanded` and raised KeyError on a second run.
df_vehicle = expanded.drop(columns=['region'])

In [132]:
# Write each prepared frame to its own CSV file in the working directory,
# without the index column.
csv_exports = {
    "대기오염.csv": df_pollution,
    "날씨.csv": df_weather,
    "대기질.csv": df_quality,
    "차량.csv": df_vehicle,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)

In [133]:
# Read the four CSV files back in: df1 = pollution, df2 = weather,
# df3 = air quality, df4 = vehicles. No parse_dates is given, so 'datetime'
# comes back as text.
df1, df2, df3, df4 = (
    pd.read_csv(csv_path)
    for csv_path in ("./대기오염.csv", "./날씨.csv", "./대기질.csv", "./차량.csv")
)

In [134]:
# Inner-join pollution, weather and air quality on timestamp and district.
# The keys are spelled out: without `on`, merge joins on every column name the
# two frames happen to share, so one extra shared column would silently change
# the join.
# NOTE(review): df3 still carries the air_quality 'id' column, which therefore
# ends up in the merged frame — confirm whether that is wanted.
merge_keys = ['datetime', 'region']
df_merged = df1.merge(df2, on=merge_keys, how='inner')
df_merged = df_merged.merge(df3, on=merge_keys, how='inner')

In [135]:
# Attach the vehicle counts. No `on` is given, so the join uses every column
# name the two frames share; df4 has no 'region', so each district row is
# repeated once per matching vehicle row.
df_merged = df_merged.merge(df4)

In [136]:
# Bind the merged frame to the shorter name `df` (same object, not a copy) and display it.
df = df_merged
df

Unnamed: 0,datetime,region,no2,co,so2,o3,temperature,wind_direction,wind_speed,precipitation,humidity,id,pm10,pm25,fuel_type,reg_count
0,2017-01-01 01:00:00,강남구,0.0400,0.80,0.0050,0.0020,1.4,69.1,1.3,0.0,78.0,1590571,63.0,48.0,경유,1112698.0
1,2017-01-01 01:00:00,강남구,0.0400,0.80,0.0050,0.0020,1.4,69.1,1.3,0.0,78.0,1590571,63.0,48.0,전기,1589.0
2,2017-01-01 01:00:00,강남구,0.0400,0.80,0.0050,0.0020,1.4,69.1,1.3,0.0,78.0,1590571,63.0,48.0,휘발유,1600398.0
3,2017-01-01 02:00:00,강남구,0.0380,0.80,0.0050,0.0020,1.2,66.2,1.6,0.0,80.0,1590545,61.0,44.0,경유,1112698.0
4,2017-01-01 02:00:00,강남구,0.0380,0.80,0.0050,0.0020,1.2,66.2,1.6,0.0,80.0,1590545,61.0,44.0,전기,1589.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4206595,2023-12-30 23:00:00,중랑구,0.0215,0.43,0.0023,0.0213,2.3,350.8,1.7,0.0,93.0,2503433,32.0,23.0,전기,72937.0
4206596,2023-12-30 23:00:00,중랑구,0.0215,0.43,0.0023,0.0213,2.3,350.8,1.7,0.0,93.0,2503433,32.0,23.0,경유,1008457.0
4206597,2023-12-31 00:00:00,중랑구,0.0270,0.51,0.0025,0.0170,2.5,0.0,0.4,0.0,94.0,2503407,29.0,23.0,전기,72937.0
4206598,2023-12-31 00:00:00,중랑구,0.0270,0.51,0.0025,0.0170,2.5,0.0,0.4,0.0,94.0,2503407,29.0,23.0,경유,1008457.0


In [137]:
# Count missing values per column of the merged frame.
df.isna().sum()

datetime          0
region            0
no2               0
co                0
so2               0
o3                0
temperature       0
wind_direction    0
wind_speed        0
precipitation     0
humidity          0
id                0
pm10              0
pm25              0
fuel_type         0
reg_count         0
dtype: int64

In [138]:
# Write the merged dataset to CSV without the index column.
df.to_csv("./대기오염물질_데이터셋.csv",index=False)