In [1]:
import pandas as pd
from google.colab import files
uploaded = files.upload()

Saving bus_dust.csv to bus_dust.csv


In [3]:
# 파일 경로 설정
file_path = 'bus_dust.csv'

# CSV 파일 로드 (인코딩 지정)
bus = pd.read_csv(file_path, encoding='cp949')


In [4]:
bus

Unnamed: 0,일시,구분,미세먼지(PM10),초미세먼지(PM2.5)
0,2022-12-31 23:00,평균,59.0,46.0
1,2022-12-31 23:00,강남구,57.0,44.0
2,2022-12-31 23:00,강동구,68.0,55.0
3,2022-12-31 23:00,강북구,59.0,42.0
4,2022-12-31 23:00,강서구,62.0,40.0
...,...,...,...,...
227755,2022-01-01 0:00,용산구,22.0,11.0
227756,2022-01-01 0:00,은평구,19.0,9.0
227757,2022-01-01 0:00,종로구,20.0,9.0
227758,2022-01-01 0:00,중구,20.0,11.0


In [7]:
# 2) 불필요한 컬럼 제거
bus = bus.drop(columns=['초미세먼지(PM2.5)'])

# 3) 컬럼명 변경
bus = bus.rename(columns={'구분': '지역'})

# 4) '일시'를 datetime 타입으로 변환
bus['일시'] = pd.to_datetime(bus['일시'], format='%Y-%m-%d %H:%M')

# 5) 월(month) 추출 및 계절 매핑
bus['계절'] = bus['일시'].dt.month.map(lambda m:
    'spring'   if 3 <= m <= 5 else
    'summer' if 6 <= m <= 8 else
    'fall' if 9 <= m <= 11 else
    'winter'
)
bus = bus.drop(columns=['일시'])

KeyError: "['초미세먼지(PM2.5)'] not found in axis"

In [8]:
bus = bus.drop(columns=['일시'])

In [9]:
bus

Unnamed: 0,지역,미세먼지(PM10),계절
0,평균,59.0,winter
1,강남구,57.0,winter
2,강동구,68.0,winter
3,강북구,59.0,winter
4,강서구,62.0,winter
...,...,...,...
227755,용산구,22.0,winter
227756,은평구,19.0,winter
227757,종로구,20.0,winter
227758,중구,20.0,winter


In [13]:
# 6) '지역'에서 '평균' 값 제거
bus = bus[bus['지역'] != '평균']

# 7) '지역' & '계절'별로 '미세먼지(PM10)' 합계, 개수, 평균 계산
df_stats = (
    bus
    .groupby(['지역', '계절'])['미세먼지(PM10)']
    .agg(총합='sum', 개수='count')
    .reset_index()
)
df_stats['평균미세먼지'] = df_stats['총합'] / df_stats['개수']


In [14]:
df_stats

Unnamed: 0,지역,계절,총합,개수,평균미세먼지
0,강남구,fall,62941.0,2169,29.018442
1,강남구,spring,82821.0,2197,37.697315
2,강남구,summer,50501.0,2183,23.133761
3,강남구,winter,82847.0,2136,38.786049
4,강동구,fall,61711.0,2133,28.931552
...,...,...,...,...,...
95,중구,winter,83903.0,2140,39.207009
96,중랑구,fall,59342.0,2161,27.460435
97,중랑구,spring,85274.0,2192,38.902372
98,중랑구,summer,51792.0,2182,23.736022


In [15]:
# 1) 총합을 개수로 나눠 평균 컬럼 추가
df_stats['평균미세먼지'] = df_stats['총합'] / df_stats['개수']

# 2) 필요하다면 '총합'과 '개수' 컬럼을 제거하고 평균만 남기기
df_avg_only = df_stats[['지역', '계절', '평균미세먼지']]


In [16]:
df_avg_only

Unnamed: 0,지역,계절,평균미세먼지
0,강남구,fall,29.018442
1,강남구,spring,37.697315
2,강남구,summer,23.133761
3,강남구,winter,38.786049
4,강동구,fall,28.931552
...,...,...,...
95,중구,winter,39.207009
96,중랑구,fall,27.460435
97,중랑구,spring,38.902372
98,중랑구,summer,23.736022


In [11]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/250.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m153.6/250.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [17]:
import pandas as pd

# 1) df_stats_m9을 엑셀 파일로 저장할 경로 지정
output_path = 'df_avg_only.xlsx'

# 2) 엑셀로 저장 (index=False로 인덱스 컬럼 제외)
df_avg_only.to_excel(output_path, index=False)

print(f"저장 완료: {output_path}")

저장 완료: df_avg_only.xlsx


In [18]:
import pandas as pd
import json
import numpy as np

In [19]:
uploaded = files.upload()

Saving confuse.csv to confuse.csv


In [29]:
# 1) 기존 혼잡도 데이터 로드
confuse = pd.read_csv('confuse.csv', encoding='cp949')

# 2) 시간대 컬럼 리스트 추출 (6번째 이후)
time_cols = confuse.columns[6:].tolist()

In [30]:
confuse

Unnamed: 0,연번,요일구분,호선,역번호,출발역,상하구분,5시30분,6시00분,6시30분,7시00분,...,20시00분,20시30분,21시00분,21시30분,22시00분,22시30분,23시00분,23시30분,00시00분,00시30분
0,1,평일,1,158,청량리,상선,7.2,6.9,4.5,8.3,...,24.8,26.1,28.2,24.5,23.0,22.2,21.7,14.9,8.5,0.0
1,2,평일,1,157,제기동,상선,7.6,8.7,6.5,8.7,...,30.0,26.0,34.8,27.5,25.7,25.4,24.2,16.8,11.6,0.0
2,3,평일,1,156,신설동,상선,6.7,11.2,7.2,9.6,...,30.7,26.8,36.3,28.6,26.6,26.1,25.2,16.1,12.6,0.0
3,4,평일,1,159,동묘앞,상선,6.3,11.8,7.4,12.2,...,32.1,30.1,41.8,29.9,29.0,23.5,27.7,13.5,14.3,0.0
4,5,평일,1,155,동대문,상선,7.4,11.2,8.3,14.0,...,35.6,33.5,40.5,34.7,32.6,26.1,31.3,18.0,13.6,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1657,1658,일요일,8,2823,남한산성입구,하선,4.6,4.2,4.3,3.8,...,13.1,13.6,14.8,12.7,12.2,11.4,7.6,5.2,0.0,
1658,1659,일요일,8,2824,단대오거리,하선,5.6,3.1,3.9,3.2,...,6.7,7.0,7.5,6.6,5.8,5.6,3.7,2.7,0.0,
1659,1660,일요일,8,2825,신흥,하선,6.2,2.7,3.6,3.2,...,4.2,4.3,4.0,4.2,3.3,3.5,2.4,1.5,0.0,
1660,1661,일요일,8,2826,수진,하선,8.8,3.8,3.7,3.5,...,2.1,2.5,1.8,1.7,1.7,1.5,0.6,0.6,0.1,


In [31]:
time_cols

['5시30분',
 '6시00분',
 '6시30분',
 '7시00분',
 '7시30분',
 '8시00분',
 '8시30분',
 '9시00분',
 '9시30분',
 '10시00분',
 '10시30분',
 '11시00분',
 '11시30분',
 '12시00분',
 '12시30분',
 '13시00분',
 '13시30분',
 '14시00분',
 '14시30분',
 '15시00분',
 '15시30분',
 '16시00분',
 '16시30분',
 '17시00분',
 '17시30분',
 '18시00분',
 '18시30분',
 '19시00분',
 '19시30분',
 '20시00분',
 '20시30분',
 '21시00분',
 '21시30분',
 '22시00분',
 '22시30분',
 '23시00분',
 '23시30분',
 '00시00분',
 '00시30분']

In [22]:
uploaded = files.upload()

Saving metro_station.json to metro_station.json


In [32]:
# 3) 지하철역 정보 로드
with open('metro_station.json', 'r', encoding='utf-8') as f:
    metro = json.load(f)['DATA']
stations_df = pd.DataFrame(metro)


In [33]:
confuse_drop = confuse.drop(columns=['연번'])

In [34]:
# 4) 추가할 노선 및 특별 노선 설정
new_lines = ['9호선','경의중앙선','경춘선','수인분당선','신림선','신분당선','우이신설선']
special = {'경의중앙선','신분당선','우이신설선'}

# 5) 요일 및 방향 설정
days = ['평일', '토요일', '일요일']
directions = ['상선', '하선']


In [35]:
# 6) 임의 데이터 생성 (각 역당 6개 레코드)
new_records = []
for line in new_lines:
    subset = stations_df[stations_df['line'] == line]
    for _, row in subset.iterrows():
        station_cd = row['station_cd']
        station_name = row['name']
        for day in days:
            for direction in directions:
                rec = {
                    '요일구분': day,
                    '호선': line,
                    '역번호': station_cd,
                    '출발역': station_name,
                    '상하구분': direction
                }
                # 시간대별 혼잡도 생성
                for tc in time_cols:
                    base = confuse_drop[tc].mean()
                    if line in special:
                        val = np.random.normal(loc=base * 1.1, scale=5)
                    else:
                        val = np.random.normal(loc=base, scale=5)
                    rec[tc] = round(max(val, 0), 1)
                new_records.append(rec)

# 7) 새 레코드 DataFrame 생성 및 병합
new_df = pd.DataFrame(new_records)
confuse_extended = pd.concat([confuse_drop, new_df], ignore_index=True)


In [36]:
confuse_extended

Unnamed: 0,요일구분,호선,역번호,출발역,상하구분,5시30분,6시00분,6시30분,7시00분,7시30분,...,20시00분,20시30분,21시00분,21시30분,22시00분,22시30분,23시00분,23시30분,00시00분,00시30분
0,평일,1,158,청량리,상선,7.2,6.9,4.5,8.3,10.4,...,24.8,26.1,28.2,24.5,23.0,22.2,21.7,14.9,8.5,0.0
1,평일,1,157,제기동,상선,7.6,8.7,6.5,8.7,12.9,...,30.0,26.0,34.8,27.5,25.7,25.4,24.2,16.8,11.6,0.0
2,평일,1,156,신설동,상선,6.7,11.2,7.2,9.6,14.8,...,30.7,26.8,36.3,28.6,26.6,26.1,25.2,16.1,12.6,0.0
3,평일,1,159,동묘앞,상선,6.3,11.8,7.4,12.2,17.7,...,32.1,30.1,41.8,29.9,29.0,23.5,27.7,13.5,14.3,0.0
4,평일,1,155,동대문,상선,7.4,11.2,8.3,14.0,21.7,...,35.6,33.5,40.5,34.7,32.6,26.1,31.3,18.0,13.6,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,평일,우이신설선,4702,솔밭공원,하선,22.4,19.4,15.9,20.5,33.5,...,30.3,28.5,41.3,36.3,26.8,16.3,21.6,18.8,10.0,6.5
2996,토요일,우이신설선,4702,솔밭공원,상선,15.0,17.4,7.3,22.4,25.8,...,36.6,29.2,27.2,27.7,38.3,25.5,17.5,18.8,5.7,3.1
2997,토요일,우이신설선,4702,솔밭공원,하선,21.6,14.7,13.4,21.1,26.2,...,25.6,29.4,28.7,29.2,29.2,32.5,7.9,13.8,5.2,4.8
2998,일요일,우이신설선,4702,솔밭공원,상선,7.0,17.2,19.3,21.8,37.0,...,28.3,31.6,27.0,21.2,27.1,24.0,25.9,7.7,2.8,10.7


In [37]:
# 1) df_stats_m9을 엑셀 파일로 저장할 경로 지정
output_path = 'confuse_extended.xlsx'

# 2) 엑셀로 저장 (index=False로 인덱스 컬럼 제외)
confuse_extended.to_excel(output_path, index=False)

print(f"저장 완료: {output_path}")

저장 완료: confuse_extended.xlsx


In [48]:
uploaded = files.upload()

Saving metro_late.csv to metro_late.csv


In [49]:
# 1) 기존 지연 데이터 로드
with open('metro_late.json', 'r', encoding='utf-8') as f:
    late_data = json.load(f)
df_late = pd.DataFrame(late_data)

In [50]:
# 2) 기본 정보 확인
print(df_late.shape)          # 행·열 개수
print(df_late.columns)        # 컬럼명 목록
print(df_late.dtypes)         # 각 컬럼의 데이터 타입
print(df_late.head())         # 상위 5개 샘플


(2056, 4)
Index(['지연일자', '노선', '지연시간대', '최대지연시간(분)'], dtype='object')
지연일자         object
노선           object
지연시간대        object
최대지연시간(분)     int64
dtype: object
         지연일자   노선   지연시간대  최대지연시간(분)
0  2023-09-01  2호선  첫차~09시         10
1  2023-09-01  2호선  첫차~09시          5
2  2023-09-01  1호선  첫차~09시         15
3  2023-09-01  1호선  첫차~09시         15
4  2023-09-01  4호선  첫차~09시         10


In [40]:
# 2) 고유한 (지연일자, 지연시간대) 조합 추출
slots = df_late[['지연일자', '지연시간대']].drop_duplicates()

# 3) 추가할 노선 및 '자주 늦는' 노선 설정
new_lines = ['9호선','경의중앙선','경춘선','수인분당선','신림선','신분당선','우이신설선']
more_often = {'경의중앙선','수인분당선','신림선'}  # 20% 확률로 한번 더 지연 발생


In [41]:
# 4) 시간대별 평균 지연시간 계산 (기존 데이터 기반)
avg_delay_by_slot = df_late.groupby('지연시간대')['최대지연시간(분)'].mean().to_dict()

# 5) 신규 레코드 생성
new_records = []
for line in new_lines:
    for _, slot in slots.iterrows():
        date = slot['지연일자']
        time_range = slot['지연시간대']
        # 기본 지연시간: 평균에 약간의 노이즈 추가
        base = avg_delay_by_slot.get(time_range, df_late['최대지연시간(분)'].mean())
        delay = max(1, int(np.random.normal(loc=base, scale=2)))
        new_records.append({
            '지연일자': date,
            '노선': line,
            '지연시간대': time_range,
            '최대지연시간(분)': delay
        })
        # '자주 늦는' 노선은 20% 확률로 추가 지연 발생
        if line in more_often and np.random.rand() < 0.2:
            extra_delay = max(1, int(np.random.normal(loc=base, scale=2)))
            new_records.append({
                '지연일자': date,
                '노선': line,
                '지연시간대': time_range,
                '최대지연시간(분)': extra_delay
            })

# 6) 병합 및 결과 예시
df_new = pd.DataFrame(new_records)
df_extended = pd.concat([df_late, df_new], ignore_index=True)

In [42]:
df_extended

Unnamed: 0,지연일자,노선,지연시간대,최대지연시간(분)
0,2023-09-01,2호선,첫차~09시,10
1,2023-09-01,2호선,첫차~09시,5
2,2023-09-01,1호선,첫차~09시,15
3,2023-09-01,1호선,첫차~09시,15
4,2023-09-01,4호선,첫차~09시,10
...,...,...,...,...
5495,2024-08-26,우이신설선,첫차~09시,10
5496,2024-08-27,우이신설선,첫차~09시,5
5497,2024-08-28,우이신설선,첫차~09시,11
5498,2024-08-29,우이신설선,첫차~09시,10


In [43]:
# 1) df_stats_m9을 엑셀 파일로 저장할 경로 지정
output_path = 'df_extended.xlsx'

# 2) 엑셀로 저장 (index=False로 인덱스 컬럼 제외)
df_extended.to_excel(output_path, index=False)

print(f"저장 완료: {output_path}")

저장 완료: df_extended.xlsx


In [44]:
uploaded = files.upload()

Saving bus_remaining.csv to bus_remaining.csv


In [46]:
# 버스 잔여석 데이터 로드 (utf-8 인코딩으로 시도)
bus_remain = pd.read_csv('bus_remaining.csv', encoding='utf-8')

# 1) 컬럼명 및 데이터 타입 정보 생성
col_info = pd.DataFrame({
    '컬럼명': bus_remain.columns,
    '데이터타입': [str(bus_remain[col].dtype) for col in bus_remain.columns]
})


  bus_remain = pd.read_csv('bus_remaining.csv', encoding='utf-8')


In [47]:
bus_remain

Unnamed: 0,노선번호,노선명,표준버스정류장ID,버스정류장ARS번호,역명,season,00시잔여승객수,1시잔여승객수,2시잔여승객수,3시잔여승객수,...,14시잔여승객수,15시잔여승객수,16시잔여승객수,17시잔여승객수,18시잔여승객수,19시잔여승객수,20시잔여승객수,21시잔여승객수,22시잔여승객수,23시잔여승객수
0,470,470번(상암차고지~안골마을),100000001,1001,종로2가사거리(00066),winter,0.0,0.0,0.0,0.000000,...,580.0,478.5,363.5,187.5,0.0,0.0,40.0,110.5,143.500000,201.000000
1,N37,N37번(진관공영차고지~송파공영차고지),100000001,1001,종로2가사거리(00089),winter,0.0,0.0,38.0,147.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,215.333333,215.333333
2,470,470번(상암차고지~안골마을),100000001,1001,종로2가사거리(00064),winter,0.0,0.0,0.0,0.000000,...,786.0,692.0,550.0,421.0,237.0,154.0,242.0,369.0,433.000000,511.000000
3,741,741번(진관차고지~헌인릉입구),100000001,1001,종로2가사거리(00073),winter,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,65.0,157.0,178.000000,210.000000
4,N37,N37번(송파공영차고지~진관공영차고지),100000001,1001,종로2가사거리(00032),winter,0.0,0.0,29.0,30.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.666667,30.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180887,3323,3323번(강동차고지~중앙보훈병원역),227000488,28100,미사강변더샵센트럴포레.동원로얄듀크(00008),summer,0.0,0.0,0.0,0.000000,...,458.0,506.0,655.0,876.0,1295.0,1394.0,1473.0,1503.0,1542.000000,1547.000000
180888,3323,3323번(강동차고지~중앙보훈병원역),227000632,28439,미사강변더샵센트럴포레.동원로얄듀크(00039),summer,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
180889,101,101번(화계사~동대문),998501983,~,동아운수종점(종점가상)(00084),summer,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000
180890,6633,6633번(강서공영차고지~여의도역),998502032,~,강서공영차고지(종점가상)(00100),summer,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000


In [1]:
import pandas as pd
from google.colab import files
uploaded = files.upload()

Saving S-DoT_Nature_202406-202505-14.csv to S-DoT_Nature_202406-202505-14.csv


In [2]:
# 파일 경로 설정
file_path = 'S-DoT_Nature_202406-202505-14.csv'

# CSV 파일 로드 (인코딩 지정)
a = pd.read_csv(file_path, encoding='cp949')


In [3]:
a

Unnamed: 0,모델명,시리얼,측정시간,지역,자치구,행정동,온도 최대 (℃),온도 평균 (℃),온도 최소 (℃),습도 최대 (%),...,암모니아 최대 (ppm),암모니아 평균 (ppm),암모니아 최소 (ppm),황화수소 최대 (ppm),황화수소 평균 (ppm),황화수소 최소 (ppm),오존 최대 (ppm),오존 평균 (ppm),오존 최소 (ppm),등록 일시
0,SDOT001,OC3CL200022,2024-06-10_12:07:00,main_street,Gangnam-gu,Daechi4-dong,23.5,23.2,23.0,100.0,...,,,,,,,,,,2024-06-10 0:07
1,SDOT001,OC3CL200011,2024-06-10_12:07:00,parks,Seoul_Grand_Park,valet_parking1,19.2,18.9,18.4,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,2024-06-10 0:07
2,SDOT001,OC3CL200029,2024-06-10_12:07:00,main_street,Gwangjin-gu,Guui1-dong,24.0,23.7,23.5,100.0,...,,,,,,,,,,2024-06-10 0:07
3,SDOT001,OC3CL200016,2024-06-10_12:07:00,main_street,Gangnam-gu,Apgujeong-dong,23.7,23.5,23.4,61.0,...,,,,,,,,,,2024-06-10 0:07
4,SDOT001,OC3CL200021,2024-06-10_12:07:00,main_street,Gangnam-gu,Sinsa-dong,23.8,23.6,23.4,61.0,...,,,,,,,,,,2024-06-10 0:07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176259,SDOT001,V02Q1940389,2024-06-16_11:07:00,residential_area,Dongjak-gu,Sindaebang2-dong,23.6,23.4,23.1,58.0,...,,,,,,,,,,2024-06-16 23:08
176260,SDOT001,V02Q1940834,2024-06-16_11:07:00,residential_area,Dongjak-gu,Sadang4-dong,23.7,23.4,23.1,54.0,...,,,,,,,,,,2024-06-16 23:08
176261,SDOT001,V02Q1940301,2024-06-16_11:07:00,traditional_markets,Dongjak-gu,Sadang2-dong,24.5,24.1,23.8,54.0,...,,,,,,,,,,2024-06-16 23:08
176262,SDOT001,V02Q1940605,2024-06-16_11:07:00,residential_area,Mapo-gu,Seogyo-dong,25.7,25.4,25.1,48.0,...,,,,,,,,,,2024-06-16 23:08


In [10]:
import pandas as pd

# 1) 전체 행 개수
n_rows = len(a)

# 2) 결측치 비율 계산
#    각 컬럼별로 결측치 개수를 세고, 비율이 0.7 초과인 컬럼 리스트 생성
cols_to_drop = [col for col in a.columns
                if a[col].isna().sum() / n_rows > 0.8]

print(f"제거 대상 컬럼 ({len(cols_to_drop)}개):", cols_to_drop)

# 3) 컬럼 제거
a_clean = a.drop(columns=cols_to_drop)

# 4) 결과 확인
print("제거 후 DataFrame 크기:", a_clean.shape)


제거 대상 컬럼 (0개): []
제거 후 DataFrame 크기: (137822, 26)


In [11]:
import pandas as pd
import math

# —————————————————————
# 1) 70% 이상 결측치인 컬럼 제거
# (이전 단계에서 수행하지 않았다면)
# —————————————————————
n_rows = len(a)
cols_high_na = [col for col in a.columns
                if a[col].isna().sum() / n_rows > 0.7]
print(f"▶ 70% 이상 결측치로 제거할 컬럼 ({len(cols_high_na)}개):", cols_high_na)
a = a.drop(columns=cols_high_na)

# —————————————————————
# 2) 지정된 메타컬럼 제거
# —————————————————————
to_drop = ['모델명', '시리얼', '지역', '행정동', '등록 일시']
# (컬럼명이 정확히 일치하는지 확인하면서 제거)
existing = [c for c in to_drop if c in a.columns]
print(f"▶ 메타컬럼 제거: {existing}")
a = a.drop(columns=existing)

# —————————————————————
# 3) 행별 결측치 비율 > 30%인 행 제거
# —————————————————————
n_cols = a.shape[1]
# 최소 non-NA 개수 계산 (NA 비율 ≤ 0.3 허용)
min_non_na = math.ceil((1 - 0.2) * n_cols)
print(f"▶ 남아있어야 할 non-NA 최소 값: {min_non_na} (총 컬럼 {n_cols}개 중)")
a = a.dropna(thresh=min_non_na, axis=0)

# —————————————————————
# 4) 최종 결과 확인
# —————————————————————
print("▶ 최종 DataFrame 크기:", a.shape)


▶ 70% 이상 결측치로 제거할 컬럼 (0개): []
▶ 메타컬럼 제거: []
▶ 남아있어야 할 non-NA 최소 값: 21 (총 컬럼 26개 중)
▶ 최종 DataFrame 크기: (127217, 26)


In [5]:
a1

['모델명',
 '시리얼',
 '측정시간',
 '지역',
 '자치구',
 '행정동',
 '온도 최대 (℃)',
 '온도 평균 (℃)',
 '온도 최소 (℃)',
 '습도 최대 (%)',
 '습도 평균 (%)',
 '습도 최소 (%)',
 '풍속 최대 (m/s)',
 '풍속 평균 (m/s)',
 '풍속 최소 (m/s)',
 '풍향 최대 (m/s)',
 '풍향 평균 (m/s)',
 '풍향 최소 (m/s)',
 '조도 최대 (lux)',
 '조도 평균 (lux)',
 '조도 최소 (lux)',
 '자외선 최대 (UV)',
 '자외선 평균 (UV)',
 '자외선 최대 (UV).1',
 '소음 최대 (dB )',
 '소음 평균 (dB )',
 '소음 최소 (dB )',
 '진동_x 최대 (mm/s)',
 '진동_x 평균(mm/s)',
 '진동_x 최소(mm/s)',
 '진동_y 최대(mm/s)',
 '진동_y 평균(mm/s)',
 '진동_y 최소(mm/s)',
 '진동_z 최대(mm/s)',
 '진동_z 평균(mm/s)',
 '진동_z 최소(mm/s)',
 '흑구온도 최대 (℃)',
 '흑구온도 평균 (℃)',
 '흑구온도 최소 (℃)',
 '이산화질소 최대 (ppm)',
 '이산화질소 평균 (ppm)',
 '이산화질소 최소 (ppm)',
 '일산화탄소 최대 (ppm)',
 '일산화탄소 평균 (ppm)',
 '일산화탄소 최소 (ppm)',
 '이산화황 최대 (ppm)',
 '이산화황 평균 (ppm)',
 '이산화황 최소 (ppm)',
 '암모니아 최대 (ppm)',
 '암모니아 평균 (ppm)',
 '암모니아 최소 (ppm)',
 '황화수소 최대 (ppm)',
 '황화수소 평균 (ppm)',
 '황화수소 최소 (ppm)',
 '오존 최대 (ppm)',
 '오존 평균 (ppm)',
 '오존 최소 (ppm)',
 '등록 일시']

In [2]:
import pandas as pd
from google.colab import files
uploaded = files.upload()

Saving weather_water.csv to weather_water.csv
Saving metro_dust.xlsx to metro_dust.xlsx
Saving bus_dust.xlsx to bus_dust.xlsx
Saving weather_seasons.json to weather_seasons.json


In [4]:
!pip install openpyxl

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/250.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━[0m [32m153.6/250.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.9/250.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [5]:
# Load datasets
weather_seasons = pd.read_json('weather_seasons.json')
bus_dust = pd.read_excel('bus_dust.xlsx')
metro_dust = pd.read_excel('metro_dust.xlsx')
weather_water = pd.read_csv('weather_water.csv')

In [6]:
weather_seasons

Unnamed: 0,observatory_id,region_id,nam,address,lng,lat,spr_temp,season,sum_temp,fal_temp,win_temp
0,509.0,1,관악,서울특별시 관악구 신림동 산56-1 (서울대학교),126.950219,37.452863,11.438756,spring,,,
1,417.0,2,금천,서울특별시 금천구 독산동 1034 (독산초등학교),126.914121,37.460314,12.775027,spring,,,
2,401.0,3,서초,서울특별시 서초구 서초동 1650 (서울교육대학교),127.017913,37.482814,13.478883,spring,,,
3,423.0,4,구로,서울특별시 구로구 궁동 213-42 (수궁동사무소),126.831229,37.486113,12.217729,spring,,,
4,410.0,5,기상청,서울특별시 동작구 신대방동 460-18 (기상청),126.920722,37.496314,12.915244,spring,,,
...,...,...,...,...,...,...,...,...,...,...,...
103,,23,북한산,서울특별시 종로구 구기동 산1 (승가사),126.954419,37.618295,,winter,,,-2.605182
104,,24,노원,서울특별시 노원구 공릉동 230-3 (육군사관학교),127.087304,37.622196,,winter,,,-0.941102
105,,25,강북,서울특별시 강북구 수유동 192-49 (강북구청 본관),126.999613,37.636093,,winter,,,0.434751
106,,26,도봉,서울특별시 도봉구 방학동 310 (신방학초등학교),127.033110,37.666091,,winter,,,-0.652582


In [7]:
weather_water

Unnamed: 0,자치구,계절,온도 최대(℃),온도 평균(℃),온도 최소(℃),습도 최대(%),습도 평균(%),습도 최소(%)
0,강남구,fall,18.68,18.28,17.90,71.22,69.62,67.95
1,강남구,spring,14.70,14.22,13.77,56.59,54.91,53.13
2,강남구,summer,28.65,28.23,27.84,78.92,77.45,75.92
3,강남구,winter,1.45,1.08,0.73,55.07,53.46,51.78
4,강동구,fall,18.04,17.60,17.19,75.33,73.71,72.02
...,...,...,...,...,...,...,...,...
95,중구,winter,1.59,1.23,0.89,53.22,51.68,50.03
96,중랑구,fall,18.85,18.40,17.97,68.07,66.53,64.92
97,중랑구,spring,15.07,14.56,14.08,56.54,54.90,53.20
98,중랑구,summer,28.92,28.46,28.02,75.47,73.82,72.14


In [9]:
import pandas as pd

# Load data
ws = pd.read_json('weather_seasons.json')
ww = pd.read_csv('weather_water.csv', encoding='utf-8')

# Select only the required columns from weather_seasons
ws_sel = ws[['nam','lng','lat','spr_temp','season','sum_temp','fal_temp','win_temp']]

# Select only the required columns from weather_water
ww_sel = ww[['자치구','계절','습도 평균(%)']]

# Rename columns for merging
ww_sel = ww_sel.rename(columns={
    '자치구': 'nam',
    '계절': 'season',
    '습도 평균(%)': 'avg_humidity'
})


In [10]:
ws_sel

Unnamed: 0,nam,lng,lat,spr_temp,season,sum_temp,fal_temp,win_temp
0,관악,126.950219,37.452863,11.438756,spring,,,
1,금천,126.914121,37.460314,12.775027,spring,,,
2,서초,127.017913,37.482814,13.478883,spring,,,
3,구로,126.831229,37.486113,12.217729,spring,,,
4,기상청,126.920722,37.496314,12.915244,spring,,,
...,...,...,...,...,...,...,...,...
103,북한산,126.954419,37.618295,,winter,,,-2.605182
104,노원,127.087304,37.622196,,winter,,,-0.941102
105,강북,126.999613,37.636093,,winter,,,0.434751
106,도봉,127.033110,37.666091,,winter,,,-0.652582


In [11]:
ww_sel

Unnamed: 0,nam,season,avg_humidity
0,강남구,fall,69.62
1,강남구,spring,54.91
2,강남구,summer,77.45
3,강남구,winter,53.46
4,강동구,fall,73.71
...,...,...,...
95,중구,winter,51.68
96,중랑구,fall,66.53
97,중랑구,spring,54.90
98,중랑구,summer,73.82


In [14]:
import pandas as pd

# 1) Load the data
ws = pd.read_json('weather_seasons.json')
ww = pd.read_csv('weather_water.csv', encoding='utf-8')

# 2) Prepare weather_seasons: select and compute avg_temp
ws_sel = ws[['nam','lng','lat','spr_temp','season','sum_temp','fal_temp','win_temp']].copy()
ws_sel['avg_temp'] = ws_sel[['spr_temp','sum_temp','fal_temp','win_temp']].sum(axis=1, skipna=True)
ws_tmp = ws_sel[['nam','season','avg_temp','lng','lat']]

# 3) Prepare weather_water: select, rename, and strip '구' suffix for matching
ww_sel = (
    ww[['자치구','계절','습도 평균(%)']]
    .rename(columns={'자치구':'nam','계절':'season','습도 평균(%)':'avg_humidity'})
    .copy()
)
ww_sel['nam'] = ww_sel['nam'].str.replace('구$','', regex=True)

# 4) Merge on nam and season
merged = ww_sel.merge(ws_tmp, on=['nam','season'], how='left')

# 5) Display the first few rows
merged.head()



Unnamed: 0,nam,season,avg_humidity,avg_temp,lng,lat
0,강남,fall,69.62,15.489523,,
1,강남,spring,54.91,13.168175,127.046412,37.512208
2,강남,summer,77.45,25.055699,127.046412,37.512208
3,강남,winter,53.46,0.69474,127.046412,37.512208
4,강동,fall,73.71,14.474759,,


In [15]:
merged.to_excel('merged.xlsx', index=False)

print("Saved to merged.xlsx")

Saved to merged.xlsx


In [16]:
import pandas as pd
from google.colab import files
uploaded = files.upload()

Saving metro_people_1.csv to metro_people_1.csv
Saving metro_people_2.csv to metro_people_2.csv
Saving metro_people_4.csv to metro_people_4.csv
Saving metro_people_3.csv to metro_people_3.csv
Saving metro_people_6.csv to metro_people_6.csv
Saving metro_people_5.csv to metro_people_5.csv
Saving metro_people_7.csv to metro_people_7.csv
Saving metro_people_8.csv to metro_people_8.csv
Saving metro_people_9.csv to metro_people_9.csv
Saving metro_people_10.csv to metro_people_10.csv
Saving metro_people_11.csv to metro_people_11.csv
Saving metro_people_12.csv to metro_people_12.csv


In [69]:
# 2) 모든 파일 읽어서 하나로 합치기
df_list = []
for fn in uploaded.keys():
    df_list.append(pd.read_csv(fn, encoding='cp949'))  # 인코딩이 다르면 'cp949' 등으로 변경
df = pd.concat(df_list, ignore_index=True)


  df_list.append(pd.read_csv(fn, encoding='cp949'))  # 인코딩이 다르면 'cp949' 등으로 변경
  df_list.append(pd.read_csv(fn, encoding='cp949'))  # 인코딩이 다르면 'cp949' 등으로 변경
  df_list.append(pd.read_csv(fn, encoding='cp949'))  # 인코딩이 다르면 'cp949' 등으로 변경
  df_list.append(pd.read_csv(fn, encoding='cp949'))  # 인코딩이 다르면 'cp949' 등으로 변경
  df_list.append(pd.read_csv(fn, encoding='cp949'))  # 인코딩이 다르면 'cp949' 등으로 변경
  df_list.append(pd.read_csv(fn, encoding='cp949'))  # 인코딩이 다르면 'cp949' 등으로 변경
  df_list.append(pd.read_csv(fn, encoding='cp949'))  # 인코딩이 다르면 'cp949' 등으로 변경


In [70]:
df

Unnamed: 0,사용년월,노선번호,노선명,표준버스정류장ID,버스정류장ARS번호,역명,00시승차총승객수,00시하차총승객수,1시승차총승객수,1시하차총승객수,...,20시하차총승객수,21시승차총승객수,21시하차총승객수,22시승차총승객수,22시하차총승객수,23시승차총승객수,23시하차총승객수,교통수단타입코드,교통수단타입명,등록일자
0,202401,470,470번(상암차고지~안골마을),100000001,1001,종로2가사거리(00066),63,99,0,0,...,261,400,228,327,155,240,125,10,서울간선버스,20240203
1,202401,N37,N37번(진관공영차고지~송파공영차고지),100000001,1001,종로2가사거리(00089),0,0,0,0,...,0,0,0,0,0,0,0,51,서울심야버스,20240203
2,202401,N37,N37번(송파공영차고지~진관공영차고지),100000001,1001,종로2가사거리(00032),8,8,187,195,...,0,0,0,0,0,0,0,51,서울심야버스,20240203
3,202401,741,741번(진관차고지~헌인릉입구),100000001,1001,종로2가사거리(00075),162,123,77,147,...,256,273,160,252,187,196,138,10,서울간선버스,20240203
4,202401,100,100번(하계동~용산구청),100000002,1002,창경궁.서울대학교병원(00031),0,0,0,0,...,17,46,31,43,24,25,2,10,서울간선버스,20240203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506879,202412,9404,9404번(분당구미~신사역),999800003,~,구미동차고지(기점가상)(00001),0,6,0,8,...,1,1,2,2,2,0,1,50,서울광역버스,20250103
506880,202412,9404,9404번(분당구미~신사역),999800004,~,구미동차고지(종점가상)(00060),0,11,0,12,...,2,0,4,0,10,0,4,50,서울광역버스,20250103
506881,202412,9409,9409번(구미동차고지~신사역),999800004,~,구미동차고지(종점가상)(00086),0,1,0,1,...,1,0,2,0,1,0,0,50,서울광역버스,20250103
506882,202412,9707,9707번(고양 가좌동~영등포역),999800005,~,가좌동종점(종점가상)(00078),0,11,0,9,...,7,0,5,0,5,0,11,50,서울광역버스,20250103


In [71]:
# 3) 불필요 컬럼 삭제
drop_cols = [
    '노선명', '표준버스정류장ID', '버스정류장ARS번호',
    '교통수단타입코드', '교통수단타입명', '등록일자'
]
# 존재하는 컬럼만 골라서 삭제
cols_to_drop = [c for c in drop_cols if c in df.columns]
df.drop(columns=cols_to_drop, inplace=True)


In [72]:
df

Unnamed: 0,사용년월,노선번호,역명,00시승차총승객수,00시하차총승객수,1시승차총승객수,1시하차총승객수,2시승차총승객수,2시하차총승객수,3시승차총승객수,...,19시승차총승객수,19시하차총승객수,20시승차총승객수,20시하차총승객수,21시승차총승객수,21시하차총승객수,22시승차총승객수,22시하차총승객수,23시승차총승객수,23시하차총승객수
0,202401,470,종로2가사거리(00066),63,99,0,0,0,0,0,...,340,394,371,261,400,228,327,155,240,125
1,202401,N37,종로2가사거리(00089),0,0,0,0,68,97,215,...,0,0,0,0,0,0,0,0,0,0
2,202401,N37,종로2가사거리(00032),8,8,187,195,82,91,0,...,0,0,0,0,0,0,0,0,0,0
3,202401,741,종로2가사거리(00075),162,123,77,147,0,0,0,...,338,489,278,256,273,160,252,187,196,138
4,202401,100,창경궁.서울대학교병원(00031),0,0,0,0,0,0,0,...,94,24,69,17,46,31,43,24,25,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506879,202412,9404,구미동차고지(기점가상)(00001),0,6,0,8,0,0,0,...,0,0,0,1,1,2,2,2,0,1
506880,202412,9404,구미동차고지(종점가상)(00060),0,11,0,12,0,0,0,...,0,6,0,2,0,4,0,10,0,4
506881,202412,9409,구미동차고지(종점가상)(00086),0,1,0,1,0,1,0,...,0,0,0,1,0,2,0,1,0,0
506882,202412,9707,가좌동종점(종점가상)(00078),0,11,0,9,0,3,0,...,0,5,0,7,0,5,0,5,0,11


In [73]:
# 4) '사용년원' → 'season'으로 리네이밍 & 계절 매핑
df.rename(columns={'사용년월': 'season'}, inplace=True)
# month 추출 후 람다로 계절 할당
df['season'] = (
    pd.to_datetime(df['season'].astype(str), format='%Y%m')
      .dt.month
      .map(lambda m:
           'winter' if m in [12, 1, 2] else
           'spring' if m in [3, 4, 5] else
           'summer' if m in [6, 7, 8] else
           'fall')
)


In [79]:
df

Unnamed: 0,season,노선번호,역명,00시승차총승객수,00시하차총승객수,1시승차총승객수,1시하차총승객수,2시승차총승객수,2시하차총승객수,3시승차총승객수,...,19시승차총승객수,19시하차총승객수,20시승차총승객수,20시하차총승객수,21시승차총승객수,21시하차총승객수,22시승차총승객수,22시하차총승객수,23시승차총승객수,23시하차총승객수
0,winter,470,종로2가사거리(00066),63,99,0,0,0,0,0,...,340,394,371,261,400,228,327,155,240,125
1,winter,N37,종로2가사거리(00089),0,0,0,0,68,97,215,...,0,0,0,0,0,0,0,0,0,0
2,winter,N37,종로2가사거리(00032),8,8,187,195,82,91,0,...,0,0,0,0,0,0,0,0,0,0
3,winter,741,종로2가사거리(00075),162,123,77,147,0,0,0,...,338,489,278,256,273,160,252,187,196,138
4,winter,100,창경궁.서울대학교병원(00031),0,0,0,0,0,0,0,...,94,24,69,17,46,31,43,24,25,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506879,winter,9404,구미동차고지(기점가상)(00001),0,6,0,8,0,0,0,...,0,0,0,1,1,2,2,2,0,1
506880,winter,9404,구미동차고지(종점가상)(00060),0,11,0,12,0,0,0,...,0,6,0,2,0,4,0,10,0,4
506881,winter,9409,구미동차고지(종점가상)(00086),0,1,0,1,0,1,0,...,0,0,0,1,0,2,0,1,0,0
506882,winter,9707,가좌동종점(종점가상)(00078),0,11,0,9,0,3,0,...,0,5,0,7,0,5,0,5,0,11


In [77]:
# 5) 그룹화 기준 설정 (예: 계절, 버스번호, 정류장명)
group_cols = ['season', '노선번호', '역명']  # 실제 컬럼명 확인 후 수정

# 6) 합산 집계
agg_df = (
    df
    .groupby(group_cols, as_index=False)
    .sum()   # 숫자형 컬럼들은 모두 합산
)


In [78]:
agg_df

Unnamed: 0,season,노선번호,역명,00시승차총승객수,00시하차총승객수,1시승차총승객수,1시하차총승객수,2시승차총승객수,2시하차총승객수,3시승차총승객수,...,19시승차총승객수,19시하차총승객수,20시승차총승객수,20시하차총승객수,21시승차총승객수,21시하차총승객수,22시승차총승객수,22시하차총승객수,23시승차총승객수,23시하차총승객수
0,fall,0017,남이장군사당(00017),0,0,0,0,0,0,0,...,24,78,19,46,13,67,7,63,5,15
1,fall,0017,남이장군사당(00027),3,7,0,0,0,0,0,...,31,198,20,147,17,174,18,138,5,109
2,fall,0017,산천동(00015),0,0,0,0,0,0,0,...,115,81,61,85,50,84,47,72,3,36
3,fall,0017,산천동(00029),1,24,0,0,0,0,0,...,21,598,19,423,38,350,8,328,7,202
4,fall,0017,산천동리버힐삼성아파트(00030),0,23,0,0,0,0,0,...,64,481,33,332,31,257,14,245,9,155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178898,winter,청와대A01,국립고궁박물관(00002),0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
178899,winter,청와대A01,청와대(00003),0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
178900,winter,청와대A01,춘추문(00004),0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
178901,winter,청와대A01,효자로입구(00001),0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [80]:
# 7) 각 그룹별 데이터 개수(몇 개가 합쳐졌는지) 구하기
counts = (
    df
    .groupby(group_cols)
    .size()
    .rename('count')   # Series 이름을 count로
    .reset_index()
)

In [81]:
counts

Unnamed: 0,season,노선번호,역명,count
0,fall,0017,남이장군사당(00017),1
1,fall,0017,남이장군사당(00027),1
2,fall,0017,산천동(00015),1
3,fall,0017,산천동(00029),1
4,fall,0017,산천동리버힐삼성아파트(00030),1
...,...,...,...,...
178898,winter,청와대A01,국립고궁박물관(00002),1
178899,winter,청와대A01,청와대(00003),1
178900,winter,청와대A01,춘추문(00004),1
178901,winter,청와대A01,효자로입구(00001),1


In [82]:
# 8) 합산 결과(agg_df)와 개수(counts)를 병합
avg_df = pd.merge(agg_df, counts, on=group_cols)

# 9) 숫자형 컬럼(시간대별 승·하차)만 count로 나누기
#    우선 평균을 계산할 대상 컬럼 리스트
num_cols = [c for c in avg_df.columns
            if any(ch in c for ch in ['승차총승객수','하차총승객수'])]

# 10) 각 숫자 컬럼을 count로 나눠 평균 만들기
for col in num_cols:
    avg_df[col] = avg_df[col] / avg_df['count']

# 11) 더는 필요 없는 count 컬럼 제거 (선택)
avg_df.drop(columns=['count'], inplace=True)


In [83]:
avg_df

Unnamed: 0,season,노선번호,역명,00시승차총승객수,00시하차총승객수,1시승차총승객수,1시하차총승객수,2시승차총승객수,2시하차총승객수,3시승차총승객수,...,19시승차총승객수,19시하차총승객수,20시승차총승객수,20시하차총승객수,21시승차총승객수,21시하차총승객수,22시승차총승객수,22시하차총승객수,23시승차총승객수,23시하차총승객수
0,fall,0017,남이장군사당(00017),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,24.0,78.0,19.0,46.0,13.0,67.0,7.0,63.0,5.0,15.0
1,fall,0017,남이장군사당(00027),3.0,7.0,0.0,0.0,0.0,0.0,0.0,...,31.0,198.0,20.0,147.0,17.0,174.0,18.0,138.0,5.0,109.0
2,fall,0017,산천동(00015),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,115.0,81.0,61.0,85.0,50.0,84.0,47.0,72.0,3.0,36.0
3,fall,0017,산천동(00029),1.0,24.0,0.0,0.0,0.0,0.0,0.0,...,21.0,598.0,19.0,423.0,38.0,350.0,8.0,328.0,7.0,202.0
4,fall,0017,산천동리버힐삼성아파트(00030),0.0,23.0,0.0,0.0,0.0,0.0,0.0,...,64.0,481.0,33.0,332.0,31.0,257.0,14.0,245.0,9.0,155.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178898,winter,청와대A01,국립고궁박물관(00002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178899,winter,청와대A01,청와대(00003),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178900,winter,청와대A01,춘추문(00004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178901,winter,청와대A01,효자로입구(00001),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
# 1) 시간대별 순유입 컬럼 생성
# hours 리스트: '00'만 두 자리, 나머지는 한 자리 문자열
hours = ['00'] + [str(h) for h in range(1, 24)]
net_cols = []

for h in hours:
    board_col = f"{h}시승차총승객수"
    alight_col = f"{h}시하차총승객수"
    net_col = f"{h}시총유입승객수"

    # 순유입 = 승차 - 하차
    avg_df[net_col] = avg_df[board_col] - avg_df[alight_col]
    net_cols.append(net_col)

# 2) 원본 승·하차 컬럼 삭제
drop_cols = [f"{h}시승차총승객수" for h in hours] + \
            [f"{h}시하차총승객수" for h in hours]
avg_df.drop(columns=drop_cols, inplace=True)

# 3) 컬럼 순서 재정렬: 그룹 기준(3개) + 순유입(24개)
group_cols = ['season', '노선번호', '역명']
avg_df = avg_df[group_cols + net_cols]

# (선택) 결과 확인
print(avg_df.shape)   # (행 수, 27)
print(avg_df.columns.tolist())


(178903, 27)
['season', '노선번호', '역명', '00시총유입승객수', '1시총유입승객수', '2시총유입승객수', '3시총유입승객수', '4시총유입승객수', '5시총유입승객수', '6시총유입승객수', '7시총유입승객수', '8시총유입승객수', '9시총유입승객수', '10시총유입승객수', '11시총유입승객수', '12시총유입승객수', '13시총유입승객수', '14시총유입승객수', '15시총유입승객수', '16시총유입승객수', '17시총유입승객수', '18시총유입승객수', '19시총유입승객수', '20시총유입승객수', '21시총유입승객수', '22시총유입승객수', '23시총유입승객수']


In [85]:
avg_df

Unnamed: 0,season,노선번호,역명,00시총유입승객수,1시총유입승객수,2시총유입승객수,3시총유입승객수,4시총유입승객수,5시총유입승객수,6시총유입승객수,...,14시총유입승객수,15시총유입승객수,16시총유입승객수,17시총유입승객수,18시총유입승객수,19시총유입승객수,20시총유입승객수,21시총유입승객수,22시총유입승객수,23시총유입승객수
0,fall,0017,남이장군사당(00017),0.0,0.0,0.0,0.0,0.0,-37.0,-5.0,...,-28.0,-81.0,-67.0,1.0,-135.0,-54.0,-27.0,-54.0,-56.0,-10.0
1,fall,0017,남이장군사당(00027),-4.0,0.0,0.0,0.0,0.0,-1.0,-38.0,...,-62.0,-84.0,-115.0,-178.0,-214.0,-167.0,-127.0,-157.0,-120.0,-104.0
2,fall,0017,산천동(00015),0.0,0.0,0.0,0.0,0.0,170.0,206.0,...,316.0,250.0,410.0,294.0,198.0,34.0,-24.0,-34.0,-25.0,-33.0
3,fall,0017,산천동(00029),-23.0,0.0,0.0,0.0,0.0,20.0,-51.0,...,-237.0,-348.0,-337.0,-474.0,-462.0,-577.0,-404.0,-312.0,-320.0,-195.0
4,fall,0017,산천동리버힐삼성아파트(00030),-23.0,0.0,0.0,0.0,0.0,22.0,90.0,...,-134.0,-180.0,-235.0,-332.0,-419.0,-417.0,-299.0,-226.0,-231.0,-146.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178898,winter,청와대A01,국립고궁박물관(00002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20.0,22.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178899,winter,청와대A01,청와대(00003),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-70.0,-78.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178900,winter,청와대A01,춘추문(00004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
178901,winter,청와대A01,효자로입구(00001),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,137.0,103.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
import pandas as pd

# 1) hours 리스트: '00','1','2',...,'23'
hours = ['00'] + [str(h) for h in range(1, 24)]

# 2) net inflow 컬럼명
net_cols = [f"{h}시총유입승객수" for h in hours]

# 3) 누적 계산 함수: hours 리스트를 재활용
def calc_cumulative(row):
    route = str(row['노선번호'])
    # 야간 운행: 23 → 00 → 1 → 2 → 3 → 4 → 5
    if route.startswith('N'):
        seq = [hours[23]] + hours[0:6]      # ['23','00','1','2','3','4','5']
    # 주간 운행: 4 → 5 → ... → 23 → 00 → 1
    else:
        seq = hours[4:24] + [hours[0], hours[1]]  # ['4',...,'23','00','1']
    total = 0
    out = {}
    for h in seq:
        key_in  = f"{h}시총유입승객수"
        key_out = f"{h}시총승객수"
        total += row[key_in]
        out[key_out] = total
    return pd.Series(out)

# 4) 각 행에 적용
cum_df = avg_df.apply(calc_cumulative, axis=1)

# 5) 원본 net inflow 컬럼들 삭제
avg_df = avg_df.drop(columns=net_cols)

# 6) 누적 컬럼 병합 & 컬럼 순서 정리
cum_cols = [f"{h}시총승객수" for h in hours]
avg_df = pd.concat([avg_df, cum_df], axis=1)
avg_df = avg_df[['season','노선번호','역명'] + cum_cols]

# 확인
print(avg_df.shape)
print(avg_df.columns.tolist())


Exception ignored in: <function ZipFile.__del__ at 0x7b0d3fcb4c20>
Traceback (most recent call last):
  File "/usr/lib/python3.11/zipfile.py", line 1895, in __del__
    self.close()
  File "/usr/lib/python3.11/zipfile.py", line 1912, in close
    self.fp.seek(self.start_dir)
ValueError: seek of closed file


(178903, 27)
['season', '노선번호', '역명', '00시총승객수', '1시총승객수', '2시총승객수', '3시총승객수', '4시총승객수', '5시총승객수', '6시총승객수', '7시총승객수', '8시총승객수', '9시총승객수', '10시총승객수', '11시총승객수', '12시총승객수', '13시총승객수', '14시총승객수', '15시총승객수', '16시총승객수', '17시총승객수', '18시총승객수', '19시총승객수', '20시총승객수', '21시총승객수', '22시총승객수', '23시총승객수']


In [89]:
avg_df

Unnamed: 0,season,노선번호,역명,00시총승객수,1시총승객수,2시총승객수,3시총승객수,4시총승객수,5시총승객수,6시총승객수,...,14시총승객수,15시총승객수,16시총승객수,17시총승객수,18시총승객수,19시총승객수,20시총승객수,21시총승객수,22시총승객수,23시총승객수
0,fall,0017,남이장군사당(00017),-709.0,-709.0,,,0.0,-37.0,-42.0,...,-226.0,-307.0,-374.0,-373.0,-508.0,-562.0,-589.0,-643.0,-699.0,-709.0
1,fall,0017,남이장군사당(00027),-1495.0,-1495.0,,,0.0,-1.0,-39.0,...,-225.0,-309.0,-424.0,-602.0,-816.0,-983.0,-1110.0,-1267.0,-1387.0,-1491.0
2,fall,0017,산천동(00015),5060.0,5060.0,,,0.0,170.0,376.0,...,3990.0,4240.0,4650.0,4944.0,5142.0,5176.0,5152.0,5118.0,5093.0,5060.0
3,fall,0017,산천동(00029),-4738.0,-4738.0,,,0.0,20.0,-31.0,...,-1286.0,-1634.0,-1971.0,-2445.0,-2907.0,-3484.0,-3888.0,-4200.0,-4520.0,-4715.0
4,fall,0017,산천동리버힐삼성아파트(00030),-2753.0,-2753.0,,,0.0,22.0,112.0,...,-245.0,-425.0,-660.0,-992.0,-1411.0,-1828.0,-2127.0,-2353.0,-2584.0,-2730.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178898,winter,청와대A01,국립고궁박물관(00002),123.0,123.0,,,0.0,0.0,0.0,...,81.0,103.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0
178899,winter,청와대A01,청와대(00003),-310.0,-310.0,,,0.0,0.0,0.0,...,-231.0,-309.0,-310.0,-310.0,-310.0,-310.0,-310.0,-310.0,-310.0,-310.0
178900,winter,청와대A01,춘추문(00004),-60.0,-60.0,,,0.0,0.0,0.0,...,-81.0,-77.0,-60.0,-60.0,-60.0,-60.0,-60.0,-60.0,-60.0,-60.0
178901,winter,청와대A01,효자로입구(00001),514.0,514.0,,,0.0,0.0,0.0,...,411.0,514.0,514.0,514.0,514.0,514.0,514.0,514.0,514.0,514.0


In [90]:
# 1) NaN → 0
avg_df.fillna(0, inplace=True)

# 2) 숫자형 컬럼(누적 승객수)만 절댓값으로 변환
#    그룹 키컬럼은 제외
num_cols = [c for c in avg_df.columns if c.endswith('시총승객수')]
avg_df[num_cols] = avg_df[num_cols].abs()


In [91]:
avg_df

Unnamed: 0,season,노선번호,역명,00시총승객수,1시총승객수,2시총승객수,3시총승객수,4시총승객수,5시총승객수,6시총승객수,...,14시총승객수,15시총승객수,16시총승객수,17시총승객수,18시총승객수,19시총승객수,20시총승객수,21시총승객수,22시총승객수,23시총승객수
0,fall,0017,남이장군사당(00017),709.0,709.0,0.0,0.0,0.0,37.0,42.0,...,226.0,307.0,374.0,373.0,508.0,562.0,589.0,643.0,699.0,709.0
1,fall,0017,남이장군사당(00027),1495.0,1495.0,0.0,0.0,0.0,1.0,39.0,...,225.0,309.0,424.0,602.0,816.0,983.0,1110.0,1267.0,1387.0,1491.0
2,fall,0017,산천동(00015),5060.0,5060.0,0.0,0.0,0.0,170.0,376.0,...,3990.0,4240.0,4650.0,4944.0,5142.0,5176.0,5152.0,5118.0,5093.0,5060.0
3,fall,0017,산천동(00029),4738.0,4738.0,0.0,0.0,0.0,20.0,31.0,...,1286.0,1634.0,1971.0,2445.0,2907.0,3484.0,3888.0,4200.0,4520.0,4715.0
4,fall,0017,산천동리버힐삼성아파트(00030),2753.0,2753.0,0.0,0.0,0.0,22.0,112.0,...,245.0,425.0,660.0,992.0,1411.0,1828.0,2127.0,2353.0,2584.0,2730.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178898,winter,청와대A01,국립고궁박물관(00002),123.0,123.0,0.0,0.0,0.0,0.0,0.0,...,81.0,103.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0
178899,winter,청와대A01,청와대(00003),310.0,310.0,0.0,0.0,0.0,0.0,0.0,...,231.0,309.0,310.0,310.0,310.0,310.0,310.0,310.0,310.0,310.0
178900,winter,청와대A01,춘추문(00004),60.0,60.0,0.0,0.0,0.0,0.0,0.0,...,81.0,77.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
178901,winter,청와대A01,효자로입구(00001),514.0,514.0,0.0,0.0,0.0,0.0,0.0,...,411.0,514.0,514.0,514.0,514.0,514.0,514.0,514.0,514.0,514.0


In [92]:
avg_df.to_excel('metro_boarding_alighting_summary.xlsx', index=False)


KeyboardInterrupt: 

In [93]:
avg_df.to_csv('metro_boarding_alighting_summary.csv', index=False)
