## 판다스는 CSV의 빈 행을 어떻게 읽어오나? -> 기본 설정(skip_blank_lines=True)에서는 빈 행을 건너뛰고 읽지 않음

In [9]:
import pandas as pd
from io import StringIO

# Sample CSV text that contains an empty line between the first and second records.
data = """
DAY_TYPE,BUSINFOUNIT_ID,LEN,DEP_TIME,TIME_GAP
1,000010002,388,00:00,127

3,000010004,259,00:00,43
2,000010006,500,01:00,200
"""

# Parse the text with pandas. BUSINFOUNIT_ID is read as text so that its
# leading zeros are preserved; the empty line does not produce a row.
id_as_text = {'BUSINFOUNIT_ID': str}
csv_buffer = StringIO(data)
df = pd.read_csv(csv_buffer, dtype=id_as_text)
print(df)


   DAY_TYPE BUSINFOUNIT_ID  LEN DEP_TIME  TIME_GAP
0         1      000010002  388    00:00       127
1         3      000010004  259    00:00        43
2         2      000010006  500    01:00       200


In [5]:
import pandas as pd
# An empty field (",,") is parsed as a missing value (NaN); the LEN column
# therefore comes back as floats.
data = """
DAY_TYPE,BUSINFOUNIT_ID,LEN,DEP_TIME,TIME_GAP
1,000010002,388,00:00,127
3,000010004,,00:00,43
2,000010006,500,01:00,200
"""

# Keep the ID column as text so its leading zeros survive parsing.
df = pd.read_csv(StringIO(data), dtype=dict(BUSINFOUNIT_ID=str))
print(df)

   DAY_TYPE BUSINFOUNIT_ID    LEN DEP_TIME  TIME_GAP
0         1      000010002  388.0    00:00       127
1         3      000010004    NaN    00:00        43
2         2      000010006  500.0    01:00       200


In [30]:
import os
# Express the filesystem root '/' relative to the current working directory.
# NOTE: relpath returns a *relative* path (a chain of ".." segments when the
# working directory is nested), not an absolute one.
root_dir = os.path.relpath('/', start=os.curdir)

print(f"루트 디렉토리: {root_dir}")

루트 디렉토리: ..\..\..\..\..


In [38]:
# Resolve a path given relative to the current working directory into an
# absolute, normalised path.
relative_path = './'
current_dir = os.getcwd()
target_dir = os.path.abspath(os.path.join(current_dir, relative_path))

print(f"현재 디렉토리: {current_dir}")
# Label the result with the relative path that was actually resolved; the
# label used to be a hard-coded "../../" that did not match relative_path.
print(f"{relative_path}: {target_dir}")

현재 디렉토리: D:\WorkSpaces\pyCharm_workspace\busArrivalPred\src\preprocess
../../: D:\WorkSpaces\pyCharm_workspace\busArrivalPred\src\preprocess


# LSTM 빈칸 -1 채우기

In [25]:
import pandas as pd
import numpy as np

# Build the source frame: eight consecutive one-minute departures that all
# share the same day type, unit ID and length.
n_rows = 8
data = {
    'DAY_TYPE': [3] * n_rows,
    'BUSINFOUNIT_ID': ['000010783'] * n_rows,
    'LEN': [292] * n_rows,
    'DEP_TIME': pd.date_range(start='2024-06-04 07:00:00', periods=n_rows, freq='1min'),
}
data_pd = pd.DataFrame(data)

# Stand-in predictions (arbitrary values used instead of real model output),
# shaped as one single-element list per prediction.
predictions_min = [[71], [75], [68]]

# Settings
config = {'seq_length': 5}

# Flatten the predictions and turn them into an integer column.
predictions_min_flat = [row[0] for row in predictions_min]
rounded_estimates = np.round(predictions_min_flat, 0).astype(int)
predictions_df = pd.DataFrame({'TIME_GAP_ESTIMATE': rounded_estimates})

print("Original data:")
print(data_pd)
print("\nPredictions:")
print(predictions_df)

Original data:
   DAY_TYPE BUSINFOUNIT_ID  LEN            DEP_TIME
0         3      000010783  292 2024-06-04 07:00:00
1         3      000010783  292 2024-06-04 07:01:00
2         3      000010783  292 2024-06-04 07:02:00
3         3      000010783  292 2024-06-04 07:03:00
4         3      000010783  292 2024-06-04 07:04:00
5         3      000010783  292 2024-06-04 07:05:00
6         3      000010783  292 2024-06-04 07:06:00
7         3      000010783  292 2024-06-04 07:07:00

Predictions:
   TIME_GAP_ESTIMATE
0                 71
1                 75
2                 68


In [26]:

# Line the predictions up with the source rows: after resetting the source
# index, the predictions take the index labels starting at position
# seq_length, one label per prediction.
data_pd = data_pd.reset_index(drop=True)
first_row = config['seq_length']
last_row = first_row + len(predictions_df)
predictions_df.index = data_pd.index[first_row:last_row]

print("\nPredictions with adjusted index:")
print(predictions_df)



Predictions with adjusted index:
   TIME_GAP_ESTIMATE
5                 71
6                 75
7                 68


In [27]:

# Attach the predictions to the source frame. Every row starts at the
# sentinel -1; only the rows whose index label appears in predictions_df are
# overwritten with the estimate.
data_pd['TIME_GAP_ESTIMATE'] = -1
predicted_rows = predictions_df.index
data_pd.loc[predicted_rows, 'TIME_GAP_ESTIMATE'] = predictions_df['TIME_GAP_ESTIMATE']

print("\nFinal result:")
print(data_pd)



Final result:
   DAY_TYPE BUSINFOUNIT_ID  LEN            DEP_TIME  TIME_GAP_ESTIMATE
0         3      000010783  292 2024-06-04 07:00:00                 -1
1         3      000010783  292 2024-06-04 07:01:00                 -1
2         3      000010783  292 2024-06-04 07:02:00                 -1
3         3      000010783  292 2024-06-04 07:03:00                 -1
4         3      000010783  292 2024-06-04 07:04:00                 -1
5         3      000010783  292 2024-06-04 07:05:00                 71
6         3      000010783  292 2024-06-04 07:06:00                 75
7         3      000010783  292 2024-06-04 07:07:00                 68


## 그룹화된 데이터가 매우 적거나 시퀀스 길이보다 짧은 경우 -> 0으로 패딩

In [2]:
import pandas as pd

# Sample frame with two route/unit groups of five one-minute departures each
# (07:00-07:04 for the first group, 08:00-08:04 for the second).
rows_per_group = 5
data = {
    'BUSROUTE_ID': ['210000009'] * rows_per_group + ['210000010'] * rows_per_group,
    'BUSINFOUNIT_ID': ['000011248'] * rows_per_group + ['000011249'] * rows_per_group,
    'DEP_TIME': [
        f'2024-06-04 {hour}:0{minute}:00'
        for hour in ('07', '08')
        for minute in range(5)
    ],
    'LEN': [500, 550, 600, 620, 700, 400, 450, 480, 500, 550],
    'TIME_GAP': list(range(60, 101, 10)) + list(range(50, 91, 10)),
}
data_pd = pd.DataFrame(data)
print(data_pd)

  BUSROUTE_ID BUSINFOUNIT_ID             DEP_TIME  LEN  TIME_GAP
0   210000009      000011248  2024-06-04 07:00:00  500        60
1   210000009      000011248  2024-06-04 07:01:00  550        70
2   210000009      000011248  2024-06-04 07:02:00  600        80
3   210000009      000011248  2024-06-04 07:03:00  620        90
4   210000009      000011248  2024-06-04 07:04:00  700       100
5   210000010      000011249  2024-06-04 08:00:00  400        50
6   210000010      000011249  2024-06-04 08:01:00  450        60
7   210000010      000011249  2024-06-04 08:02:00  480        70
8   210000010      000011249  2024-06-04 08:03:00  500        80
9   210000010      000011249  2024-06-04 08:04:00  550        90


In [3]:
# A group that contains only three rows.
group_size = 3
group_data = {
    'BUSROUTE_ID': ['210000011'] * group_size,
    'BUSINFOUNIT_ID': ['000011250'] * group_size,
    'DEP_TIME': [f'2024-06-04 09:0{minute}:00' for minute in range(3)],
    'LEN': [450, 500, 550],
    'TIME_GAP': [50, 55, 60],
}
group_df = pd.DataFrame(group_data)
print(group_df)

  BUSROUTE_ID BUSINFOUNIT_ID             DEP_TIME  LEN  TIME_GAP
0   210000011      000011250  2024-06-04 09:00:00  450        50
1   210000011      000011250  2024-06-04 09:01:00  500        55
2   210000011      000011250  2024-06-04 09:02:00  550        60


In [4]:
import numpy as np

# When a group holds fewer rows than the sequence length, pad it with zeros.
seq_length = 5
features = np.array([[450], [500], [550]])  # rows actually present in the group

# Number of zero rows needed to reach seq_length. It is clamped at 0 so that
# padded_sequence is always defined: it used to be assigned only inside an
# `if len(features) < seq_length` branch, so the print below raised NameError
# for a group that was already long enough.
missing_rows = max(seq_length - len(features), 0)
padding = np.zeros((missing_rows, features.shape[1]))
padded_sequence = np.vstack((features, padding))

print(padded_sequence)


[[450.]
 [500.]
 [550.]
 [  0.]
 [  0.]]


# csv 정렬

In [7]:
import pandas as pd
import os
# Input CSV (path is relative to the working directory).
csv = '../../dataset/inference/route/LSTM/inf_0604.csv'

# Column dtypes. TIME_GAP is read as float32 rather than int32 because an
# integer dtype cannot represent NaN.
dtype_spec = dict(
    DAY_TYPE='int8',
    BUSROUTE_ID='str',
    BUSINFOUNIT_ID='str',
    LEN='int32',
    DEP_TIME='str',
    TIME_GAP='float32',
)
# Load exactly the columns that have a dtype declared, in the same order.
usecols = list(dtype_spec)

# Read the file and order the rows by route, then unit, then departure time.
sort_keys = ['BUSROUTE_ID', 'BUSINFOUNIT_ID', 'DEP_TIME']
data_pd = pd.read_csv(csv, skipinitialspace=True, usecols=usecols, dtype=dtype_spec)
data_pd = data_pd.sort_values(by=sort_keys)

# Save the result (the folder is created if it does not exist yet).
results_folder = '../../dataset/inference/route/LSTM'
os.makedirs(results_folder, exist_ok=True)
result_file = os.path.join(results_folder, 'inf_0604_sort.csv')
data_pd.to_csv(result_file, index=False)
print(f"정렬 파일 저장 완료: {result_file}")


정렬 파일 저장 완료: ../../../dataset/inference/route/LSTM\inf_0604_sort.csv


# 시간대 필터링 저장

In [4]:
import pandas as pd
import os

# Source CSV to filter; the result is written next to it with a
# "_filtered" suffix appended to the file name.
csv = '../../dataset/inference/route/241007_모든노선_8.1~8.14_평일_특성추가/inf_filtered.csv'

# Column dtypes. TIME_GAP and SPEED are read as float32: with an integer dtype
# read_csv fails as soon as a value is missing, which would defeat the NaN
# filtering below. Both are cast to int32 once incomplete rows are removed.
dtype_spec = {
    'DAY_TYPE': 'int8',
    'BUSROUTE_ID': 'str',
    'PEEK_ALLOC': 'int16',
    'NPEEK_ALLOC': 'int16',
    'ROUTE_LEN': 'int32',
    'BUSSTOP_CNT': 'int16',
    'BUSINFOUNIT_ID': 'str',
    'INFOUNIT_SEQ': 'int16',
    'LEN': 'int32',
    'GPS_COORDX': 'float32',
    'GPS_COORDY': 'float32',
    'COLLECT_DATE': 'str',
    'DEP_TIME': 'str',
    'TIME_GAP': 'float32',
    'SPEED': 'float32',
}
# SPEED has to be loaded because it is cleaned and converted below; it was
# missing from this list, so the SPEED steps raised KeyError.
usecols = [
    'DAY_TYPE', 'BUSROUTE_ID', 'PEEK_ALLOC', 'NPEEK_ALLOC',
    'ROUTE_LEN', 'BUSSTOP_CNT', 'BUSINFOUNIT_ID', 'INFOUNIT_SEQ',
    'LEN', 'GPS_COORDX', 'GPS_COORDY', 'DEP_TIME', 'TIME_GAP', 'SPEED',
]
data_pd = pd.read_csv(csv, skipinitialspace=True, usecols=usecols, dtype=dtype_spec)

# Drop rows whose SPEED or TIME_GAP is missing. Empty CSV fields in a numeric
# column are read as NaN, so no separate comparison against '' is needed.
data_pd = data_pd.dropna(subset=['SPEED', 'TIME_GAP'])
data_pd.reset_index(drop=True, inplace=True)

# TIME_GAP as integer; SPEED rounded to the nearest integer.
data_pd['TIME_GAP'] = data_pd['TIME_GAP'].astype('int32')
data_pd['SPEED'] = data_pd['SPEED'].round().astype('int32')

# Keep only departures inside 07:00~09:00, 13:00~15:00 and 18:00~20:00
# (both ends inclusive). DEP_TIME is parsed and compared by time of day
# instead of as raw text: a lexicographic comparison is only correct for
# bare "HH:MM" values, and e.g. any "2024-..." timestamp would sort between
# '18:00' and '20:00'.
# NOTE(review): the on-disk DEP_TIME format is not visible here; unparseable
# values make pd.to_datetime raise rather than being dropped silently.
dep_time = pd.to_datetime(data_pd['DEP_TIME'])
seconds_of_day = dep_time.dt.hour * 3600 + dep_time.dt.minute * 60 + dep_time.dt.second
time_windows = [(7, 9), (13, 15), (18, 20)]  # (start hour, end hour)
valid_times = pd.Series(False, index=data_pd.index)
for start_hour, end_hour in time_windows:
    # Rows with a missing DEP_TIME compare as False and are left out.
    valid_times |= seconds_of_day.between(start_hour * 3600, end_hour * 3600)

# Extract the rows that fall inside the windows.
data_pd = data_pd[valid_times]

# Remove duplicates, keeping the first row per (route, unit, departure time).
data_pd = data_pd.drop_duplicates(subset=['BUSROUTE_ID', 'BUSINFOUNIT_ID', 'DEP_TIME'], keep='first')

# Save the result next to the input file.
input_filename = os.path.basename(csv)
output_filename = os.path.splitext(input_filename)[0] + '_filtered.csv'
result_file = os.path.join(os.path.dirname(csv), output_filename)
data_pd.to_csv(result_file, index=False)

# 스케일러

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Toy data: a training frame and a test frame with the same two columns.
train_data = pd.DataFrame({
    'feature1': [1, 2, 3, 4, 5],
    'feature2': [10, 20, 30, 40, 50],
})
test_data = pd.DataFrame({
    'feature1': [6, 7, 8],
    'feature2': [60, 70, 80],
})

# 1. Create the scaler and fit it on the training data only, so that the
#    mean and standard deviation come from the training set.
scaler = StandardScaler().fit(train_data)

# 2. Rescale both frames with the statistics learned from the training data.
scaled_train_data, scaled_test_data = (
    scaler.transform(frame) for frame in (train_data, test_data)
)

# Show the inputs next to their scaled versions.
print("Original Train Data:\n", train_data)
print("\nScaled Train Data:\n", scaled_train_data)
print("\nOriginal Test Data:\n", test_data)
print("\nScaled Test Data:\n", scaled_test_data)

# Show the fitted per-column mean and standard deviation.
print("\nMean:\n", scaler.mean_)
print("\nStandard Deviation:\n", scaler.scale_)


Original Train Data:
    feature1  feature2
0         1        10
1         2        20
2         3        30
3         4        40
4         5        50

Scaled Train Data:
 [[-1.41421356 -1.41421356]
 [-0.70710678 -0.70710678]
 [ 0.          0.        ]
 [ 0.70710678  0.70710678]
 [ 1.41421356  1.41421356]]

Original Test Data:
    feature1  feature2
0         6        60
1         7        70
2         8        80

Scaled Test Data:
 [[2.12132034 2.12132034]
 [2.82842712 2.82842712]
 [3.53553391 3.53553391]]

Mean:
 [ 3. 30.]

Standard Deviation:
 [ 1.41421356 14.14213562]


# 해시 인코딩

In [23]:
import hashlib

def hash_function(value, hash_size):
    # 문자열로 변환 후 해시 적용
    if not isinstance(value, str):
        value = str(value)
    # MD5 해시 함수 적용 후, 16진수를 10진수로 변환
    hashed_value = int(hashlib.md5(value.encode()).hexdigest(), 16)
    # hash_size 크기에 맞게 나머지 연산을 통해 고정된 범위로 변환
    return hashed_value % hash_size

# Number of hash buckets (100 in this example).
hash_size = 100

# Sample IDs; BUSROUTE_ID and BUSINFOUNIT_ID are hash-encoded separately.
busroute_ids = ['1001', '1002', '1003', '1001']
businfounit_ids = ['A001', 'B002', 'A001', 'B003']

# Encode every ID of each list into its bucket number.
hashed_busroute_ids = list(map(lambda route_id: hash_function(route_id, hash_size), busroute_ids))
hashed_businfounit_ids = list(map(lambda unit_id: hash_function(unit_id, hash_size), businfounit_ids))

# Show the encoded values.
print("Hashed BUSROUTE_IDs:", hashed_busroute_ids)
print("Hashed BUSINFOUNIT_IDs:", hashed_businfounit_ids)


Hashed BUSROUTE_IDs: [6, 10, 48, 6]
Hashed BUSINFOUNIT_IDs: [46, 3, 46, 36]
