In [59]:
from datetime import datetime
def parse_time(timestamp):
    """
    'YYYY-MM-DD HH:MM:SS AM/PM', 'YYYY-MM-DD HH:MM:SS', 또는 'YYYY-MM-DD HH:MM' 형식을 파싱하는 함수
    """
    timestamp = timestamp.split('.')[0]
    try:
        # 먼저 24시간 형식으로 파싱 시도
        return datetime.strptime(timestamp, '%Y-%m-%d %H:%M:%S')
    except ValueError:
        try:
            # 실패하면 12시간 형식으로 파싱 시도
            return datetime.strptime(timestamp, '%Y-%m-%d %I:%M:%S %p')
        except ValueError:
            # 실패하면 'YYYY-MM-DD HH:MM' 형식으로 파싱 시도
            return datetime.strptime(timestamp, '%Y-%m-%d %H:%M')

In [60]:
import pandas as pd 
bus_3 = pd.read_csv('test3_data.csv')
bus_4 = pd.read_csv('test4_data.csv')
bus_5 = pd.read_csv('test5_data.csv')
bus_df = pd.concat([bus_3,  bus_4, bus_5], axis=0, ignore_index=True)

In [61]:
import pandas as pd
import numpy as np
#bus_df = pd.read_csv('test3_data.csv')
bus_df['Parsed_Date']= bus_df.Information_Occurrence.apply(parse_time)
bus_df['original_index'] = bus_df.index  # 인덱스를 새로운 열로 추가
# 새로운 열 'up_down' 생성: 조건에 따라 0 또는 1로 설정
bus_df['up_down'] = bus_df['MASK_SELECTED'].apply(lambda x: 0 if 1 <= x <= 29 else 1 if 30 <= x <= 57 else None)
bus_df[28:42]

Unnamed: 0,MASK_SELECTED,Information_Occurrence,BUS_ROUTE,STOP_ORD,STOP_NAME,LAT,LNG,STOP_ID,Bus_num,Parsed_Date,original_index,up_down
28,29,2020-06-01 07:41:58,13,1,천안아산역,36.79431,127.10368,288002022,1446,2020-06-01 07:41:58,28,0
29,30,2020-06-01 07:44:15,13,2,호수공원,36.793755,127.101375,288002048,1446,2020-06-01 07:44:15,29,1
30,31,2020-06-01 07:44:49,13,3,아산역,36.791489,127.103548,288002219,1446,2020-06-01 07:44:49,30,1
31,32,2020-06-01 07:46:10,13,4,한국표준협회,36.791095,127.107774,288002054,1446,2020-06-01 07:46:10,31,1
32,33,2020-06-01 07:47:21,13,5,장재마을11단지,36.7865,127.1087,288002056,1446,2020-06-01 07:47:21,32,1
33,34,2020-06-01 07:48:50,13,6,쌍용동일하이빌,36.78462,127.11376,288001265,1446,2020-06-01 07:48:50,33,1
34,35,2020-06-01 07:51:45,13,7,신방초원아파트,36.787255,127.124237,285000851,1446,2020-06-01 07:51:45,34,1
35,36,2020-06-01 07:56:15,13,8,한라 동백2차아파트정문,36.788366,127.130297,285000844,1446,2020-06-01 07:56:15,35,1
36,37,2020-06-01 07:57:11,13,9,용곡더쉴아파트,36.789453,127.131249,285000841,1446,2020-06-01 07:57:11,36,1
37,38,2020-06-01 07:58:53,13,10,두레현대1단지아파트,36.792555,127.131429,285000839,1446,2020-06-01 07:58:53,37,1


In [62]:
bus_df = bus_df.drop(columns=['LAT', 'LNG'])
bus_stop = pd.read_csv('bus_route_distances2.csv')

# bus_stop에서 MASK_SELECTED를 기준으로 LAT와 LNG의 첫 번째 값을 가져오기
lat_lng_data = bus_stop.groupby('MASK_SELECTED')[['LAT', 'LNG']].first().reset_index()

# 기존 bus_df에서 MASK_SELECTED 기준으로 bus_stop의 LAT와 LNG 값을 업데이트
bus_df = pd.merge(bus_df, lat_lng_data, on='MASK_SELECTED', how='left')


In [63]:
import pandas as pd

# 데이터 로드 및 준비
# bus_df = pd.read_csv('bus_df.csv')  # 실제 파일 경로로 변경하세요
weather_df = pd.read_csv('SYNOP_AWOS_4332_MI_2020-09_2020-09_2020.csv', encoding='cp949')

# 날짜 변환 및 인덱스 설정
weather_df['Parsed_Date'] = pd.to_datetime(weather_df['일시'])
weather_df.set_index('일시', inplace=True)

# 필요한 열 선택 (기온, 상대습도, 풍향, 풍속)
weather_df = weather_df[['기온(℃)', '상대습도(%)', '풍향(16방위)', '풍속(m/s)', 'Parsed_Date']]

# bus_df의 Parsed_Date 열을 datetime으로 변환
bus_df['Parsed_Date'] = pd.to_datetime(bus_df['Parsed_Date'], errors='coerce')

# 1차 병합: merge_asof를 사용하여 가장 가까운 날짜로 병합
bus_df = pd.merge_asof(
    bus_df.sort_values('Parsed_Date'),
    weather_df.sort_values('Parsed_Date'),
    on='Parsed_Date',
    direction='nearest',
    suffixes=('_5min', '_nearest')
)

# 9월 9일과 10일 데이터를 위해 새로운 열 생성 (날짜만 9월 8일로 변경)
bus_df['Adjusted_Date'] = bus_df['Parsed_Date'].apply(
    lambda x: x.replace(day=8) if x.date() in [pd.Timestamp('2020-09-09').date(), pd.Timestamp('2020-09-10').date()] else x
)

# 2차 병합: 9월 8일의 기상 데이터와 새로운 Adjusted_Date를 기준으로 병합
bus_df = pd.merge_asof(
    bus_df.sort_values('Adjusted_Date'),
    weather_df.sort_values('Parsed_Date'),
    left_on='Adjusted_Date',
    right_on='Parsed_Date',
    direction='backward',  # backward로 설정하여 가장 가까운 이전 값을 가져옴
    suffixes=('', '_8th')
)

# 필요 없는 열 드롭
bus_df.drop(columns=['Adjusted_Date', 'Parsed_Date_8th'], inplace=True)

# 남아 있는 NaN 값을 이전 값으로 채우기
bus_df[['기온(℃)', '상대습도(%)', '풍향(16방위)', '풍속(m/s)']] = bus_df[['기온(℃)', '상대습도(%)', '풍향(16방위)', '풍속(m/s)']].ffill()

# 원래 순서대로 정렬
bus_df = bus_df.sort_values(by='original_index')

# 결과 확인
print(bus_df[28:42])


        MASK_SELECTED Information_Occurrence  BUS_ROUTE  STOP_ORD  \
136275             29    2020-06-01 07:41:58         13         1   
136288             30    2020-06-01 07:44:15         13         2   
136294             31    2020-06-01 07:44:49         13         3   
136304             32    2020-06-01 07:46:10         13         4   
136309             33    2020-06-01 07:47:21         13         5   
136319             34    2020-06-01 07:48:50         13         6   
136330             35    2020-06-01 07:51:45         13         7   
136359             36    2020-06-01 07:56:15         13         8   
136364             37    2020-06-01 07:57:11         13         9   
136377             38    2020-06-01 07:58:53         13        10   
136390             39    2020-06-01 08:01:46         13        11   
136397             40    2020-06-01 08:02:26         13        12   
136399             41    2020-06-01 08:03:49         13        13   
136403             42    2020-06-0

In [64]:
bus_df['is_weekend'] = bus_df['Parsed_Date'].dt.dayofweek >= 5
bus_df['day_name'] = bus_df['Parsed_Date'].dt.day_name()
bus_df.head()

Unnamed: 0,MASK_SELECTED,Information_Occurrence,BUS_ROUTE,STOP_ORD,STOP_NAME,STOP_ID,Bus_num,Parsed_Date,original_index,up_down,...,기온(℃),상대습도(%),풍향(16방위),풍속(m/s),기온(℃)_8th,상대습도(%)_8th,풍향(16방위)_8th,풍속(m/s)_8th,is_weekend,day_name
136117,1,2020-06-01 06:53:07,13,1,단대병원,285000640,1446,2020-06-01 06:53:07,0,0,...,25.4,78.1,93.0,0.8,,,,,False,Monday
136119,2,2020-06-01 06:53:33,13,2,단대병원,285000641,1446,2020-06-01 06:53:33,1,0,...,25.4,78.1,93.0,0.8,,,,,False,Monday
136120,3,2020-06-01 06:54:52,13,3,상명대학교,285000642,1446,2020-06-01 06:54:52,2,0,...,25.4,78.1,93.0,0.8,,,,,False,Monday
136123,4,2020-06-01 06:55:44,13,4,천안톨게이트,285000644,1446,2020-06-01 06:55:44,3,0,...,25.4,78.1,93.0,0.8,,,,,False,Monday
136125,5,2020-06-01 06:57:41,13,5,도솔광장,285001062,1446,2020-06-01 06:57:41,4,0,...,25.4,78.1,93.0,0.8,,,,,False,Monday


In [65]:
from pytimekr import pytimekr

rest_list = pytimekr.holidays(year = 2020) #리스트 형태로 반환

# 날짜만 비교하기 위해 Parsed_Date 열을 date 형식으로 변환
bus_df['Date'] = bus_df['Parsed_Date'].dt.date

bus_df.loc[(bus_df['Date'].isin(rest_list)) & (bus_df['is_weekend'] == False), 'is_weekend'] = True
bus_df.drop('Date', axis = 1)
        

Unnamed: 0,MASK_SELECTED,Information_Occurrence,BUS_ROUTE,STOP_ORD,STOP_NAME,STOP_ID,Bus_num,Parsed_Date,original_index,up_down,...,기온(℃),상대습도(%),풍향(16방위),풍속(m/s),기온(℃)_8th,상대습도(%)_8th,풍향(16방위)_8th,풍속(m/s)_8th,is_weekend,day_name
136117,1,2020-06-01 06:53:07,13,1,단대병원,285000640,1446,2020-06-01 06:53:07,0,0,...,25.4,78.1,93.0,0.8,,,,,False,Monday
136119,2,2020-06-01 06:53:33,13,2,단대병원,285000641,1446,2020-06-01 06:53:33,1,0,...,25.4,78.1,93.0,0.8,,,,,False,Monday
136120,3,2020-06-01 06:54:52,13,3,상명대학교,285000642,1446,2020-06-01 06:54:52,2,0,...,25.4,78.1,93.0,0.8,,,,,False,Monday
136123,4,2020-06-01 06:55:44,13,4,천안톨게이트,285000644,1446,2020-06-01 06:55:44,3,0,...,25.4,78.1,93.0,0.8,,,,,False,Monday
136125,5,2020-06-01 06:57:41,13,5,도솔광장,285001062,1446,2020-06-01 06:57:41,4,0,...,25.4,78.1,93.0,0.8,,,,,False,Monday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
267097,53,2020-08-31 22:02:25,13,25,대림한숲아파트,285000706,1729,2020-08-31 22:02:25,351514,1,...,25.4,78.1,93.0,0.8,,,,,False,Monday
267098,54,2020-08-31 22:04:53,13,26,천안톨게이트,285000643,1729,2020-08-31 22:04:53,351515,1,...,25.4,78.1,93.0,0.8,,,,,False,Monday
267099,55,2020-08-31 22:06:03,13,27,상명대학교,285001052,1729,2020-08-31 22:06:03,351516,1,...,25.4,78.1,93.0,0.8,,,,,False,Monday
267100,56,2020-08-31 22:06:38,13,28,단국대치대병원,285001595,1729,2020-08-31 22:06:38,351517,1,...,25.4,78.1,93.0,0.8,,,,,False,Monday


In [66]:
bus_route_df = pd.read_csv('bus_route_distances2.csv')
bus_df_r = pd.merge(bus_df, bus_route_df[['MASK_SELECTED','previous']], on = 'MASK_SELECTED')

In [67]:
import pandas as pd

def preprocess_start(bus_df):
    # 시간 형식 변환
    bus_df['Parsed_Date'] = pd.to_datetime(bus_df['Parsed_Date'])
    
    # 시퀀스 ID 생성
    bus_df['sequence_id'] = (bus_df['MASK_SELECTED'] == 1).cumsum() - 1  # 1번 정류장이 시작 기준
    bus_df['sequence_id'] = bus_df['sequence_id'].fillna(method='ffill')  # NaN 채우기 (보완)

    # 시퀀스 내에서만 start_time 설정
    def set_start_time(group):
        # 1번 정류장 또는 29번 정류장의 Parsed_Date를 start_time으로 설정
        start_row = group[group['MASK_SELECTED'].isin([1, 29])]
        if not start_row.empty:
            start_time = start_row.iloc[0]['Parsed_Date']
            group['start_time'] = start_time
        return group

    # 시퀀스별로 start_time 설정
    bus_df = bus_df.groupby('sequence_id', group_keys=False).apply(set_start_time)
    
    # 이전 정류장의 도착 시간(prev_arrive_time) 계산
    bus_df['prev_arrive_time'] = bus_df['Parsed_Date'].shift(1)
    
    # 시퀀스의 첫 번째 정류장의 prev_arrive_time을 NaT로 설정
    sequence_starts = bus_df['MASK_SELECTED'].isin([1, 29])
    bus_df.loc[sequence_starts, 'prev_arrive_time'] = pd.NaT

    # 결측값 처리: prev_arrive_time이 NaT인 경우 해당 행의 start_time으로 대체
    bus_df['prev_arrive_time'] = bus_df['prev_arrive_time'].fillna(bus_df['start_time'])
    
    return bus_df

# 전처리 함수 실행
bus_df_r = preprocess_start(bus_df_r)

# 결과 확인
print(bus_df_r[['sequence_id', 'MASK_SELECTED', 'Parsed_Date', 'start_time', 'prev_arrive_time']].head())


  bus_df['sequence_id'] = bus_df['sequence_id'].fillna(method='ffill')  # NaN 채우기 (보완)


   sequence_id  MASK_SELECTED         Parsed_Date          start_time  \
0            0              1 2020-06-01 06:53:07 2020-06-01 06:53:07   
1            0              2 2020-06-01 06:53:33 2020-06-01 06:53:07   
2            0              3 2020-06-01 06:54:52 2020-06-01 06:53:07   
3            0              4 2020-06-01 06:55:44 2020-06-01 06:53:07   
4            0              5 2020-06-01 06:57:41 2020-06-01 06:53:07   

     prev_arrive_time  
0 2020-06-01 06:53:07  
1 2020-06-01 06:53:07  
2 2020-06-01 06:53:33  
3 2020-06-01 06:54:52  
4 2020-06-01 06:55:44  


  bus_df = bus_df.groupby('sequence_id', group_keys=False).apply(set_start_time)


In [68]:
bus_df_r

Unnamed: 0,MASK_SELECTED,Information_Occurrence,BUS_ROUTE,STOP_ORD,STOP_NAME,STOP_ID,Bus_num,Parsed_Date,original_index,up_down,...,상대습도(%)_8th,풍향(16방위)_8th,풍속(m/s)_8th,is_weekend,day_name,Date,previous,sequence_id,start_time,prev_arrive_time
0,1,2020-06-01 06:53:07,13,1,단대병원,285000640,1446,2020-06-01 06:53:07,0,0,...,,,,False,Monday,2020-06-01,,0,2020-06-01 06:53:07,2020-06-01 06:53:07
1,2,2020-06-01 06:53:33,13,2,단대병원,285000641,1446,2020-06-01 06:53:33,1,0,...,,,,False,Monday,2020-06-01,240.0,0,2020-06-01 06:53:07,2020-06-01 06:53:07
2,3,2020-06-01 06:54:52,13,3,상명대학교,285000642,1446,2020-06-01 06:54:52,2,0,...,,,,False,Monday,2020-06-01,724.0,0,2020-06-01 06:53:07,2020-06-01 06:53:33
3,4,2020-06-01 06:55:44,13,4,천안톨게이트,285000644,1446,2020-06-01 06:55:44,3,0,...,,,,False,Monday,2020-06-01,759.0,0,2020-06-01 06:53:07,2020-06-01 06:54:52
4,5,2020-06-01 06:57:41,13,5,도솔광장,285001062,1446,2020-06-01 06:57:41,4,0,...,,,,False,Monday,2020-06-01,871.0,0,2020-06-01 06:53:07,2020-06-01 06:55:44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351514,53,2020-08-31 22:02:25,13,25,대림한숲아파트,285000706,1729,2020-08-31 22:02:25,351514,1,...,,,,False,Monday,2020-08-31,359.0,6166,2020-08-31 20:38:08,2020-08-31 22:01:46
351515,54,2020-08-31 22:04:53,13,26,천안톨게이트,285000643,1729,2020-08-31 22:04:53,351515,1,...,,,,False,Monday,2020-08-31,1340.0,6166,2020-08-31 20:38:08,2020-08-31 22:02:25
351516,55,2020-08-31 22:06:03,13,27,상명대학교,285001052,1729,2020-08-31 22:06:03,351516,1,...,,,,False,Monday,2020-08-31,910.0,6166,2020-08-31 20:38:08,2020-08-31 22:04:53
351517,56,2020-08-31 22:06:38,13,28,단국대치대병원,285001595,1729,2020-08-31 22:06:38,351517,1,...,,,,False,Monday,2020-08-31,307.0,6166,2020-08-31 20:38:08,2020-08-31 22:06:03


In [69]:
bus_df_r = bus_df_r.rename(columns={'기온(℃)': 'temperature', '상대습도(%)': 'Relative_Humidity', '풍향(16방위)' : 'wind_d', '풍속(m/s)': 'wind_s'})
bus_df = bus_df_r[['MASK_SELECTED', 'LAT', 'LNG', 'STOP_ID', 'Parsed_Date', 'up_down', 'temperature', 'Relative_Humidity', 'is_weekend', 'day_name', 'previous', 'wind_d', 'wind_s', 'Bus_num', 'prev_arrive_time', 'start_time']]

In [70]:
bus_df = bus_df.fillna(0)
nan_counts_per_column = bus_df.isna().sum()
print(nan_counts_per_column)

MASK_SELECTED        0
LAT                  0
LNG                  0
STOP_ID              0
Parsed_Date          0
up_down              0
temperature          0
Relative_Humidity    0
is_weekend           0
day_name             0
previous             0
wind_d               0
wind_s               0
Bus_num              0
prev_arrive_time     0
start_time           0
dtype: int64


In [71]:
bus_df['sequence_id'] = (bus_df['MASK_SELECTED'] == 1).cumsum() - 1
def find_incomplete_sequences(df, min_value=1, max_value=57):
    """
    1~57 사이의 모든 값을 포함하지 않는 시퀀스를 반환합니다.

    Parameters:
    - df (pd.DataFrame): 시퀀스 데이터를 포함한 데이터프레임
    - min_value (int): 누락 확인할 값의 최소값 (기본값 1)
    - max_value (int): 누락 확인할 값의 최대값 (기본값 57)

    Returns:
    - incomplete_sequences (list): 1~57을 모두 포함하지 않는 시퀀스 ID 리스트
    """
    incomplete_sequences = []

    # 시퀀스별로 그룹화
    grouped = df.groupby('sequence_id')

    for sequence_id, group in grouped:
        # 현재 시퀀스에 존재하는 값들
        present_values = set(group['MASK_SELECTED'])
        # 1~57 사이의 값이 모두 있는지 확인
        if not set(range(min_value, max_value + 1)).issubset(present_values):
            incomplete_sequences.append(sequence_id)

    return incomplete_sequences

# 누락된 값이 있는 시퀀스 찾기
incomplete_sequences = find_incomplete_sequences(bus_df)

# 결과 출력
print(f"Incomplete sequences: {incomplete_sequences}")
print(len(incomplete_sequences))

Incomplete sequences: []
0


In [72]:
bus_df= bus_df.drop_duplicates()

In [73]:
bus_df['sequence_id'] = (bus_df['MASK_SELECTED'] == 1).cumsum() - 1
def find_incomplete_sequences(df, min_value=1, max_value=57):
    """
    1~57 사이의 모든 값을 포함하지 않는 시퀀스를 반환합니다.

    Parameters:
    - df (pd.DataFrame): 시퀀스 데이터를 포함한 데이터프레임
    - min_value (int): 누락 확인할 값의 최소값 (기본값 1)
    - max_value (int): 누락 확인할 값의 최대값 (기본값 57)

    Returns:
    - incomplete_sequences (list): 1~57을 모두 포함하지 않는 시퀀스 ID 리스트
    """
    incomplete_sequences = []

    # 시퀀스별로 그룹화
    grouped = df.groupby('sequence_id')

    for sequence_id, group in grouped:
        # 현재 시퀀스에 존재하는 값들
        present_values = set(group['MASK_SELECTED'])
        # 1~57 사이의 값이 모두 있는지 확인
        if not set(range(min_value, max_value + 1)).issubset(present_values):
            incomplete_sequences.append(sequence_id)

    return incomplete_sequences

# 누락된 값이 있는 시퀀스 찾기
incomplete_sequences = find_incomplete_sequences(bus_df)

# 결과 출력
print(f"Incomplete sequences: {incomplete_sequences}")
print(len(incomplete_sequences))

Incomplete sequences: []
0


In [74]:
bus_df.to_csv('bus_df.csv', index = False)

In [75]:
import pandas as pd

# Ensure `Parsed_Date` and `start_time` are in datetime format
bus_df['Parsed_Date'] = pd.to_datetime(bus_df['Parsed_Date'])
bus_df['start_time'] = pd.to_datetime(bus_df['start_time'])

# 상행/하행 데이터 나누기
up_route = bus_df[bus_df['MASK_SELECTED'].between(1, 28)].copy()
down_route = bus_df[bus_df['MASK_SELECTED'].between(29, 57)].copy()

# 상행과 하행 각각에서 sequence_id 생성
up_route['sequence_id'] = (up_route['MASK_SELECTED'] == 1).cumsum() - 1
down_route['sequence_id'] = (down_route['MASK_SELECTED'] == 29).cumsum() - 1

# 각 시퀀스별로 first_stop_time 계산
def calculate_first_stop_time(df):
    # 각 시퀀스의 첫 번째 정류장 기준 시간 가져오기
    first_stop_times = df.loc[df['MASK_SELECTED'].isin([1, 29])].groupby('sequence_id')['Parsed_Date'].first()
    df = df.merge(first_stop_times.rename('first_stop_time'), on='sequence_id')
    return df

up_route = calculate_first_stop_time(up_route)
down_route = calculate_first_stop_time(down_route)

# 시퀀스별로 그룹화하여 누적 시간 계산
def calculate_time_from_first_stop(df):
    # 시퀀스별로 time_from_start 계산
    df['time_from_start'] = df.groupby('sequence_id').apply(
        lambda group: (group['Parsed_Date'] - group['first_stop_time']).dt.total_seconds()
    ).reset_index(drop=True)
    return df

# 상행/하행 각각 처리
up_route = calculate_time_from_first_stop(up_route)
down_route = calculate_time_from_first_stop(down_route)

# 상행과 하행 합치기
df = pd.concat([up_route, down_route]).sort_values(['sequence_id', 'MASK_SELECTED'])


  df['time_from_start'] = df.groupby('sequence_id').apply(
  df['time_from_start'] = df.groupby('sequence_id').apply(


In [76]:
df.to_csv('bus_df.csv', index = False)

In [77]:
import pandas as pd
# 인덱스를 추가하여 원래 순서를 유지
bus_df = pd.read_csv('bus_df.csv')

bus_df['Parsed_Date'] = bus_df.Parsed_Date.apply(parse_time)

# MASK_SELECTED 값이 1~28인 데이터 필터링
df_up = bus_df[(bus_df['MASK_SELECTED'] >= 1) & (bus_df['MASK_SELECTED'] <= 28)]

# MASK_SELECTED 값이 29~57인 데이터 추출
df_down = bus_df[(bus_df['MASK_SELECTED'] >= 29) & (bus_df['MASK_SELECTED'] <= 57)]

# travel_time 열 추가 (1~28 구간)
df_up['travel_time'] = df_up['Parsed_Date'].diff().dt.total_seconds()
df_up.loc[df_up['MASK_SELECTED'] == 1, 'travel_time'] = 0

# travel_time 열 추가 (29~57 구간)
df_down['travel_time'] = df_down['Parsed_Date'].diff().dt.total_seconds()
df_down.loc[df_down['MASK_SELECTED'] == 29, 'travel_time'] = 0

bus_df.to_csv('bus_df.csv', index=False)

# 각각의 데이터프레임을 csv 파일로 저장
df_up.to_csv('13_bus_up2.csv', index=False)
df_down.to_csv('13_bus_down2.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_up['travel_time'] = df_up['Parsed_Date'].diff().dt.total_seconds()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_down['travel_time'] = df_down['Parsed_Date'].diff().dt.total_seconds()


In [78]:
df_down

Unnamed: 0,MASK_SELECTED,LAT,LNG,STOP_ID,Parsed_Date,up_down,temperature,Relative_Humidity,is_weekend,day_name,previous,wind_d,wind_s,Bus_num,prev_arrive_time,start_time,sequence_id,first_stop_time,time_from_start,travel_time
28,29,36.794310,127.103680,288002022,2020-06-01 07:41:58,0,25.4,78.1,False,Monday,87.0,93.0,0.8,1446,2020-06-01 06:53:07,2020-06-01 06:53:07,0,2020-06-01 07:41:58,0.0,0.0
29,30,36.793720,127.101300,288002048,2020-06-01 07:44:15,1,25.4,78.1,False,Monday,697.0,93.0,0.8,1446,2020-06-01 07:41:58,2020-06-01 06:53:07,0,2020-06-01 07:41:58,137.0,137.0
30,31,36.791500,127.103600,288002219,2020-06-01 07:44:49,1,25.4,78.1,False,Monday,420.0,93.0,0.8,1446,2020-06-01 07:44:15,2020-06-01 06:53:07,0,2020-06-01 07:41:58,171.0,34.0
31,32,36.791010,127.107800,288002054,2020-06-01 07:46:10,1,25.4,78.1,False,Monday,501.0,93.0,0.8,1446,2020-06-01 07:44:49,2020-06-01 06:53:07,0,2020-06-01 07:41:58,252.0,81.0
32,33,36.786500,127.108700,288002056,2020-06-01 07:47:21,1,25.4,78.1,False,Monday,506.0,93.0,0.8,1446,2020-06-01 07:46:10,2020-06-01 06:53:07,0,2020-06-01 07:41:58,323.0,71.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
351514,53,36.819375,127.159581,285000706,2020-08-31 22:02:25,1,25.4,78.1,False,Monday,359.0,93.0,0.8,1729,2020-08-31 22:01:46,2020-08-31 20:38:08,6166,2020-08-31 21:28:52,2013.0,39.0
351515,54,36.828243,127.167702,285000643,2020-08-31 22:04:53,1,25.4,78.1,False,Monday,1340.0,93.0,0.8,1729,2020-08-31 22:02:25,2020-08-31 20:38:08,6166,2020-08-31 21:28:52,2161.0,148.0
351516,55,36.834283,127.173954,285001052,2020-08-31 22:06:03,1,25.4,78.1,False,Monday,910.0,93.0,0.8,1729,2020-08-31 22:04:53,2020-08-31 20:38:08,6166,2020-08-31 21:28:52,2231.0,70.0
351517,56,36.837038,127.174354,285001595,2020-08-31 22:06:38,1,25.4,78.1,False,Monday,307.0,93.0,0.8,1729,2020-08-31 22:06:03,2020-08-31 20:38:08,6166,2020-08-31 21:28:52,2266.0,35.0
