資料處理（按日期跟航班代碼排序）

In [11]:
import os
import pandas as pd

def _weekday_names(departure):
    """Return the weekday name of every timestamp in *departure*.

    The names come from the ``zh_TW`` system locale, as before.  When that
    locale is not installed, ``day_name`` raises ``locale.Error``; in that
    case a built-in table is used instead of aborting the whole load.
    """
    import locale

    try:
        return departure.dt.day_name(locale='zh_TW')
    except locale.Error:
        # NOTE(review): these spellings are only used when zh_TW is missing;
        # confirm they match what your platform's locale produces.
        fallback = {
            0: '星期一', 1: '星期二', 2: '星期三', 3: '星期四',
            4: '星期五', 5: '星期六', 6: '星期日',
        }
        # dayofweek is Monday == 0; missing dates map to None.
        return departure.dt.dayofweek.map(fallback.get)


def _clean_daily_frame(df, file_date):
    """Clean one daily snapshot and add the derived columns.

    Keeps only rows whose price looks like a number, normalises the departure
    date to ``YYYY-MM-DD``, converts the price to ``int``, and adds the
    weekday and ``day left`` (days between *file_date* and departure) columns.
    """
    df = df.dropna(how="all")

    # Keep rows whose price consists only of "NT$", commas, digits and spaces.
    # na=False drops missing prices instead of producing a NaN mask (which
    # cannot be used for boolean indexing); the extra digit check rejects
    # values such as "NT$" that would otherwise fail the int conversion.
    price_text = df['價格']
    looks_numeric = (
        price_text.str.match(r'^[NT\$,\d\s]+$', na=False)
        & price_text.str.contains(r'\d', na=False)
    )
    # Copy so the column assignments below write to an independent frame.
    df = df[looks_numeric].copy()

    # Parse the departure date once and reuse it for every derived column.
    departure = pd.to_datetime(
        df['出發日期'].str.extract(r'(\d{4}-\d{2}-\d{2})')[0]
    )
    df['出發日期'] = departure.dt.strftime('%Y-%m-%d')
    df['星期'] = _weekday_names(departure)
    df['價格'] = df['價格'].str.replace(r'[NT\$,\s]', '', regex=True).astype(int)
    df['day left'] = (departure - file_date).dt.days

    # Place the weekday column right after the departure-time column.
    cols = [col for col in df.columns if col != '星期']
    cols.insert(cols.index('出發時間') + 1, '星期')
    return df[cols]


def load_and_merge_files(start_date, end_date, base_path, year=2024):
    """Load the daily ``tokyo_MMDD.csv`` snapshots and merge them.

    Args:
        start_date: First snapshot date as an ``'MMDD'`` string.
        end_date: Last snapshot date (inclusive) as an ``'MMDD'`` string.
        base_path: Directory containing the ``tokyo_MMDD.csv`` files.
        year: Calendar year the snapshot dates belong to (default 2024).

    Returns:
        A DataFrame with the cleaned rows of every snapshot found, in date
        order, or an empty DataFrame when no snapshot could be loaded.
        Missing, zero-byte and header-only files are skipped.

    Raises:
        KeyError: If a snapshot lacks the 出發日期 or 價格 column.
        ValueError: If a snapshot lacks the 出發時間 column.
    """
    dates = pd.date_range(start=f'{year}-{start_date[:2]}-{start_date[2:]}',
                          end=f'{year}-{end_date[:2]}-{end_date[2:]}')

    data_frames = []
    for file_date in dates:
        file_path = os.path.join(base_path, f"tokyo_{file_date.strftime('%m%d')}.csv")
        if not os.path.exists(file_path):
            continue
        try:
            # Read price and date as text so the .str accessors work even
            # when a file happens to contain only plain digits.
            df = pd.read_csv(file_path, dtype={'價格': str, '出發日期': str})
        except pd.errors.EmptyDataError:
            # Zero-byte file: nothing to load for this day.
            continue
        if df.empty:
            continue
        data_frames.append(_clean_daily_frame(df, file_date))

    if not data_frames:
        return pd.DataFrame()
    return pd.concat(data_frames, ignore_index=True)

def save_csv_with_headers(data, output_path):
    """Write *data* to a CSV file, repeating the header before each flight group.

    Rows are written in the order given.  A header line is emitted whenever
    the 航班代碼 value differs from the previously written row's value, so the
    output is a sequence of per-flight sections.  Rows whose 出發日期 or
    航班代碼 is missing are skipped.

    Args:
        data: DataFrame containing at least the 出發日期 and 航班代碼 columns.
        output_path: Destination file path; written as UTF-8 with a BOM.
    """
    import csv

    header = list(data.columns)
    previous_flight_code = None

    # newline='' lets the csv module control line endings; '\n' keeps the
    # same terminator the file had before.
    with open(output_path, 'w', encoding='utf-8-sig', newline='') as f:
        # csv.writer quotes fields containing commas, quotes or newlines,
        # which a plain ','.join would silently turn into extra columns.
        writer = csv.writer(f, lineterminator='\n')
        for _, row in data.iterrows():
            if pd.isna(row['出發日期']) or pd.isna(row['航班代碼']):
                continue
            # NOTE(review): the section break keys on the flight code only,
            # so the same code on consecutive dates shares one header —
            # confirm that is intended.
            if row['航班代碼'] != previous_flight_code:
                writer.writerow(header)
                previous_flight_code = row['航班代碼']
            writer.writerow([str(item) for item in row])

# Example usage: merge the daily snapshots, sort them, and export one CSV.
base_path = '/Users/yuchingchen/Documents/專題/data'
merged_data = load_and_merge_files('1021', '1111', base_path)

output_folder = '/Users/yuchingchen/Documents/專題/cleaned_data'
output_path = f'{output_folder}/tokyo.csv'

if merged_data.empty:
    # Nothing was loaded (no files found, or every row was filtered out).
    # The frame may have no columns at all, in which case sorting by column
    # name would raise KeyError, so skip the export instead.
    grouped_data = merged_data
    print(f"在 {base_path} 找不到可用的資料，未輸出任何檔案")
else:
    # Sort by departure date, then flight code, so each flight's rows are
    # contiguous for the per-flight headers written below.
    grouped_data = merged_data.sort_values(by=['出發日期', '航班代碼']).reset_index(drop=True)

    os.makedirs(output_folder, exist_ok=True)
    save_csv_with_headers(grouped_data, output_path)
    print(f"分組並排序後的資料已輸出到 {output_path}")

分組並排序後的資料已輸出到 /Users/yuchingchen/Documents/專題/cleaned_data/tokyo.csv
