In [3]:
import pandas as pd
import icalendar
import re
import os
from datetime import datetime

In [4]:
def load_data(file_path, encode = 'gbk'):
    """Read a CSV file and fill missing numeric values with column means.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; forwarded to ``pandas.read_csv``.
    encode : str, default 'gbk'
        Text encoding used to decode the file.

    Returns
    -------
    pandas.DataFrame
        The parsed table. NaN cells in numeric columns are replaced by the
        mean of their column; non-numeric columns are left untouched.
    """
    df = pd.read_csv(file_path, encoding=encode)
    # ``mean(numeric_only=True)`` yields one value per numeric column, and
    # ``fillna`` matches those values to columns by label, so text columns
    # keep their NaNs.
    return df.fillna(df.mean(numeric_only=True))

In [5]:
# Base directories and the identifier of the person whose files are processed.
# NOTE(review): `id` shadows the Python builtin; it is kept because the save
# cell at the end of the notebook reads it.
class_file_path = '../temp/final'
data_file_path = 'synth_data'
id = 'chen'

# Timetable CSV paths, one per term, all under <class_file_path>/<id>/.
spring_path, autumn_path, summer_path = (
    os.path.join(class_file_path, id, filename)
    for filename in ('2023_spring.csv', '2022_autumn.csv', '2022_summer.csv')
)

# Path of the base data CSV under <data_file_path>/<id>/.
data_path = os.path.join(data_file_path, id, 'base.csv')

In [6]:
# The base data file is read as UTF-8; the three timetable files fall back to
# load_data's default encoding.
data = load_data(data_path, encode='utf-8')
spring, summer, autumn = (
    load_data(timetable_path)
    for timetable_path in (spring_path, summer_path, autumn_path)
)

In [7]:
def week2num(df):
    """Convert English weekday names to numbers (Monday=1 ... Sunday=7).

    Parameters
    ----------
    df : iterable of str
        Weekday names, e.g. a DataFrame column. Each element must be exactly
        one of 'Monday', 'Tuesday', ..., 'Sunday'.

    Returns
    -------
    list of int
        One number per input element, in input order, so the result can be
        assigned back to the column it was derived from.

    Raises
    ------
    ValueError
        If an element is not a recognised weekday name. Skipping such an
        element would make the result shorter than the input and shift every
        following value onto the wrong row.
    """
    day_numbers = {
        'Monday': 1,
        'Tuesday': 2,
        'Wednesday': 3,
        'Thursday': 4,
        'Friday': 5,
        'Saturday': 6,
        'Sunday': 7,
    }
    week = []
    for position, day in enumerate(df):
        try:
            week.append(day_numbers[day])
        except (KeyError, TypeError):
            # KeyError: unknown value (including NaN); TypeError: unhashable value.
            raise ValueError(
                f"unrecognised weekday {day!r} at position {position}"
            ) from None
    return week

# Convert the weekday names of every timetable first, so no frame is modified
# unless all three conversions succeed.
spring_week, autumn_week, summer_week = (
    week2num(timetable['Weekday']) for timetable in (spring, autumn, summer)
)

# Replace the name columns with their numeric equivalents.
summer['Weekday'] = summer_week
autumn['Weekday'] = autumn_week
spring['Weekday'] = spring_week

In [8]:
# Create the three meal-time class flag columns, all starting at 0.
# Chained assignment binds the targets left to right, so the column order is
# the same as with three separate statements.
data['饭点课程_早'] = data['饭点课程_午'] = data['饭点课程_晚'] = 0

In [11]:
# Stack the three term timetables into one frame with a single concat call.
# The `copy` keyword is omitted: it only restated the historical default and
# is deprecated in recent pandas releases.
class_list = pd.concat([spring, autumn, summer])

# Parse the date strings (YYYY/MM/DD) into datetime values so they can be
# compared as ranges below.
data['交易日期'] = pd.to_datetime(data['交易日期'], format="%Y/%m/%d")
class_list['Start Date'] = pd.to_datetime(class_list['Start Date'], format="%Y/%m/%d")
class_list['End Date'] = pd.to_datetime(class_list['End Date'], format="%Y/%m/%d")

# (timetable column, exact clock string, flag column to set in `data`).
# NOTE(review): the clock strings are compared verbatim, so a differently
# formatted time such as '08:00' would not match — confirm the timetable format.
meal_rules = (
    ('Start Time', '8:00', '饭点课程_早'),
    ('End Time', '11:45', '饭点课程_午'),
    ('End Time', '17:25', '饭点课程_晚'),
)

# For every timetable entry, flag the records in `data` whose date lies inside
# the entry's date range (inclusive) and whose weekday matches.
for _, row_c in class_list.iterrows():
    matched_flags = [
        flag_column
        for time_column, clock, flag_column in meal_rules
        if row_c[time_column] == clock
    ]
    if not matched_flags:
        # This entry touches no flag column, so building the mask is wasted work.
        continue
    mask = (data['交易日期'].between(row_c['Start Date'], row_c['End Date'])) & (data['星期'] == row_c['Weekday'])
    for flag_column in matched_flags:
        data.loc[mask, flag_column] = 1

In [12]:
# Merchant-name -> merchant-code lookup table. It is bound to `merchant_map`
# rather than `map` so the builtin `map()` stays usable in the rest of the
# kernel session.
merchant_map = load_data('../数据处理/map.csv')

# Left-join the lookup table onto the data on '商户名称', keeping every row of
# `data` even when its merchant has no match.
# NOTE(review): if the lookup table lists a merchant name more than once, the
# join duplicates the matching rows — confirm the names are unique.
column_mapping = {
    '交易额': 'amount',
    '商户名称': 'merchant_name',
    '交易日期': 'date',
    '交易时间': 'time',
    '星期': 'weekday',
    '节假日': 'holiday',
    '气温': 'temperature',
    '气象站大气压': 'station_pressure',
    '海平面大气压': 'sea_level_pressure',
    '相对湿度': 'relative_humidity',
    '平均风速': 'windspeed',
    '特殊天象': 'special_weather',
    '时间戳': 'timestamp',
    '饭点课程_早': 'meal_course_breakfast',
    '饭点课程_午': 'meal_course_lunch',
    '饭点课程_晚': 'meal_course_dinner',
    '商户编号': 'merchant_code'
}

# Merge, then rename the columns to English. `rename` returns a new frame, so
# nothing is mutated in place and re-running the cell gives the same result.
df = (
    data
    .merge(merchant_map, on='商户名称', how='left')
    .rename(columns=column_mapping)
)
df

Unnamed: 0,amount,merchant_name,date,time,weekday,holiday,temperature,station_pressure,sea_level_pressure,relative_humidity,windspeed,special_weather,timestamp,meal_course_breakfast,meal_course_lunch,meal_course_dinner,merchant_code
0,10.00,民族餐厅山西饸饹面,2023-04-25,18:15:00,2,0,2,744.0,756.9,100,3,0,1.682446e+09,1,1,1,10
1,3.65,一区饺子园凉菜组,2023-04-25,11:09:00,2,0,9,742.0,754.6,62,8,0,1.682421e+09,1,1,1,46
2,36.00,美食林川蜀源烤鱼,2023-04-25,09:37:00,2,0,12,741.5,753.9,35,6,0,1.682415e+09,1,1,1,1
3,13.00,学苑食堂香恋香拌砂锅,2023-04-24,17:45:00,1,0,13,750.2,762.8,16,3,0,1.682358e+09,0,1,0,9
4,16.60,美食林肠粉,2023-04-24,11:53:00,1,0,14,754.0,766.6,14,7,0,1.682337e+09,0,1,0,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,7.00,学苑食堂副食三组,2022-07-05,18:11:00,2,0,26,741.2,753.1,89,2,0,1.657045e+09,0,1,1,24
212,30.50,阳光川蜀源烤鱼,2022-07-05,11:50:00,2,0,28,742.1,753.9,74,4,0,1.657022e+09,0,1,1,37
213,6.00,学苑食堂副食三组,2022-07-04,11:58:00,1,0,27,741.3,753.1,74,4,0,1.656936e+09,0,0,0,24
214,8.00,阳光广式猪脚饭,2022-07-03,12:12:00,7,1,28,742.1,753.9,70,6,0,1.656850e+09,0,0,0,8


In [13]:
# Write the merged, renamed frame as a UTF-8 CSV without the index column.
# NOTE(review): this path equals `data_path`, the file the notebook loaded at
# the start, so the input is overwritten with renamed columns — confirm that
# this is intended, because a second run would no longer find the original
# column names.
save_name = 'base.csv'
save_path = os.path.join('synth_data', id, save_name)
df.to_csv(path_or_buf=save_path, index=False, encoding='utf-8', sep=',')