In [19]:
import pandas as pd
import numpy as np
import datetime
import time
import random
import os

from chinese_calendar import is_holiday
from datetime import datetime, timedelta

In [22]:
def load_data(file_path, encode = 'gbk'):
    """Read a CSV file and fill missing numeric values with column means.

    Args:
        file_path: Path of the CSV file to read.
        encode: Text encoding handed to ``pandas.read_csv``.

    Returns:
        A DataFrame in which every NaN of a numeric column has been replaced
        by that column's mean. Non-numeric columns are returned unchanged,
        so their missing values stay missing.
    """
    raw = pd.read_csv(file_path, encoding=encode)
    # Only numeric columns have a mean, so only those columns get filled.
    numeric_means = raw.mean(numeric_only=True)
    return raw.fillna(numeric_means)

# Identifier of the data set; it is also the sub-directory name under 'synth_data'.
# NOTE(review): `id` shadows the Python builtin, but the name is reused when the
# result is saved, so it is kept as is here.
id = 'chen'
file_name = 'chen_utf_decode_syn_1.csv'

# Input file: synth_data/<id>/<file_name>, read as UTF-8.
file_path = os.path.join('synth_data', id, file_name)
df = load_data(file_path, encode='utf-8')

In [23]:
# Merge the 'date' and 'time' text columns into a single 'datetime' column,
# order the rows from newest to oldest, and discard the two source columns.
combined_text = df['date'] + ' ' + df['time']
df = (
    df.assign(datetime=pd.to_datetime(combined_text))
      .sort_values(by='datetime', ascending=False)
      .drop(columns=['date', 'time'])
)

In [24]:
# Count the records of each calendar day: one row per day, with the day in
# 'datetime' and the number of records in 'count'.
records_per_day = df.groupby(df['datetime'].dt.date).size()
grouped = records_per_day.reset_index(name='count')

In [25]:
# Thin the data: for every calendar day keep at most one randomly chosen record
# per time slot. Records that fall outside every slot are left untouched.
# The choice is made with the `random` module; call random.seed(...) beforehand
# if the result has to be reproducible.
time_slots = ['05:00-10:00', '10:00-15:00', '15:00-19:00', '19:00-23:00']

hhmm = df['datetime'].dt.strftime('%H:%M')
day = df['datetime'].dt.date

# Assign each row to the FIRST slot whose inclusive [start, end] range contains
# its HH:MM value. Neighbouring slots share their boundary (e.g. 10:00), so a
# plain inclusive test would put such a row into two slots; it could then be
# discarded twice (KeyError on the second drop) or be picked as the survivor of
# one slot after the other slot had already discarded it.
slot_of_row = pd.Series(index=df.index, dtype='object')
for slot in time_slots:
    slot_start, slot_end = slot.split('-')
    newly_assigned = slot_of_row.isna() & hhmm.between(slot_start, slot_end)
    slot_of_row.loc[newly_assigned] = slot

# Collect all labels to discard first and drop them in one pass, instead of
# rebuilding `df` once per (day, slot). Rows without a slot have a missing
# group key and are therefore skipped by groupby.
labels_to_drop = []
for _, slot_data in df.groupby([day, slot_of_row]):
    if len(slot_data) > 1:
        # Pick the single survivor of this (day, slot) group at random.
        index_to_keep = random.choice(slot_data.index)
        labels_to_drop.extend(slot_data.index.drop(index_to_keep))

df = df.drop(index=labels_to_drop)

In [26]:
# Remove records made between 05:00 and 09:00 (both ends inclusive, compared on
# HH:MM) whose amount is greater than 10 (yuan, per the original comment).
# NOTE(review): the original comment named the window "00:00-09:00" while the
# code has always used 05:00 as the lower bound - confirm which one is intended.
hhmm = df['datetime'].dt.strftime('%H:%M')
time_slot = hhmm.between('05:00', '09:00')
amount_greater = df['amount'] > 10

# Keep a row unless it is both inside the window and above the threshold.
keep_row = ~time_slot | ~amount_greater
filtered_df = df.loc[keep_row].copy()

In [27]:
# Split 'datetime' back into a calendar 'date' and a clock 'time' column,
# then remove the combined column.
moments = pd.to_datetime(filtered_df['datetime'])
filtered_df['date'] = moments.dt.date
filtered_df['time'] = moments.dt.time
filtered_df = filtered_df.drop(columns='datetime')

In [28]:
# Add the 'week' column: day of the week numbered 1 (Monday) to 7 (Sunday).
# 'date' is converted to datetime64 first so that the .dt accessor is available.
filtered_df['date'] = pd.to_datetime(filtered_df['date'])
day_of_week = filtered_df['date'].dt.dayofweek  # 0 = Monday ... 6 = Sunday
filtered_df['week'] = day_of_week + 1

In [29]:
# Flag every record: 1 when chinese_calendar.is_holiday() is truthy for its
# date, otherwise 0.
# NOTE(review): presumably weekends count as holidays too (a Sunday row in the
# notebook output is flagged 1) - confirm against the chinese_calendar docs.
holiday = [1 if is_holiday(day) else 0 for day in filtered_df['date']]

filtered_df['holiday'] = holiday

In [30]:
def _nearest_hour_label(time_str):
    """Round an 'HH:MM:SS' string to the nearest full hour, as 'HH:00'.

    Minutes below 30 round down and minutes of 30 or more round up. Rounding
    up from hour 23 would need hour 24, which is not a valid clock hour (the
    previous ``replace(hour=hour + 1)`` raised ValueError there), so the result
    is capped at '23:00'. Capping keeps the label on the same calendar day;
    the label is later paired with the record's unchanged date.

    Args:
        time_str: Time of day formatted as '%H:%M:%S'.

    Returns:
        The rounded time formatted as 'HH:00'.

    Raises:
        ValueError: If ``time_str`` does not match '%H:%M:%S'.
    """
    parsed = datetime.strptime(time_str, "%H:%M:%S")
    hour = parsed.hour + 1 if parsed.minute >= 30 else parsed.hour
    return "%02d:00" % min(hour, 23)


# Text form of each record's time, as expected by the helper above.
trade_times = filtered_df['time'].astype(str)

processed_times = [_nearest_hour_label(time_str) for time_str in trade_times]

# Rounded time of each record; used to build the key for the climate join.
filtered_df['时间_1'] = processed_times

In [31]:
# Load the climate observations (GBK-encoded CSV) and left-pad the time text
# with zeros to a width of 5 characters.
climate = pd.read_csv('../temp/climate.csv', encoding='gbk')
climate['时间'] = climate['时间'].str.zfill(5)

# Join key on the transaction side: "<date> <first two characters of 时间_1>".
date_text = filtered_df['date'].astype(str)
hour_text = filtered_df['时间_1'].astype(str).str[:2]
filtered_df['datetime'] = date_text + ' ' + hour_text

In [32]:
# Build the same "<date> <first two characters of the time>" key on the
# climate side, then rename 'date' to '日期'.
climate['date'] = pd.to_datetime(climate['date'])
climate_date_text = climate['date'].astype(str)
climate_hour_text = climate['时间'].str[:2]
climate['datetime'] = climate_date_text + ' ' + climate_hour_text
climate = climate.rename(columns={'date': '日期'})

In [33]:
# Attach the climate observation that shares the record's 'datetime' key.
# The default (inner) join is used, so records without a matching observation
# are not carried over. The key and the climate bookkeeping columns are then
# removed.
merged_df = (
    filtered_df
    .merge(climate, on='datetime')
    .drop(columns=['datetime','日期','时间','重要天象','时间_1'])
)

In [34]:
# Recombine 'date' and 'time' into one moment per record and store it in
# 'timestamp' as the float returned by Timestamp.timestamp().
date_text = merged_df['date'].astype(str)
time_text = merged_df['time'].astype(str)
moments = pd.to_datetime(date_text + ' ' + time_text)

merged_df['timestamp'] = moments.apply(lambda moment: moment.timestamp())

In [35]:
# Turn '特殊天象' into a 0/1 flag: 0 where the value's text form is 'nan'
# (i.e. the cell was empty), 1 for anything else.
weather = merged_df['特殊天象'].values
w_list = [0 if str(w) == 'nan' else 1 for w in weather]

merged_df['特殊天象'] = w_list

In [36]:
# English working column names -> Chinese column names used in the output.
column_mapping = dict([
    ('amount', '交易额'),
    ('merchant_name', '商户名称'),
    ('date', '交易日期'),
    ('time', '交易时间'),
    ('week', '星期'),
    ('holiday', '节假日'),
    ('timestamp', '时间戳'),
])

# Apply the mapping; columns that are not listed keep their names.
merged_df = merged_df.rename(columns=column_mapping)

In [37]:
# Store the transaction date as text in YYYY/MM/DD form, then display the frame.
trade_dates = pd.to_datetime(merged_df['交易日期'])
merged_df['交易日期'] = trade_dates.dt.strftime("%Y/%m/%d")
merged_df

Unnamed: 0,交易额,商户名称,交易日期,交易时间,星期,节假日,气温,气象站大气压,海平面大气压,相对湿度,平均风速,特殊天象,时间戳
0,10.00,民族餐厅山西饸饹面,2023/04/25,18:15:00,2,0,2,744.0,756.9,100,3,0,1.682446e+09
1,3.65,一区饺子园凉菜组,2023/04/25,11:09:00,2,0,9,742.0,754.6,62,8,0,1.682421e+09
2,36.00,美食林川蜀源烤鱼,2023/04/25,09:37:00,2,0,12,741.5,753.9,35,6,0,1.682415e+09
3,13.00,学苑食堂香恋香拌砂锅,2023/04/24,17:45:00,1,0,13,750.2,762.8,16,3,0,1.682358e+09
4,16.60,美食林肠粉,2023/04/24,11:53:00,1,0,14,754.0,766.6,14,7,0,1.682337e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,7.00,学苑食堂副食三组,2022/07/05,18:11:00,2,0,26,741.2,753.1,89,2,0,1.657045e+09
212,30.50,阳光川蜀源烤鱼,2022/07/05,11:50:00,2,0,28,742.1,753.9,74,4,0,1.657022e+09
213,6.00,学苑食堂副食三组,2022/07/04,11:58:00,1,0,27,741.3,753.1,74,4,0,1.656936e+09
214,8.00,阳光广式猪脚饭,2022/07/03,12:12:00,7,1,28,742.1,753.9,70,6,0,1.656850e+09


In [38]:
# Write the result next to the input data: synth_data/<id>/base.csv,
# comma-separated, UTF-8, without the DataFrame index.
save_name = 'base.csv'
save_path = os.path.join('synth_data', id, save_name)
merged_df.to_csv(
    save_path,
    sep=',',
    encoding='utf-8',
    index=False,
)