In [1]:
#导入必要的包
import pandas as pd
import numpy as np
import datetime
import icalendar
import re
import time
import random
import os
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns

from chinese_calendar import is_holiday
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [2]:
def load_data(file_path, encode = 'gbk'):
    """Read a CSV file into a DataFrame and fill numeric gaps with column means.

    Args:
        file_path: Path of the CSV file to read.
        encode: Text encoding handed to ``pandas.read_csv``; defaults to 'gbk'.

    Returns:
        The loaded DataFrame. Missing values in numeric columns are replaced
        by that column's mean; non-numeric columns are left untouched.
    """
    df = pd.read_csv(file_path, encoding=encode)
    # mean(numeric_only=True) has one entry per numeric column, so fillna only
    # fills those columns and leaves text columns' NaN values in place.
    return df.fillna(df.mean(numeric_only=True))

In [3]:
def remove_multi(df):
    """Thin generated meal records to at most one per time slot per day.

    The 'date' and 'time' string columns are merged into a single 'datetime'
    column (the two source columns are dropped) and rows are sorted by it in
    descending order. For each calendar day and each of the slots
    '05:00-10:00', '10:00-15:00', '15:00-19:00' and '19:00-23:00' (both bounds
    inclusive, compared at minute resolution) one randomly chosen row is kept
    and the others are dropped. Rows outside every slot are left untouched.
    Finally, rows between 05:00 and 09:00 whose 'amount' exceeds 10 are
    removed.

    Args:
        df: DataFrame with string columns 'date' and 'time' and a numeric
            'amount' column. It is not modified. Index labels are assumed to
            be unique (as produced by ``pandas.read_csv``).

    Returns:
        A new DataFrame holding the surviving rows, with 'datetime' in place
        of 'date' and 'time'.
    """
    time_slots = ['05:00-10:00', '10:00-15:00', '15:00-19:00', '19:00-23:00']

    # Work on a copy so the caller's frame does not silently gain a column.
    records = df.copy()
    records['datetime'] = pd.to_datetime(records['date'] + ' ' + records['time'])
    records = records.sort_values(by='datetime', ascending=False).drop(['date', 'time'], axis=1)

    for slot in time_slots:
        slot_start, slot_end = slot.split('-')

        # Recomputed for every slot from the rows that are still present.
        # Slot bounds are inclusive, so a row at e.g. 10:00 belongs to two
        # slots; selecting from a stale snapshot would try to drop a row that
        # an earlier slot already removed and raise KeyError.
        clock = records['datetime'].dt.strftime('%H:%M')
        in_slot = records[clock.between(slot_start, slot_end)]

        surplus = []
        for _, day_rows in in_slot.groupby(in_slot['datetime'].dt.date):
            labels = day_rows.index.tolist()
            if len(labels) > 1:
                keeper = random.choice(labels)
                surplus.extend(label for label in labels if label != keeper)
        records = records.drop(index=surplus)

    # Drop early-morning (05:00-09:00) rows with an amount above 10.
    clock = records['datetime'].dt.strftime('%H:%M')
    early_morning = clock.between('05:00', '09:00')
    over_limit = records['amount'] > 10
    return records[~(early_morning & over_limit)]

In [4]:
def add_col(df):
    """Derive date, time, weekday and holiday columns from 'datetime'.

    Args:
        df: DataFrame with a 'datetime' column that ``pandas.to_datetime``
            can parse. It is not modified.

    Returns:
        A copy of ``df`` in which 'datetime' is parsed and these columns are
        set: 'date' (the day at midnight, as a datetime), 'time' (time of
        day), 'week' (1 = Monday ... 7 = Sunday) and 'holiday' (1 when
        ``is_holiday`` is truthy for the date, otherwise 0).
    """
    enriched = df.copy()

    moments = pd.to_datetime(enriched['datetime'])
    enriched['datetime'] = moments

    # Go through plain date objects to strip the time-of-day part, then back
    # to datetimes so the .dt accessor is available on the column.
    enriched['date'] = pd.to_datetime(moments.dt.date)
    enriched['time'] = moments.dt.time

    # weekday() is 0-based starting on Monday; shift to 1..7.
    enriched['week'] = enriched['date'].dt.weekday + 1

    enriched['holiday'] = [1 if is_holiday(day) else 0 for day in enriched['date']]
    return enriched

In [5]:
def add_weather(df, climate_path='../temp/climate.csv'):
    """Attach hourly climate readings to each transaction.

    Every transaction time is rounded to the nearest full hour (minutes >= 30
    round up) and matched against the climate table on "date + hour". A time
    from 23:30 onward rounds up to hour 00 of the following day, so it is
    matched against the next day's first reading.

    Args:
        df: DataFrame with a 'date' column (anything ``pandas.to_datetime``
            accepts) and a 'time' column whose string form is 'HH:MM:SS'.
            It is not modified.
        climate_path: GBK-encoded CSV with at least the columns 'date', '时间'
            (such as '8:00'), '重要天象' and '特殊天象'.

    Returns:
        A new DataFrame with the climate columns merged in. The helper columns
        'datetime', '日期', '时间' and '重要天象' are removed, and '特殊天象'
        becomes 0 where it was missing and 1 otherwise. The merge is an inner
        join, so transactions without a climate reading for their hour are
        dropped.
    """
    filtered_df = df.copy()

    # Round to the nearest hour with date arithmetic instead of
    # datetime.replace(hour=hour + 1), which raises ValueError at 23:30+.
    day = pd.to_datetime(filtered_df['date']).dt.normalize()
    clock = pd.to_timedelta(filtered_df['time'].astype(str))
    moment = day + clock
    rounded_hours = moment.dt.hour + (moment.dt.minute >= 30).astype(int)
    nearest_hour = day + pd.to_timedelta(rounded_hours, unit='h')
    filtered_df['datetime'] = nearest_hour.dt.strftime('%Y-%m-%d %H')

    # Load the climate table and build the same "YYYY-MM-DD HH" join key.
    climate = pd.read_csv(climate_path, encoding='gbk')
    climate['时间'] = climate['时间'].str.zfill(5)  # '8:00' -> '08:00'
    climate['date'] = pd.to_datetime(climate['date'])
    climate['datetime'] = climate['date'].dt.strftime('%Y-%m-%d') + ' ' + climate['时间'].str[:2]
    # Renamed so it does not collide with the transactions' own 'date' column.
    climate = climate.rename(columns={'date': '日期'})

    filtered_df = pd.merge(filtered_df, climate, on='datetime')
    filtered_df = filtered_df.drop(['datetime', '日期', '时间', '重要天象'], axis=1)

    # Collapse the special-weather description into a 0/1 flag.
    filtered_df['特殊天象'] = [0 if str(w) == 'nan' else 1 for w in filtered_df['特殊天象'].values]
    return filtered_df

In [6]:
def transform_time(df):
    """Add a float timestamp column and switch to Chinese column names.

    Args:
        df: DataFrame with 'date' and 'time' columns whose string forms join
            into a parseable datetime. It is not modified.

    Returns:
        A copy of ``df`` with a 'timestamp' column (``Timestamp.timestamp()``
        of date + time), the columns amount / merchant_name / date / time /
        week / holiday / timestamp renamed to their Chinese labels, and
        '交易日期' formatted as a '%Y/%m/%d' string.
    """
    chinese_labels = {
        'amount': '交易额',
        'merchant_name': '商户名称',
        'date': '交易日期',
        'time': '交易时间',
        'week': '星期',
        'holiday': '节假日',
        'timestamp': '时间戳'
    }

    frame = df.copy()

    # Join date and time as text, parse once, then convert to epoch seconds.
    joined = frame['date'].astype(str) + ' ' + frame['time'].astype(str)
    frame['timestamp'] = pd.to_datetime(joined).apply(lambda moment: moment.timestamp())

    frame = frame.rename(columns=chinese_labels)

    frame['交易日期'] = pd.to_datetime(frame['交易日期']).dt.strftime("%Y/%m/%d")
    return frame

In [7]:
def add_course(df, id, course_folder='../temp/final'):
    """Flag transactions that fall on days with classes around meal times.

    Timetables are read from every CSV in ``<course_folder>/<id>`` whose file
    name starts with a digit. A transaction is flagged when its date lies
    inside a course's [Start Date, End Date] range and its weekday matches
    the course's weekday:

    * '饭点课程_早' = 1 if the course starts at '8:00' / '08:00'
    * '饭点课程_午' = 1 if the course ends at '11:45'
    * '饭点课程_晚' = 1 if the course ends at '17:25' / '17:30'

    Args:
        df: DataFrame with '交易日期' ('%Y/%m/%d' strings) and '星期'
            (1 = Monday ... 7 = Sunday). It is not modified.
        id: Category of the generated data; selects the timetable sub-folder.
        course_folder: Base folder that holds one sub-folder per category.

    Returns:
        A copy of ``df`` with '交易日期' parsed to datetimes and the three
        0/1 flag columns added. When no timetable file is found, all flags
        stay 0.
    """
    weekday_numbers = {
        'Monday': 1, 'Tuesday': 2, 'Wednesday': 3, 'Thursday': 4,
        'Friday': 5, 'Saturday': 6, 'Sunday': 7,
    }

    filtered_df = df.copy()
    filtered_df['交易日期'] = pd.to_datetime(filtered_df['交易日期'], format="%Y/%m/%d")
    for flag_column in ('饭点课程_早', '饭点课程_午', '饭点课程_晚'):
        filtered_df[flag_column] = 0

    # Read every timetable first and concatenate once.
    timetable_dir = os.path.join(course_folder, id)
    timetables = [
        load_data(os.path.join(timetable_dir, coursename))
        for coursename in os.listdir(timetable_dir)
        if coursename.endswith(".csv") and coursename[0].isdigit()
    ]
    if not timetables:
        return filtered_df
    course_df = pd.concat(timetables, ignore_index=True)

    # Unknown weekday names map to NaN, which never equals a '星期' value, so
    # such courses are ignored instead of breaking the column assignment.
    course_df['Weekday'] = course_df['Weekday'].map(weekday_numbers)
    course_df['Start Date'] = pd.to_datetime(course_df['Start Date'], format="%Y-%m-%d")
    course_df['End Date'] = pd.to_datetime(course_df['End Date'], format="%Y-%m-%d")

    for _, course in course_df.iterrows():
        in_term = filtered_df['交易日期'].between(course['Start Date'], course['End Date'])
        mask = in_term & (filtered_df['星期'] == course['Weekday'])
        if course['Start Time'] in ('8:00', '08:00'):
            filtered_df.loc[mask, '饭点课程_早'] = 1
        if course['End Time'] == '11:45':
            filtered_df.loc[mask, '饭点课程_午'] = 1
        if course['End Time'] in ('17:25', '17:30'):
            filtered_df.loc[mask, '饭点课程_晚'] = 1

    return filtered_df

In [8]:
def transform_merchant(df):
    """Attach merchant codes and translate all column names to English.

    The name-to-code table is loaded from '../数据处理/map.csv' via
    ``load_data`` and left-joined on '商户名称', so merchants missing from the
    table keep their row and receive no code.

    Args:
        df: DataFrame containing a '商户名称' column. It is not modified.

    Returns:
        A new DataFrame with the lookup table's columns merged in and the
        known Chinese column names replaced by their English equivalents.
    """
    english_labels = {
        '交易额': 'amount',
        '商户名称': 'merchant_name',
        '交易日期': 'date',
        '交易时间': 'time',
        '星期': 'weekday',
        '节假日': 'holiday',
        '气温': 'temperature',
        '气象站大气压': 'station_pressure',
        '海平面大气压': 'sea_level_pressure',
        '相对湿度': 'relative_humidity',
        '平均风速': 'windspeed',
        '特殊天象': 'special_weather',
        '时间戳': 'timestamp',
        '饭点课程_早': 'meal_course_breakfast',
        '饭点课程_午': 'meal_course_lunch',
        '饭点课程_晚': 'meal_course_dinner',
        '商户编号': 'merchant_code'
    }

    merchant_lookup = load_data('../数据处理/map.csv')

    # Left join: every transaction survives even without a code.
    coded = df.copy().merge(merchant_lookup, on='商户名称', how='left')
    return coded.rename(columns=english_labels)

In [9]:
def draw_plot(df, id, output_folder, file_name, real_folder='../temp/final'):
    """Min-max normalise the data and save a real-vs-synthetic line plot.

    The real data set is loaded from ``<real_folder>/<id>/final(english).csv``.
    Real and synthetic data are each scaled with their own ``MinMaxScaler``,
    and the first 20 rows of eight columns are drawn as line charts (one
    panel per column) into ``<output_folder>/img/<file_name>.png``.

    Args:
        df: Synthetic data with English column names. It is not modified.
        id: Category of the generated data; selects the real-data sub-folder.
        output_folder: Folder under which the 'img' directory is created.
        file_name: Name of the PNG file, without extension.
        real_folder: Base folder holding one sub-folder per category.

    Returns:
        A copy of ``df`` whose amount, weather and timestamp columns have been
        min-max normalised. Callers receive scaled values, not the originals.
    """
    synth_df = df.copy()

    real_df = load_data(os.path.join(real_folder, id, 'final(english).csv'))

    columns_to_normalize = ['amount', 'temperature', 'station_pressure', 'sea_level_pressure', 'relative_humidity', 'windspeed',
                           'timestamp']

    # Each data set gets its own scaler, so both end up spanning [0, 1].
    real_df[columns_to_normalize] = MinMaxScaler().fit_transform(real_df[columns_to_normalize])
    synth_df[columns_to_normalize] = MinMaxScaler().fit_transform(synth_df[columns_to_normalize])

    cols = ["amount", "temperature", "station_pressure", "sea_level_pressure",
            "relative_humidity", "windspeed","special_weather", "timestamp"
    ]

    # Compare row-by-row by position: resetting the index prevents pandas from
    # aligning the two series on unrelated index labels.
    real_preview = real_df[cols].head(20).reset_index(drop=True)
    synth_preview = synth_df[cols].head(20).reset_index(drop=True)

    fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(15, 10))
    for ax, col in zip(axes.flatten(), cols):
        panel = pd.DataFrame({'Real': real_preview[col],
                              'Synthetic': synth_preview[col]})
        # Both series share one y-axis. The former secondary_y='Synthetic data'
        # argument matched no column and therefore had no effect.
        panel.plot(ax=ax, title=col, style=['-', '--'])
    fig.tight_layout()

    image_dir = os.path.join(output_folder, 'img')
    os.makedirs(image_dir, exist_ok=True)
    fig.savefig(os.path.join(image_dir, f"{file_name}.png"), dpi=200)

    # Close this figure explicitly so repeated calls do not accumulate figures.
    plt.close(fig)

    return synth_df

In [10]:
def TimeSeries(dataset, start_index, history_size, end_index, step,
               target_size):
    """Cut a DataFrame into overlapping sliding windows.

    Args:
        dataset: Feature data; rows are selected positionally with ``iloc``.
        start_index: Position of the first row of the first window.
        history_size: Span of each window in rows.
        end_index: Exclusive upper bound for the window end position; ``None``
            means ``len(dataset) - target_size``.
        step: Stride between the rows taken inside a window.
        target_size: Number of trailing rows kept out of the windows when
            ``end_index`` is ``None``.

    Returns:
        ``np.array`` built from the list of windows; window ``k`` holds rows
        ``range(start_index + k, start_index + k + history_size, step)``.
    """
    first_end = start_index + history_size
    last_end = len(dataset) - target_size if end_index is None else end_index

    # The window end advances one row at a time; inside a window every
    # `step`-th row is taken.
    windows = [
        dataset.iloc[range(window_end - history_size, window_end, step)]
        for window_end in range(first_end, last_end)
    ]
    return np.array(windows)

In [11]:
def PCA_tSNE(df, id, output_folder, file_name, real_folder='../temp/final'):
    """Save a PCA / t-SNE scatter comparison of real and synthetic data.

    The real data set is loaded from ``<real_folder>/<id>/final(english).csv``
    and min-max normalised here. The synthetic data is used as passed in;
    presumably the caller hands over already-normalised data (the driver
    script passes the output of ``draw_plot``) - confirm for other callers.
    Both sets are cut to the same number of rows by random sampling, split
    into sliding windows, reduced to two dimensions with PCA (fitted on the
    real data only) and t-SNE (fitted on both), and drawn side by side into
    ``<output_folder>/img/PCA_<file_name>.png``.

    Args:
        df: Synthetic data with English column names. It is not modified.
        id: Category of the generated data; selects the real-data sub-folder.
        output_folder: Folder under which the 'img' directory is created.
        file_name: Base name of the PNG file (saved with a 'PCA_' prefix).
        real_folder: Base folder holding one sub-folder per category.

    Returns:
        None.
    """
    synth_df = df.copy()

    real_df = load_data(os.path.join(real_folder, id, 'final(english).csv'))

    # Features fed into the dimensionality reduction.
    feature_to_analyse = ["amount", "weekday", "temperature", "station_pressure",
                          "sea_level_pressure", "relative_humidity", "windspeed",
                          "timestamp","merchant_code"
    ]

    columns_to_normalize = ['amount', 'temperature', 'station_pressure', 'sea_level_pressure', 'relative_humidity', 'windspeed',
                           'timestamp']

    real_df[columns_to_normalize] = MinMaxScaler().fit_transform(real_df[columns_to_normalize])

    real_data = real_df[feature_to_analyse]
    synth_data = synth_df[feature_to_analyse]

    # Randomly down-sample the larger set so both have the same row count.
    if len(real_data) > len(synth_data):
        length = len(synth_data)
        real_data = real_data.sample(n=length, replace=False)
    else:
        length = len(real_data)
        synth_data = synth_data.sample(n=length, replace=False)

    history_size = 8
    target_size = 0
    step = 1

    real_windows = TimeSeries(dataset=real_data, start_index=0, history_size=history_size, end_index=length,
                              step=step, target_size=target_size)
    synth_windows = TimeSeries(dataset=synth_data, start_index=0, history_size=history_size, end_index=length,
                               step=step, target_size=target_size)

    # NOTE(review): the windows are (n_windows, 8, 9) arrays; reshaping them to
    # (-1, 8) groups values across feature boundaries. Confirm that this
    # layout is the one intended for the comparison.
    real_data_reduced = real_windows.reshape(-1, history_size)
    synth_data_reduced = synth_windows.reshape(-1, history_size)

    real_rows = real_data_reduced.shape[0]

    n_components = 2
    pca = PCA(n_components=n_components)
    # NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
    # confirm against the installed version.
    tsne = TSNE(n_components=n_components, n_iter=300)

    # PCA axes come from the real data only; synthetic data is projected on them.
    pca.fit(real_data_reduced)
    pca_real = pca.transform(real_data_reduced)
    pca_synth = pca.transform(synth_data_reduced)

    # t-SNE has no separate transform step, so it is fitted on both sets
    # stacked together (real rows first).
    tsne_results = tsne.fit_transform(
        np.concatenate((real_data_reduced, synth_data_reduced), axis=0))

    fig = plt.figure(constrained_layout=True, figsize=(20, 10))
    spec = gridspec.GridSpec(ncols=2, nrows=1, figure=fig)

    # Left panel: PCA scatter.
    ax = fig.add_subplot(spec[0, 0])
    ax.set_title('PCA results',
                 fontsize=20,
                 color='red',
                 pad=10)
    ax.scatter(pca_real[:, 0], pca_real[:, 1],
               c='black', alpha=0.5, label='Original', s=100)
    ax.scatter(pca_synth[:, 0], pca_synth[:, 1],
               c='red', alpha=0.5, label='Synthetic', s=100)
    ax.set_xlim(-1.5, 1.5)
    ax.set_ylim(-1.5, 1.5)
    ax.legend()

    # Right panel: t-SNE scatter; the first `real_rows` rows are the real data.
    ax2 = fig.add_subplot(spec[0, 1])
    ax2.set_title('TSNE results',
                  fontsize=20,
                  color='red',
                  pad=10)
    ax2.scatter(tsne_results[:real_rows, 0], tsne_results[:real_rows, 1],
                c='black', alpha=0.5, label='Original')
    ax2.scatter(tsne_results[real_rows:, 0], tsne_results[real_rows:, 1],
                c='red', alpha=0.5, label='Synthetic')
    ax2.legend()

    fig.suptitle('Validating synthetic vs real data diversity and distributions',
                 fontsize=16,
                 color='grey')

    image_dir = os.path.join(output_folder, 'img')
    os.makedirs(image_dir, exist_ok=True)
    fig.savefig(os.path.join(image_dir, f"PCA_{file_name}.png"), dpi=200)

    # Close this figure explicitly so repeated calls do not accumulate figures.
    plt.close(fig)

In [13]:
# Category of generated data to process.
id = 'lu'

# Folder holding the generated (synthetic) CSV files.
input_folder = os.path.join('synth_data', id)

# Folder receiving the processed CSV files and the 'img' sub-folder.
output_folder = os.path.join('process_data', id)

# Create the output folder up front instead of relying on draw_plot creating
# '<output_folder>/img' as a side effect before to_csv runs.
os.makedirs(output_folder, exist_ok=True)

# NOTE(review): remove_multi and PCA_tSNE draw random samples and no seed is
# set, so results differ between runs.

# Process every CSV file; sorted() makes the processing order deterministic.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith(".csv"):
        continue

    filepath = os.path.join(input_folder, filename)
    df = load_data(filepath, encode='utf-8')
    # File name without extension, used to name the saved figures.
    file_name = os.path.splitext(filename)[0]

    filtered_df = remove_multi(df)
    filtered_df = add_col(filtered_df)
    filtered_df = add_weather(filtered_df)
    filtered_df = transform_time(filtered_df)
    filtered_df = add_course(filtered_df, id)
    filtered_df = transform_merchant(filtered_df)
    # draw_plot returns the frame with min-max normalised columns, so both
    # PCA_tSNE and the saved CSV below work on normalised values.
    filtered_df = draw_plot(filtered_df, id, output_folder, file_name)
    PCA_tSNE(filtered_df, id, output_folder, file_name)

    save_path = os.path.join(output_folder, filename)
    filtered_df.to_csv(save_path , sep= ',', encoding='utf-8',index=False)