In [42]:
#导入需要的包
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import time
import datetime
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta
from chinese_calendar import is_holiday

In [43]:
#从文件夹中读取csv格式文件并将其转换为DataFrame
def load_data(file_path, person_name):
	'''
    file_path : 文件路径名，即Data或temp

    file_name : 文件名，如：lai.csv
    '''
	df = pd.read_csv("./" + file_path + "/" + person_name+'.csv', encoding='utf-8')
	columns = df.columns
	df.fillna(df.mean(numeric_only=True), inplace=True)
	return df

In [44]:
#截取样本并根据列值去除异常样本
def cut_data(df, column_name, cut_index, abnormal_list):
	'''
    df : DataFrame格式数据

    column_name : 将要作为截取依据的列，如：要去除非用餐样本，可以将“商户名称”作为依据

    cut_index : 截取的下标，如：在lai.csv中一区数据从591开始

    abnormal_list : 异常样本列表
    '''
	cut_df = df[:cut_index]

	# 创建一个空的 DataFrame 用于存储筛选后的数据
	new = pd.DataFrame(columns=df.columns)

	for index, row in cut_df.iterrows():
		if row[column_name] in abnormal_list:
			continue
		else:
			# 将不满足条件的行添加到 filtered_df
			new = pd.concat([new, row.to_frame().T], ignore_index=True)
	return new

In [45]:
#将交易时间这项特征分割开
def split_time(df):
    new=df
    new['交易时间']=pd.to_datetime(df['交易时间'], errors='coerce').dt.strftime('%Y/%m/%d %H:%M')
    temp=new['交易时间'].str.split(' ')
    new['交易日期']=temp.str[0]
    new['具体时间']=temp.str[1]
    return new

In [46]:

#将前后间隔不超过30分钟且刷卡地点相同的数据合并
def merge_data(df):
	'''
    df : DataFrame格式数据
    '''
	#创建一个空的DataFrame或列表来存储处理后的数据。
	processed_data = pd.DataFrame(columns=df.columns)

	prev_row = None

	for index, row in df.iterrows():
		if prev_row is None:
			prev_row = row
		else:
			time_diff = pd.to_datetime(row['交易时间']) - pd.to_datetime(row['交易时间'])
			if (time_diff <= pd.Timedelta(minutes=30)) and (row['商户名称'] == prev_row['商户名称']) and (
					row['学工号'] == prev_row['学工号']):
				prev_row['交易额'] += row['交易额']
			else:
				processed_data = pd.concat([processed_data, prev_row.to_frame().T], ignore_index=True)
				prev_row = row

	# 处理最后一行数据
	processed_data = pd.concat([processed_data, prev_row.to_frame().T], ignore_index=True)
	return processed_data

In [47]:
#添加特征列，列值为时间的小时整点，用于与天气数据连接
def time2int(df):
	'''
    df : DataFrame格式数据
    '''
	# 将交易时间数据保存在一个名为"trade_times"的列表中
	trade_times = df['具体时间']

	processed_times = []

	for time_str in trade_times:
		# 将时间字符串转换为datetime对象
		dt = datetime.strptime(time_str, "%H:%M")
		# 如果分钟大于等于30，则小时进一位，并将分钟置为0
		if dt.minute >= 30:
			dt = dt.replace(hour=dt.hour + 1, minute=0)
		else:
			dt = dt.replace(minute=0)
		# 将处理后的时间添加到列表中
		processed_times.append(dt)

	df['时间'] = processed_times
	return df

In [48]:
# #将星期特征转换为数值，如 ：星期二 -> 2
# def week2num(df):
# 	'''
#     df : DataFrame格式数据
#     '''
# 	week = []
#
# 	for w in df['星期']:
# 		if w == '星期一':
# 			week.append(1)
# 		elif w == '星期二':
# 			week.append(2)
# 		elif w == '星期三':
# 			week.append(3)
# 		elif w == '星期四':
# 			week.append(4)
# 		elif w == '星期五':
# 			week.append(5)
# 		elif w == '星期六':
# 			week.append(6)
# 		else:
# 			week.append(7)
#
# 	df['星期'] = week
# 	return df

In [49]:
#将特殊天象数值化，若为空则为0，否则为1
def weather2num(df):
	'''
    df : DataFrame格式数据
    '''
	w_list = []
	weather = df['特殊天象'].values

	for w in weather:
		if str(w) == 'nan':
			w_list.append(0)
		else:
			w_list.append(1)
	df['特殊天象'] = w_list
	return df

In [50]:
#增添节假日特征
def add_holiday(df):
	'''
    df : DataFrame格式数据
    '''
	holiday = []

	for date in df['交易日期']:
		date = datetime.strptime(date, "%Y/%m/%d").date()
		if is_holiday(date):
			holiday.append(1)
		else:
			holiday.append(0)

	df['节假日'] = holiday
	return df

In [51]:
#增添时间戳特征
def add_timestamp(df):
	'''
    df : DataFrame格式数据
    '''
	df['时间戳'] = df['交易时间']
	df['时间戳'] = df['时间戳'].apply(lambda x: time.mktime(time.strptime(x, '%Y/%m/%d %H:%M')))
	return df

In [52]:
#将原数据与天气数据结合起来
def combine_climate(df, climate):
	'''
    df : DataFrame格式数据

    climate : DataFrame格式数据，包含天气信息
    '''
    
	df['交易日期']=pd.to_datetime(df['交易日期'])

	print(type(df['交易日期'].dt.strftime('%Y/%m/%d')),type(df['时间'][:2]))
	df['日期小时'] = df['交易日期'].dt.strftime('%Y/%m/%d') + ' ' + df['时间'].astype(str).str[:2]
	climate['日期小时'] = pd.to_datetime(climate['date']).dt.strftime('%Y/%m/%d') + ' ' + climate['时间'].astype(str).str[:2]

	merged_df = pd.merge(df, climate, on='日期小时')
	merged_df = merged_df.drop(['海平面大气压','相对湿度','气象站大气压', '姓名', '日期小时', 'date', '时间_x', '时间_y', '重要天象'], axis=1)
	return merged_df

In [53]:
#保存数据为csv文件
def save_data(df, file_path, file_name):
	'''
    df : DataFrame格式数据

    file_path : 文件路径名，即Data或temp

    file_name : 文件名，如：lai.csv
    '''

	df.to_csv(file_path + '/' + file_name+'.csv',encoding='gbk', index=True)

In [54]:
#筛选合格的样本，如去除消费值过低（可能是买水等因素）的数据
def select_data(df):
	'''
    df : DataFrame格式数据

    '''
	# 创建一个时间对象表示 10:00，用于区分上午和其他时段（早餐消费可能偏低）
	threshold_time = pd.to_datetime('10:00', format='%H:%M')

	# 创建一个空的 DataFrame 用于存储筛选后的数据
	filtered_df = pd.DataFrame(columns=df.columns)

	# 迭代 DataFrame 行
	for index, row in df.iterrows():
		if float(row['交易额']) <= 5 and pd.to_datetime(row['交易时间'], format='%H:%M') > threshold_time:
			# 跳过满足条件的行
			continue
		elif float(row['交易额']) <= 2 and pd.to_datetime(row['交易时间'], format='%H:%M') < threshold_time:
			# 跳过满足条件的行
			continue
		else:
			# 将不满足条件的行添加到 filtered_df
			filtered_df = pd.concat([filtered_df, row.to_frame().T], ignore_index=True)
	return filtered_df

In [55]:
# 根据课表信息进行处理
def create_class(file_path, person_name):
    '''
:param file_path: 文件的读取路径
:return: 个人课表
'''

    # 数据读取与拼接
    original_personal_class1 = load_data(file_path+'/'+person_name, '2022秋'+person_name)
    original_personal_class2 = load_data(file_path+'/'+person_name, '2022夏'+person_name)
    original_personal_class3 = load_data(file_path+'/'+person_name, '2023春'+person_name)
    original_personal_class = pd.concat([original_personal_class1, original_personal_class2], copy=True,ignore_index=True)
    original_personal_class = pd.concat([original_personal_class, original_personal_class3], copy=True,ignore_index=True)
    save_data(original_personal_class,'DataSet/课表/'+person_name,person_name)
    # # 时间处理,将其转换为如2023/01/05（空缺补0）的形式
    # original_personal_class['Start Date'] = pd.to_datetime(
    # 	original_personal_class['Start Date'])
    # original_personal_class['Start Date'] = original_personal_class['Start Date'].dt.strftime(
    # 	'%Y/%m/%d')
    # original_personal_class['End Date'] = pd.to_datetime(
    # 	original_personal_class['End Date'])
    # original_personal_class['End Date'] = original_personal_class['End Date'].dt.strftime(
    # 	'%Y/%m/%d')
    original_personal_class['Date']=pd.to_datetime(original_personal_class['Date']).dt.strftime('%Y/%m/%d')
    courseType = ['Noon', 'Afternoon', 'Evening', 'FUCK']
    Begin=datetime(2022,7,13)
    End=datetime(2023,6,16)
    dataList = pd.date_range(
        start='20220713', end='20230616').strftime('%Y/%m/%d').tolist()
    personal_class = pd.DataFrame(0, index=courseType, columns=dataList)

    for index, row in original_personal_class.iterrows():
        Date=pd.to_datetime(row['Date'])
        if Date>End or Date<Begin:
            continue
        if row['End Time'] == '11:45:00':
            personal_class.loc['Noon',row['Date']] = 1
        elif row['End Time'] == '17:25:00':
            personal_class.loc['Afternoon',row['Date']] = 1
        elif row['End Time'] == '22:15:00':
            personal_class.loc['Evening',row['Date']] = 1
        if row['Start Time'] == '08:00:00':
            personal_class.loc['FUCK', row['Date']] = 1
    return personal_class


In [56]:
# 判断当前交易行为与课程间的关系
def combine_class(df, personal_class):

    temp=df
    Noon=[]
    Afternoon=[]
    Evening=[]
    FUCK=[]
    Begin=datetime(2022,7,13)
    End=datetime(2023,6,30)
    for index, row in temp.iterrows():
        if row['交易日期']>End or row['交易日期']<Begin or row['节假日']==1:
            Noon.append(-1)
            Afternoon.append(-1)
            Evening.append(-1)
            FUCK.append(-1)
            continue
        Noon.append(personal_class.loc['Noon',row['交易日期'].strftime('%Y/%m/%d')])
        Afternoon.append(personal_class.loc['Afternoon',row['交易日期'].strftime('%Y/%m/%d')])
        Evening.append(personal_class.loc['Evening',row['交易日期'].strftime('%Y/%m/%d')])
        FUCK.append(personal_class.loc['FUCK',row['交易日期'].strftime('%Y/%m/%d')])
    temp['Mealtime_N']=Noon
    temp['Mealtime_A']=Afternoon
    temp['Mealtime_E']=Evening
    temp['Eight']=FUCK
    return temp


In [57]:
from ics import Calendar
import pandas as pd

def ics_to_csv(ics_file_path, csv_file_path):
    # 从ICS文件中解析日历事件
    with open(ics_file_path, 'r', encoding='utf-8') as ics_file:
        c = Calendar(ics_file.read())

    # 创建一个空的DataFrame来存储日历事件
    events_list = []

    # 遍历每个事件并将其转换为字典
    for event in c.events:
        event_dict = {
            'Summary': event.name,
            'Start Time': event.begin,
            'End Time': event.end,
            # 在这里可以添加其他需要的字段
        }
        st= datetime.fromisoformat(str(event_dict['Start Time']).replace("Z", "+00:00"))
        event_dict['Start Time']=st.strftime('%H:%M:%S')
        ed= datetime.fromisoformat(str(event_dict['End Time']).replace("Z", "+00:00"))
        event_dict['End Time']=ed.strftime('%H:%M:%S')
        event_dict['Date']=st.strftime('%Y/%m/%d')
        events_list.append(event_dict)

    # 将事件列表转换为DataFrame
    df = pd.DataFrame(events_list)

    # 将DataFrame保存为CSV文件
    df.to_csv(csv_file_path, index=False)

def transform_ics(person_name):
    ics_file_path = './DataSet/课表/'+person_name+'/2022夏'+person_name+'.ics'  # 替换为您的ICS文件路径
    csv_file_path = './DataSet/课表/'+person_name+'/2022夏'+person_name+'.csv'  # 替换为输出CSV文件的路径
    ics_to_csv(ics_file_path, csv_file_path)
    ics_file_path = './DataSet/课表/'+person_name+'/2022秋'+person_name+'.ics'  # 替换为您的ICS文件路径
    csv_file_path = './DataSet/课表/'+person_name+'/2022秋'+person_name+'.csv'  # 替换为输出CSV文件的路径
    ics_to_csv(ics_file_path, csv_file_path)
    ics_file_path = './DataSet/课表/'+person_name+'/2023春'+person_name+'.ics'  # 替换为您的ICS文件路径
    csv_file_path = './DataSet/课表/'+person_name+'/2023春'+person_name+'.csv'  # 替换为输出CSV文件的路径
    ics_to_csv(ics_file_path, csv_file_path)
    


In [58]:
#通过更改文件名，处理不同组数据
person_name='lai'
df = load_data('DataSet/Data', person_name)
climate = load_data('DataSet', 'climate')

abnormal_merchant = np.array(['淘乐学苑水果', '学苑食堂霞姐饮品店', '中央红小月亮门店5', '中央红小月亮门店2' \
								 , '中央红小月亮门店4', '中央红小月亮门店6', '中央红小月亮门店3', '中央红小月亮正心楼' \
								 , '中央红-水果', '哈尔滨市南岗区淘乐水果捞店', '中央红小月亮门店1', '中央红-药店' \
								 , '深澜网费对接', '中央红-辣货', '紫丁香餐吧酒水（聚鑫食品）', '回味斋一餐厅酒水组' \
								 , '美芝林快客', '灌制间'])
df = cut_data(df, '商户名称', 405, abnormal_merchant)
df = split_time(df)
df = merge_data(df)
df = time2int(df)
df = add_holiday(df)
df = combine_climate(df, climate)
save_data(df, './DataSet/temp', 'temp1')
df = add_timestamp(df)
df = weather2num(df)
transform_ics(person_name)
personal_class=create_class('DataSet/课表',person_name)
df = combine_class(df,personal_class)

<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


In [59]:
#保存到temp目录下
save_data(df, './DataSet/Final',person_name)