In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the competition tables in one pass; the community codes (01-20) are
# consistent across all of them.
data_hour, sub, test, weather = (
    pd.read_csv(csv_path)
    for csv_path in (
        './hourly_dataset.csv',     # hourly flow data set covering 20 communities
        './sample_submission.csv',  # submission template
        './test_public.csv',        # hourly test set: 20 communities x 4 non-contiguous weeks, i.e. a 672 (hours) x 20 (communities) matrix to submit
        './weather.csv',            # Shenzhen weather data; the test-period part is assumed unknown
    )
)

def time_transfer(df):
    """Parse the ``time`` column as datetimes and return the rows ordered by it.

    The ``time`` column of the frame passed in is overwritten with its parsed
    values; the returned frame is the sorted result and keeps the original
    index labels.
    """
    parsed_time = pd.to_datetime(df['time'])
    df['time'] = parsed_time
    return df.sort_values(by=['time'])

# Parse the timestamps and put both tables in chronological order.
data_hour = time_transfer(data_hour)
weather = time_transfer(weather)
# data_hour = pd.merge(data_hour,weather,on=['time','train or test'],how='left')
# Clamp negative flow readings to zero.
# A single .loc assignment is used instead of `frame[col][mask] = 0`: chained
# assignment writes to a temporary object under pandas Copy-on-Write, so the
# frame would silently stay unchanged.
for i in range(20):
    flow_col = f'flow_{i+1}'
    data_hour.loc[data_hour[flow_col] < 0, flow_col] = 0

In [3]:
# Reload every table from disk. Because data_hour is re-read here, the
# zero-clamped frame produced by the previous cell is discarded and negative
# readings are treated as missing values instead.
data_hour = pd.read_csv('./hourly_dataset.csv')      # hourly flow data set covering 20 communities (01-20); community codes are consistent across tables
sub       = pd.read_csv('./sample_submission.csv')   # submission template
test      = pd.read_csv('./test_public.csv')         # hourly test set: 20 communities x 4 non-contiguous weeks, i.e. a 672 (hours) x 20 (communities) matrix to submit
weather   = pd.read_csv('./weather.csv')             # Shenzhen weather data; the test-period part is assumed unknown

# Parse the timestamps and put both tables in chronological order.
data_hour = time_transfer(data_hour)
weather = time_transfer(weather)

# data_hour = pd.merge(data_hour,weather,on=['time','train or test'],how='left')
# Mark negative flow readings as missing (NaN).
# Series.mask + column assignment replaces `frame[col][mask] = np.nan`: chained
# assignment writes to a temporary object under pandas Copy-on-Write, so the
# frame would silently keep its negative values.
for i in range(20):
    flow_col = f'flow_{i+1}'
    data_hour[flow_col] = data_hour[flow_col].mask(data_hour[flow_col] < 0)



In [4]:
# Collect the first and the last timestamp of every 'train or test' group in the
# test table and merge them into one ascending list. The slicing below treats
# consecutive entries as (start, end) pairs of test periods 1-4.
time_by_period = test.groupby('train or test')['time']
test_list1 = time_by_period.first().reset_index()['time'].values.tolist()
test_list2 = time_by_period.last().reset_index()['time'].values.tolist()
test_list1 = sorted(test_list1 + test_list2)

hour_time = data_hour['time']


def _rows_where(mask):
    """Return the rows of data_hour selected by ``mask`` with a fresh 0..n-1 index."""
    return data_hour[mask].reset_index(drop=True)


# train1: everything strictly before test period 1; test1: period 1 (bounds inclusive)
train1_ = _rows_where(hour_time < test_list1[0])
test1_ = _rows_where(hour_time.between(test_list1[0], test_list1[1]))
# train2: strictly between test periods 1 and 2; test2: period 2
train2_ = _rows_where((hour_time > test_list1[1]) & (hour_time < test_list1[2]))
test2_ = _rows_where(hour_time.between(test_list1[2], test_list1[3]))
# train3: strictly between test periods 2 and 3; test3: period 3
train3_ = _rows_where((hour_time > test_list1[3]) & (hour_time < test_list1[4]))
test3_ = _rows_where(hour_time.between(test_list1[4], test_list1[5]))
# train4: strictly between test periods 3 and 4; test4: period 4
train4_ = _rows_where((hour_time > test_list1[5]) & (hour_time < test_list1[6]))
test4_ = _rows_where(hour_time.between(test_list1[6], test_list1[7]))

In [5]:
def preprocess_single_flow(train, flow):
    """Fill the missing values of one flow column in place and return that column.

    Steps:
      1. A missing first value is replaced by the column mean; a missing last
         value is replaced by 0, so every run of NaNs is enclosed by real values.
      2. For every run of NaNs, the value right before the run is divided by
         (run length + 1) and that quotient is written to the preceding row and
         to every row of the run, i.e. the reading is spread evenly over the gap.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame holding the column. Rows are addressed by the labels 0..len-1, so
        the frame needs a default integer index (callers in this notebook use
        ``reset_index(drop=True)``). It is modified in place.
    flow : str
        Name of the column to process.

    Returns
    -------
    pandas.Series
        ``train[flow]`` after filling.
    """
    if len(train) == 0:
        # Nothing to fill; label 0 does not exist in an empty frame.
        return train[flow]

    last_row = len(train) - 1
    if np.isnan(train.loc[0, flow]):
        train.loc[0, flow] = train.loc[:, flow].mean()
    # If the last value is NaN, mark it as present (set to 0) so the final gap is closed.
    if np.isnan(train.loc[last_row, flow]):
        train.loc[last_row, flow] = 0

    # 1 where the value is missing, 0 otherwise; the diff is +1 on the first row
    # of each NaN run and -1 on the first valid row after it.
    gap_step = train[flow].isnull().astype(int).diff()
    begin_index = gap_step[gap_step == 1].index.values.tolist()
    end_index = gap_step[gap_step == -1].index.values.tolist()

    for begin, end in zip(begin_index, end_index):
        fill_null = train.loc[begin - 1, flow] / (end - begin + 1)
        # Spread the preceding reading evenly over itself and the gap. A single
        # .loc call is required: `train[flow].loc[...] = x` is chained assignment
        # and leaves `train` untouched under pandas Copy-on-Write.
        train.loc[begin - 1:end - 1, flow] = fill_null
    return train[flow]

def preprocess_all_flow(train):
    """Run preprocess_single_flow on the columns flow_1 .. flow_20 and return the frame."""
    flow_names = [f'flow_{number}' for number in range(1, 21)]
    for flow_name in flow_names:
        train[flow_name] = preprocess_single_flow(train, flow_name)
    return train

# Fill the gaps of every flow column in each of the four training segments.
train1_, train2_, train3_, train4_ = [
    preprocess_all_flow(segment)
    for segment in (train1_, train2_, train3_, train4_)
]

In [6]:
def make_diff_features(df, flow):
    """Append three lag-based columns derived from ``df[flow]`` and return ``df``.

    Columns added, in this order:
      * ``{flow}_simple_diff1``   - difference to the previous row
      * ``{flow}_simple_shift24`` - value 24 rows earlier
      * ``{flow}_simple_diff168`` - difference to the value 7*24 rows earlier

    The frame is modified in place; the same object is returned.
    """
    base = df[flow]
    week_rows = 7 * 24
    derived = {
        f"{flow}_simple_diff1": base.diff(),
        f"{flow}_simple_shift24": base.shift(24),
        f"{flow}_simple_diff168": base.diff(week_rows),
    }
    for column_name, values in derived.items():
        df[column_name] = values
    return df

def make_trigonometric_features(df, flow, n=24):
    """Add sine/cosine encodings of ``df[flow]`` with period ``n`` and return them.

    Two columns, ``{flow}_sin`` and ``{flow}_cos``, are written to ``df`` in
    place (values rounded to 6 decimals) and a two-column frame holding exactly
    those columns is returned.

    The columns are selected by name rather than by position: taking the last
    two columns returns the wrong data whenever one of the target columns
    already exists somewhere else in the frame.

    NOTE(review): the encoding is applied to the values of ``df[flow]`` itself,
    not to an hour-of-day column - confirm that this is intended.
    """
    sin_col = f'{flow}_sin'
    cos_col = f'{flow}_cos'
    angle = 2 * np.pi / n * df[flow]
    df[sin_col] = np.sin(angle).round(6)
    df[cos_col] = np.cos(angle).round(6)
    return df[[sin_col, cos_col]]

def make_feas(data):
    """Build the per-flow feature tensor for flow_1 .. flow_20.

    For every flow column the raw value plus the three columns produced by
    make_diff_features (4 columns) are combined with the two columns returned
    by make_trigonometric_features, giving 6 features per flow.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns 'time', 'train or test' and flow_1 .. flow_20.
        The first two are dropped; the frame passed in is not modified.

    Returns
    -------
    numpy.ndarray
        Array of shape (rows, 20, 6).
    """
    data = data.drop(['time','train or test'],axis=1)
    cur1, cur2 = [], []
    for flow in tqdm(range(20)):
        flow_col = f'flow_{flow+1}'
        # Only the flow column is used; the weather columns
        # ('R', 'fx', 'T', 'U', 'fs', 'V', 'P') are currently left out.
        train_data_x = data.loc[:, [flow_col]].copy()
        # Lag features start with NaNs: interpolate interior gaps, then back-fill
        # the leading rows. .bfill() replaces fillna(method='bfill'), whose
        # `method` argument is deprecated since pandas 2.1 and removed in 3.0.
        cur1.append(make_diff_features(train_data_x, flow_col).interpolate().bfill())
        cur2.append(make_trigonometric_features(train_data_x, flow_col, n=24))
    # (flows, rows, features) -> (rows, flows, features)
    cur1 = np.stack(cur1,axis=0).transpose(1,0,2)
    cur2 = np.stack(cur2,axis=0).transpose(1,0,2)
    return np.concatenate((cur1,cur2), axis=2)

def generate_dataset(data, seq_len, pre_len, split_ratio=0.8):
    """Cut a feature array into (input window, target window) pairs.

    The first ``int(len(data) * split_ratio)`` rows form the training part and
    the remaining rows the validation part. Within each part, windows start
    every ``seq_len`` rows: the input is ``seq_len`` rows long and the target is
    the ``pre_len`` rows that follow it.

    Parameters
    ----------
    data : numpy.ndarray
        Array whose first axis is the row (time) axis.
    seq_len : int
        Length of each input window and stride between window starts.
    pre_len : int
        Length of each target window.
    split_ratio : float, default 0.8
        Fraction of rows assigned to the training part.

    Returns
    -------
    tuple
        ``(train_x, train_y, val_x, val_y, test_x)``; ``test_x`` is the last
        ``pre_len`` rows of ``data`` with a leading axis of size 1.
    """
    def _cut_windows(part):
        """Return stacked input and target windows taken from ``part``."""
        inputs, targets = [], []
        # +1 so that a window pair ending exactly on the last row is kept; the
        # previous bound dropped it (off-by-one).
        for start in range(0, len(part) - seq_len - pre_len + 1, seq_len):
            middle = start + seq_len
            inputs.append(part[start:middle])
            targets.append(part[middle:middle + pre_len])
        return np.array(inputs), np.array(targets)

    test_x = np.expand_dims(data[-pre_len:], axis=0)
    split_size = int(len(data) * split_ratio)
    train_x, train_y = _cut_windows(data[:split_size])
    val_x, val_y = _cut_windows(data[split_size:])
    return train_x, train_y, val_x, val_y, test_x
# Build the features: each array covers all training segments that precede the
# corresponding test period.
a, b, c, d = (
    make_feas(train1_),
    make_feas(pd.concat([train1_, train2_])),
    make_feas(pd.concat([train1_, train2_, train3_])),
    make_feas(pd.concat([train1_, train2_, train3_, train4_])),
)
print(*(features.shape for features in (a, b, c, d)))
# Build the x / y windows: one week (24*7 rows) in, one week out.
(
    (train1_x, train1_y, val1_x, val1_y, test1_x),
    (train2_x, train2_y, val2_x, val2_y, test2_x),
    (train3_x, train3_y, val3_x, val3_y, test3_x),
    (train4_x, train4_y, val4_x, val4_y, test4_x),
) = (generate_dataset(features, 24*7, 24*7) for features in (a, b, c, d))

100%|██████████| 20/20 [00:00<00:00, 173.88it/s]
100%|██████████| 20/20 [00:00<00:00, 173.17it/s]
100%|██████████| 20/20 [00:00<00:00, 176.68it/s]
100%|██████████| 20/20 [00:00<00:00, 140.84it/s]

(2880, 20, 6) (3456, 20, 6) (4488, 20, 6) (5064, 20, 6)





In [7]:
def lightgbm_train(train_x, train_y, val_x, val_y, test_x):
    """Train one LightGBM regressor per flow and predict the test windows.

    Every input array is indexed as ``[window, step, flow, feature]``. For each
    of the 20 flows the windows are flattened to one row per (window, step);
    the regression target is feature 0 of the matching row of the y-windows.

    Parameters
    ----------
    train_x, train_y : numpy.ndarray
        Training input and target windows.
    val_x, val_y : numpy.ndarray
        Validation input and target windows, used for early stopping.
    test_x : numpy.ndarray
        Input windows to predict from.

    Returns
    -------
    predictions : list of numpy.ndarray
        One prediction vector per flow, in flow order.
    scores : list of float
        Mean squared error per flow, rounded to 3 decimals.
        NOTE(review): this is computed on the *training* rows, not on the
        validation rows - confirm that this is the intended metric.
    """
    fea_nums = train_x.shape[-1]
    ESR = 100    # early-stopping patience, in boosting rounds
    NBR = 3000   # upper bound on boosting rounds
    lgb_params_best = {'objective': 'regression',
                       'metric': ['mse'],
                       'bagging_seed': 2022,
                       'verbose': -1}

    def _flow_rows(windows, flow):
        """Flatten the windows of one flow to a (rows, features) frame."""
        return pd.DataFrame(windows[:, :, flow, :].reshape(-1, fea_nums))

    scores = []
    predictions = []
    for flow in tqdm(range(20)):
        train_data_x = _flow_rows(train_x, flow)
        train_data_y = _flow_rows(train_y, flow).iloc[:, 0]
        val_data_x = _flow_rows(val_x, flow)
        val_data_y = _flow_rows(val_y, flow).iloc[:, 0]
        test_data_x = _flow_rows(test_x, flow)

        train_part = lgb.Dataset(train_data_x, train_data_y)
        val_part = lgb.Dataset(val_data_x, val_data_y)
        # Early stopping is passed as a callback: the `early_stopping_rounds`
        # and `verbose_eval` keyword arguments were removed from lgb.train in
        # LightGBM 4.0. verbose=False keeps the run silent, as before.
        lgb_model = lgb.train(dict(lgb_params_best), train_part, num_boost_round=NBR,
                              valid_sets=[train_part, val_part],
                              valid_names=['train', 'valid'],
                              callbacks=[lgb.early_stopping(ESR, verbose=False)])
        score = mean_squared_error(train_data_y, lgb_model.predict(train_data_x))
        scores.append(round(score, 3))
        prediction_test = lgb_model.predict(test_data_x)
        predictions.append(prediction_test)
    return predictions, scores

In [8]:
# Train and predict for each of the four test periods, printing the per-flow
# scores after every run.
predictions1, scores1 = lightgbm_train(
    train_x=train1_x, train_y=train1_y, val_x=val1_x, val_y=val1_y, test_x=test1_x)
print(scores1)
predictions2, scores2 = lightgbm_train(
    train_x=train2_x, train_y=train2_y, val_x=val2_x, val_y=val2_y, test_x=test2_x)
print(scores2)
predictions3, scores3 = lightgbm_train(
    train_x=train3_x, train_y=train3_y, val_x=val3_x, val_y=val3_y, test_x=test3_x)
print(scores3)
predictions4, scores4 = lightgbm_train(
    train_x=train4_x, train_y=train4_y, val_x=val4_x, val_y=val4_y, test_x=test4_x)
print(scores4)

100%|██████████| 20/20 [02:46<00:00,  8.30s/it]


[88.223, 6.365, 53.366, 36.359, 1.433, 95.098, 5.015, 0.593, 1.032, 0.732, 2.683, 0.677, 0.778, 1.123, 0.431, 3.222, 8.525, 1.825, 0.546, 0.498]


100%|██████████| 20/20 [03:17<00:00,  9.86s/it]


[56.331, 6.9, 7.199, 24.489, 0.718, 74.156, 5.522, 0.677, 0.939, 0.612, 2.274, 0.599, 0.675, 0.975, 0.379, 2.517, 6.78, 1.535, 0.518, 0.522]


100%|██████████| 20/20 [02:50<00:00,  8.52s/it]


[93.212, 6.422, 204.136, 62.252, 1.009, 114.573, 5.75, 0.543, 0.993, 0.773, 1.82, 0.901, 0.691, 0.885, 0.316, 1.942, 5.987, 2.081, 0.648, 0.572]


100%|██████████| 20/20 [02:48<00:00,  8.42s/it]

[96.749, 7.118, 223.561, 52.011, 1.066, 108.037, 5.766, 0.538, 1.019, 0.825, 1.671, 1.216, 0.593, 1.253, 0.289, 1.909, 5.072, 2.039, 0.707, 0.639]





In [9]:
# Turn each period's per-flow predictions into a (rows, flows) block and stack
# the four periods in order.
result = np.concatenate([
    np.vstack(period_predictions).T
    for period_predictions in (predictions1, predictions2, predictions3, predictions4)
])
# Negative predictions are replaced by zero.
result = np.where(result < 0, 0, result)
# Put the submission's time column in front and reuse the template's column names.
result = pd.concat([sub['time'], pd.DataFrame(result)], axis=1)
result.columns = sub.columns
result.to_csv('./lgb_baseline.csv',index=False,encoding='utf-8')
result

Unnamed: 0,time,flow_1,flow_2,flow_3,flow_4,flow_5,flow_6,flow_7,flow_8,flow_9,...,flow_11,flow_12,flow_13,flow_14,flow_15,flow_16,flow_17,flow_18,flow_19,flow_20
0,2022-05-01 01:00:00,27.709220,10.705443,43.317919,27.647086,2.579005,40.045006,5.340375,1.392468,2.316115,...,3.032334,1.974553,1.595645,1.527171,1.123252,2.871445,5.230473,3.118872,1.326130,1.358427
1,2022-05-01 02:00:00,25.134299,5.711133,35.403256,18.992146,2.276685,27.234427,2.668235,0.733532,1.343225,...,1.661315,1.705799,0.790639,0.784686,1.011170,1.771302,2.336396,2.664051,0.811462,0.412802
2,2022-05-01 03:00:00,18.949712,4.465108,33.047575,16.348291,2.138523,26.524499,2.533850,0.517597,1.309367,...,1.220401,1.753431,0.604036,0.344032,0.591441,1.022379,1.833989,2.664051,0.797649,0.412802
3,2022-05-01 04:00:00,18.661581,3.313725,30.374982,15.938176,1.995615,24.620727,2.533850,0.528483,1.297070,...,1.161377,1.726547,0.580285,0.354311,0.410783,0.891259,1.833989,2.523733,0.811462,0.322086
4,2022-05-01 05:00:00,21.679588,4.078497,33.047575,18.869687,2.132076,32.260014,2.626242,0.617558,1.547474,...,1.220401,1.164003,0.590354,0.609457,0.491290,0.973886,1.833989,1.513668,0.853933,0.922613
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667,2022-08-27 20:00:00,71.449656,28.298622,96.583382,66.426033,8.342530,115.946775,24.106232,3.332950,10.744994,...,7.789823,4.720148,5.382780,4.690226,3.211088,8.020117,13.204294,15.629605,3.914041,4.052091
668,2022-08-27 21:00:00,57.224577,40.794165,111.646904,72.676107,10.759275,148.211942,33.952243,4.263352,15.144793,...,11.362948,5.900896,5.937441,6.717000,4.677255,11.488857,17.088258,16.025458,5.823806,4.168951
669,2022-08-27 22:00:00,76.290744,42.293585,112.925219,81.816629,11.170417,153.276739,38.007980,4.877270,17.007439,...,14.590321,6.712959,6.680505,6.953112,4.355213,11.820239,20.217938,16.140706,6.962488,4.891003
670,2022-08-27 23:00:00,61.388159,33.177919,94.710262,91.883094,7.314822,115.783065,25.701638,4.003563,11.618414,...,9.965424,5.722889,6.650729,6.548429,4.496098,12.669119,15.925539,16.122686,6.889558,5.139643
