In [33]:
import os,time,code,logging,random,json,re
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb

def line2floats(line):
    """Parse one comma-separated text line into a 1-D float64 array.

    Every field between commas is converted with ``np.float64``; a field
    that is not a valid number (for example an empty field) raises
    ``ValueError``.
    """
    fields = line.split(',')
    values = list(map(np.float64, fields))
    return np.array(values)
def lines2floatsmatrix(lines):
    """Convert an iterable of comma-separated lines into a NumPy array.

    Each line is parsed by ``line2floats`` and the per-line arrays are
    combined with ``np.array``, giving one row per line when every line has
    the same number of fields.
    """
    rows = []
    for line in lines:
        rows.append(line2floats(line))
    return np.array(rows)
def read_file(filename):
    """Load a comma-separated numeric file and return per-row column deltas.

    The file is read as text, one record per line.  Lines are stripped of
    surrounding whitespace and blank lines are ignored, so trailing newlines,
    whitespace-only lines and CRLF line endings do not reach the float parser.

    Args:
        filename: path of the text file to read.

    Returns:
        A 2-D array whose column ``k`` is ``mtx[:, k + 2] - mtx[:, k + 1]``
        of the parsed matrix ``mtx`` (column 0 is not used), or ``None`` when
        the file is considered empty: its text contains no newline at all, or
        it has no non-blank line.
    """
    with open(filename, 'r') as f:
        content = f.read()
    # Same convention as before: text without a single newline counts as empty.
    if '\n' not in content:
        return None
    # Strip first, then drop blanks, so "  " or "\r" lines are not parsed.
    rows = [row for row in (raw.strip() for raw in content.split('\n')) if row]
    if not rows:
        return None
    mtx = lines2floatsmatrix(rows)
    # Difference between each column (from index 2 on) and its left neighbour.
    dtime = mtx[:, 2:] - mtx[:, 1:-1]
    return dtime
def filter_data(pattern_train, pattern_test, data_dir='data/exp8'):
    """Collect training and test samples from the files in ``data_dir``.

    Every file name is matched (``re.match``, i.e. anchored at the start)
    against ``pattern_train`` first and ``pattern_test`` second.  The label of
    a file is the text between ``'load'`` and ``'packetsize'`` in its name,
    parsed as ``np.float32``; its samples come from ``read_file``.  Files for
    which ``read_file`` returns ``None`` are reported and skipped.

    Args:
        pattern_train: regular expression selecting training files.
        pattern_test: regular expression selecting test files.
        data_dir: directory to scan; defaults to the original ``data/exp8``.

    Returns:
        ``(xtrain, xtest, ytrain, ytest)``.  When at least one training file
        was loaded, ``xtrain``/``xtest`` are 2-D arrays with the rows of all
        matching files stacked, ``ytrain`` holds one label per training row
        and ``ytest`` holds one label per test *file*.  When no training file
        was loaded, the four collected lists are returned unchanged.
    """
    ptrain = re.compile(pattern_train)
    ptest = re.compile(pattern_test)
    xtrain, xtest, ytrain, ytest = [], [], [], []
    for filename in os.listdir(data_dir):
        # Pick the destination lists once instead of duplicating the loading
        # code for the train and test branches.
        if ptrain.match(filename):
            samples, labels = xtrain, ytrain
        elif ptest.match(filename):
            samples, labels = xtest, ytest
        else:
            continue
        begin = filename.find('load') + 4
        end = filename.find('packetsize')
        load = np.float32(filename[begin:end])
        data = read_file(os.path.join(data_dir, filename))
        if data is None:
            print('file {} is empty!'.format(filename))
            continue
        samples.append(data)
        labels.append(load)
    if len(xtrain) == 0:
        return (xtrain, xtest, ytrain, ytest)
    n_feature = xtrain[0].shape[1]
    # Repeat each label by the row count of its own file, so files with
    # different numbers of rows still get correctly aligned labels.
    rows_per_file = [block.shape[0] for block in xtrain]
    xtrain = np.concatenate(xtrain, axis=0)
    ytrain = np.repeat(np.array(ytrain), rows_per_file)
    if xtest:
        xtest = np.concatenate(xtest, axis=0)
    else:
        # np.concatenate rejects an empty list; keep the (0, n_feature) shape.
        xtest = np.empty((0, n_feature))
    ytest = np.array(ytest)
    return (xtrain, xtest, ytrain, ytest)
def get_data():
    """Build the data set for every packet size from 1500 to 9000 (step 1500).

    For each packet size the train and test file-name patterns are filled in
    and passed to ``filter_data``.

    Returns:
        A list with one ``filter_data`` result per packet size, in increasing
        packet-size order.
    """
    train_template = r'^link1000load[0-9\.]+packetsize{}exp8\.txt'
    test_template = r'^link1000load[0-9\.]+packetsize{}exp8test[0-9]*\.txt'
    return [
        filter_data(train_template.format(size), test_template.format(size))
        for size in range(1500, 9001, 1500)
    ]
def precision_acc(pred, real, d):
    """Return the fraction of predictions within ``d`` of the real value.

    A prediction counts as a hit when ``|pred - real| <= d``.
    """
    abs_error = np.abs(pred - real)
    hits = abs_error <= d
    return np.mean(hits)

def run_method(param, data, measurement, packet_size):
    """Run one prediction method, score it, and time the whole operation.

    Args:
        param: dict with key ``'method'`` (a callable invoked as
            ``method(data, kargs, packet_size)``) and key ``'kargs'`` (the
            second argument handed to that callable).
        data: data set passed both to the method and to ``measurement``.
        measurement: callable invoked as ``measurement(preds, data)``.
        packet_size: forwarded unchanged to the method.

    Returns:
        ``(ret, seconds)`` where ``ret`` is the value returned by
        ``measurement`` and ``seconds`` is the elapsed time of prediction plus
        measurement.
    """
    # perf_counter is monotonic and high-resolution, so short runs are timed
    # reliably and system clock adjustments cannot distort the result.
    started = time.perf_counter()
    method = param['method']
    kargs = param['kargs']
    preds = method(data, kargs, packet_size)
    ret = measurement(preds, data)
    elapsed = time.perf_counter() - started
    return ret, elapsed
def list_formatter(arr, fmt):
    """Format every element of ``arr`` with ``fmt`` and join them by spaces.

    ``fmt`` is a ``str.format`` template with one placeholder, e.g.
    ``'{:.2f}'``.
    """
    pieces = []
    for item in arr:
        pieces.append(fmt.format(item))
    return ' '.join(pieces)
def linear_method(data, kargs, packet_size):
    """Predict the label by linear interpolation on the first feature column.

    The mean of feature column 0 is taken over the training rows carrying the
    largest label and over those carrying the smallest label; the straight
    line through these two points maps column 0 of the test rows to a
    prediction.  ``kargs`` and ``packet_size`` are accepted for interface
    compatibility with the other methods and are not used.
    """
    xtrain, xtest, ytrain, _ = data
    top_label = np.max(ytrain)
    bottom_label = np.min(ytrain)
    # Anchor points: mean first-column value at the two extreme labels.
    top_anchor = np.mean(xtrain[ytrain == top_label, 0])
    bottom_anchor = np.mean(xtrain[ytrain == bottom_label, 0])
    label_span = top_label - bottom_label
    offset = xtest[:, 0] - bottom_anchor
    return label_span * offset / (top_anchor - bottom_anchor) + bottom_label
def lightgbm_method(data, kargs_, packet_size):
    """Train a LightGBM booster on the training split and predict the test split.

    ``kargs_`` is copied before use, so the caller's dict is left untouched.
    Its ``'num_boost_round'`` entry is removed from the copy and passed to
    ``lgb.train`` as the number of boosting rounds; every other entry is
    forwarded as a training parameter.  ``packet_size`` is not used.
    """
    xtrain, xtest, ytrain, _ = data
    params = kargs_.copy()
    train_set = lgb.Dataset(xtrain, ytrain)
    rounds = params.pop('num_boost_round')
    booster = lgb.train(params, train_set, num_boost_round=rounds)
    return booster.predict(xtest)
def xgboost_method(data, kargs_, packet_size):
    """Train an XGBoost booster on the training split and predict the test split.

    ``kargs_`` is copied before use, so the caller's dict is left untouched.
    Its ``'num_round'`` entry is removed from the copy and passed to
    ``xgb.train`` as ``num_boost_round``; every other entry is forwarded as a
    training parameter.  ``packet_size`` is not used.
    """
    xtrain, xtest, ytrain, _ = data
    params = kargs_.copy()
    train_matrix = xgb.DMatrix(xtrain, label=ytrain)
    test_matrix = xgb.DMatrix(xtest)
    rounds = params.pop('num_round')
    booster = xgb.train(params, train_matrix, num_boost_round=rounds)
    return booster.predict(test_matrix)

# LightGBM configuration.  lightgbm_method removes 'num_boost_round' from a
# copy of 'kargs' and forwards the remaining entries to lgb.train.
param_lightgbm = dict(
    method=lightgbm_method,
    kargs=dict(
        boosting_type='gbdt',
        objective='l2',
        metric='l2',
        num_leaves=1000,
        learning_rate=0.1,
        feature_fraction=1,
        bagging_fraction=1,
        bagging_freq=5,
        verbose=-1,
        num_boost_round=100,
    ),
)
# XGBoost configuration.  xgboost_method removes 'num_round' from a copy of
# 'kargs' and forwards the remaining entries to xgb.train.
param_xgboost = dict(
    method=xgboost_method,
    kargs=dict(
        max_depth=6,
        eta=0.06,
        verbosity=0,
        objective='reg:squarederror',
        num_round=200,
        tree_method='auto',
        dump_model=None,
    ),
)

# Linear baseline.  linear_method does not read its 'kargs' argument.
param_linear = dict(
    method=linear_method,
    kargs=dict(
        log=None,
    ),
)
# Registry of all methods, keyed by the name printed by run_experiment.
param_dict = dict(
    linear=param_linear,
    lightgbm=param_lightgbm,
    xgboost=param_xgboost,
)
def atomic_metric(pred, ytest):
    """Score predictions against labels that correspond element by element.

    Returns:
        A list of six values: the ``precision_acc`` score for the tolerances
        0.5, 1, 2 and 5 (in that order), then the mean absolute error and the
        root mean squared error.
    """
    tolerances = (0.5, 1, 2, 5)
    scores = [precision_acc(pred, ytest, tol) for tol in tolerances]
    abs_error = np.abs(pred - ytest)
    scores.append(np.mean(abs_error))
    scores.append(np.sqrt(mean_squared_error(pred, ytest)))
    return scores

def meas_test(pred, data):
    """Score test predictions per row and per group of consecutive rows.

    ``data`` is ``(xtrain, xtest, ytrain, ytest)`` where ``ytest`` holds one
    label per group and ``xtest`` holds the same number of rows for every
    group, stored consecutively (three rows per group in the original
    experiments).

    Args:
        pred: 1-D array with one prediction per row of ``xtest``.
        data: the data-set tuple described above.

    Returns:
        ``[per_row, per_group]``: ``atomic_metric`` of the raw predictions
        against the labels repeated per row, and ``atomic_metric`` of the
        predictions averaged within each group against ``ytest``.

    Raises:
        ValueError: if ``ytest`` is empty or the number of test rows is not a
            whole multiple of the number of test labels.
    """
    xtrain, xtest, ytrain, ytest = data
    n_rows, n_groups = xtest.shape[0], ytest.shape[0]
    # Explicit validation: an assert would vanish under ``python -O`` and an
    # empty ytest would otherwise surface as a ZeroDivisionError.
    if n_groups == 0 or n_rows % n_groups != 0:
        raise ValueError(
            'test rows ({}) must be a non-zero whole multiple of test labels ({})'
            .format(n_rows, n_groups))
    ratio = n_rows // n_groups
    ytest_full = np.repeat(ytest, ratio)
    ret1 = atomic_metric(pred, ytest_full)
    # Use the computed group size here as well instead of a hard-coded 3.
    pred_averaged = np.mean(pred.reshape((-1, ratio)), axis=1)
    ret2 = atomic_metric(pred_averaged, ytest)
    return [ret1, ret2]

def run_experiment(data_set):
    """Run every method in ``param_dict`` on each packet-size data set.

    ``data_set[k]`` is paired with the k-th packet size of
    ``range(1500, 9001, 1500)``.  For each packet size all methods are run
    through ``run_method`` with ``meas_test`` as the measurement, then one
    report is printed: the method name, its run time, and its two metric rows.

    Args:
        data_set: sequence of ``(xtrain, xtest, ytrain, ytest)`` tuples, one
            per packet size.
    """
    for index, packet_size in enumerate(range(1500, 9001, 1500)):
        data = data_set[index]
        if len(data[0]) == 0:
            # NOTE(review): this stops at the first packet size without
            # training data, so later packet sizes are not run either --
            # confirm whether ``continue`` was intended.
            break
        results = []
        for name, param in param_dict.items():
            ret, timecost = run_method(param, data, meas_test, packet_size)
            results.append((name, ret, timecost))

        print('experiment at {} result:'.format(packet_size))
        for name, ret, timecost in results:
            print('{} use {:.2f} s\n\t{}\n\t{}'.format(name, timecost, list_formatter(ret[0], '{:.2f}'), list_formatter(ret[1], '{:.2f}')))
        print('\n')
def simple_run(data, method, packet_size):
    """Run one method configuration with ``meas_test`` as the measurement.

    ``method`` is a parameter dict as expected by ``run_method``.  Returns the
    ``(ret, timecost)`` pair produced by ``run_method``.
    """
    return run_method(method, data, meas_test, packet_size)

def transform(data, delete_count, add_sum):
    """Drop leading feature columns and optionally prepend their row sum.

    Args:
        data: ``(xtrain, xtest, ytrain, ytest)`` with 2-D ``xtrain``/``xtest``.
        delete_count: number of leading columns to drop from both feature
            matrices; values ``<= 0`` leave the matrices untouched.
        add_sum: when truthy, a new first column holding the sum of the
            remaining columns of each row is prepended.

    Returns:
        ``(xtrain_, xtest_, ytrain, ytest)``; the label arrays are passed
        through unchanged.
    """
    xtrain, xtest, ytrain, ytest = data
    if delete_count > 0:
        # Slicing a 2-D array already yields a 2-D array; no reshape needed.
        xtrain_ = xtrain[:, delete_count:]
        xtest_ = xtest[:, delete_count:]
    else:
        xtrain_ = xtrain
        xtest_ = xtest
    if add_sum:
        # keepdims gives an (n, 1) column that can be concatenated directly.
        train_sum = np.sum(xtrain_, axis=1, keepdims=True)
        test_sum = np.sum(xtest_, axis=1, keepdims=True)
        xtrain_ = np.concatenate((train_sum, xtrain_), axis=1)
        xtest_ = np.concatenate((test_sum, xtest_), axis=1)
    return xtrain_, xtest_, ytrain, ytest
def transform2(data, sum_remove):
    """Reduce every sample to one feature: the sum of its trailing columns.

    The first ``sum_remove`` columns are left out of the sum.  Returns
    ``(xtrain_, xtest_, ytrain, ytest)`` where both feature matrices have a
    single column and the labels are passed through unchanged.
    """
    xtrain, xtest, ytrain, ytest = data
    xtrain_ = xtrain[:, sum_remove:].sum(axis=1).reshape((-1, 1))
    xtest_ = xtest[:, sum_remove:].sum(axis=1).reshape((-1, 1))
    return xtrain_, xtest_, ytrain, ytest

In [3]:
# Load the data sets for all packet sizes once and report the loading time.
# data_set is reused by the experiment cells that follow.
t1=time.time()
data_set=get_data()
t2=time.time()
print('prepare data use {} s'.format(t2-t1))

file link1000load580packetsize6000exp8test3.txt is empty!
prepare data use 366.5722584724426 s


In [48]:
# xgboost, packet size 1500: improvement -- none.
# Sweep the number of dropped leading columns (with the row sum prepended).
data=data_set[0]
print(data[0][0].shape)
for i in range(22):
    data_=transform(data,i,True)
    ret,tmcost=simple_run(data_,param_xgboost,1500)
    print('{}: use {} s'.format(i,tmcost))
    print(list_formatter(ret[0],'{:.2f}'),list_formatter(ret[1],'{:.2f}'),sep='\n')

(999,)
0: use 49.0311713218689 s
0.25 0.47 0.75 0.98 1.48 3.61
0.39 0.68 0.91 1.00 0.90 2.10
1: use 50.140121936798096 s
0.25 0.46 0.74 0.98 1.50 3.63
0.39 0.66 0.91 1.00 0.91 2.10
2: use 50.335896730422974 s
0.24 0.45 0.74 0.98 1.50 3.62
0.39 0.67 0.92 1.00 0.91 2.10
3: use 48.68917798995972 s
0.24 0.45 0.74 0.98 1.49 3.61
0.39 0.66 0.92 1.00 0.90 2.10
4: use 51.86839437484741 s
0.24 0.45 0.74 0.98 1.49 3.60
0.38 0.66 0.92 1.00 0.91 2.10
5: use 51.714091777801514 s
0.24 0.45 0.76 0.98 1.46 3.59
0.39 0.67 0.92 1.00 0.89 2.09
6: use 49.479349851608276 s
0.26 0.48 0.77 0.98 1.42 3.56
0.40 0.68 0.93 1.00 0.87 2.08
7: use 50.763015270233154 s
0.26 0.48 0.77 0.98 1.42 3.57
0.40 0.69 0.92 1.00 0.88 2.09
8: use 53.94688653945923 s
0.25 0.47 0.76 0.98 1.46 3.59
0.40 0.68 0.92 1.00 0.89 2.10
9: use 52.36423397064209 s
0.25 0.47 0.76 0.98 1.45 3.59
0.40 0.67 0.92 1.00 0.90 2.10
10: use 51.019197940826416 s
0.25 0.47 0.76 0.98 1.46 3.60
0.39 0.68 0.92 1.00 0.89 2.11
11: use 51.570595264434814 s
0

xgboost 全数据
0: use 49.0311713218689 s
0.25 0.47 0.75 0.98 1.48 3.61
0.39 0.68 0.91 1.00 0.90 2.10
3: use 10.42889404296875 s      +20
0.45 0.70 0.90 1.00 0.91 3.96
0.62 0.86 0.97 1.00 0.57 2.29
xgboost recvtime             -10
7: use 1.4162912368774414 s
0.16 0.31 0.57 0.92 2.19 4.14
0.26 0.49 0.80 0.99 1.32 2.43
3: use 1.402277946472168 s      +10
0.35 0.56 0.75 0.92 1.61 4.43
0.47 0.69 0.86 0.99 0.99 2.57
线性
12: use 0.0020110607147216797 s  -10
0.15 0.29 0.53 0.88 2.46 4.40
0.20 0.37 0.66 0.97 1.73 2.77
14: use 0.0010008811950683594 s  +12
0.37 0.59 0.75 0.92 1.67 7.21
0.49 0.69 0.86 0.99 1.02 4.16

In [47]:
# xgboost, packet size 9000: improvement -- best when the first 2 or more
# packets are discarded.
data=data_set[5]
for i in range(20):
    data_=transform(data,i,True)
    ret,tmcost=simple_run(data_,param_xgboost,9000)
    print('{}: use {} s'.format(i,tmcost))
    print(list_formatter(ret[0],'{:.2f}'),list_formatter(ret[1],'{:.2f}'),sep='\n')

0: use 10.370990514755249 s
0.33 0.58 0.85 0.99 1.15 3.97
0.49 0.78 0.96 1.00 0.71 2.29
1: use 10.434631824493408 s
0.36 0.64 0.89 1.00 1.01 3.90
0.56 0.84 0.97 1.00 0.61 2.25
2: use 10.602035999298096 s
0.41 0.68 0.89 1.00 0.94 3.96
0.61 0.85 0.98 1.00 0.57 2.29
3: use 10.42889404296875 s
0.45 0.70 0.90 1.00 0.91 3.96
0.62 0.86 0.97 1.00 0.57 2.29
4: use 10.630379438400269 s
0.43 0.68 0.89 1.00 0.94 3.98
0.61 0.85 0.97 1.00 0.58 2.32
5: use 10.028515100479126 s
0.43 0.69 0.89 1.00 0.93 4.00
0.62 0.85 0.97 1.00 0.57 2.32
6: use 9.845177173614502 s
0.43 0.69 0.90 1.00 0.92 4.02
0.62 0.86 0.98 1.00 0.57 2.33
7: use 10.228681564331055 s
0.42 0.68 0.89 1.00 0.95 4.05
0.61 0.85 0.97 1.00 0.58 2.35
8: use 10.109064817428589 s
0.43 0.69 0.89 1.00 0.94 4.06
0.63 0.86 0.97 1.00 0.57 2.35
9: use 10.236714839935303 s
0.43 0.68 0.89 0.99 0.94 4.06
0.61 0.85 0.97 1.00 0.58 2.37
10: use 10.112480163574219 s
0.42 0.68 0.89 1.00 0.95 4.11
0.61 0.85 0.97 1.00 0.59 2.39
11: use 10.005488395690918 s
0.43

In [36]:
# xgboost on recvtime (single summed feature), packet size 1500: small improvement.
data=data_set[0]
for i in range(22):
    data_=transform2(data,i)
    ret,tmcost=simple_run(data_,param_xgboost,1500)
    print('{}: use {} s'.format(i,tmcost))
    print(list_formatter(ret[0],'{:.2f}'),list_formatter(ret[1],'{:.2f}'),sep='\n')

0: use 1.4112858772277832 s
0.10 0.20 0.39 0.77 3.30 5.14
0.16 0.33 0.62 0.97 1.84 2.85
1: use 1.4413135051727295 s
0.13 0.27 0.49 0.86 2.64 4.55
0.20 0.39 0.71 0.99 1.57 2.61
2: use 1.4192934036254883 s
0.13 0.26 0.49 0.87 2.62 4.51
0.20 0.39 0.70 0.98 1.59 2.64
3: use 1.4157960414886475 s
0.12 0.25 0.47 0.88 2.61 4.46
0.21 0.39 0.69 0.98 1.58 2.62
4: use 1.3912684917449951 s
0.12 0.24 0.47 0.89 2.58 4.42
0.19 0.39 0.68 0.98 1.63 2.66
5: use 1.4757113456726074 s
0.14 0.27 0.50 0.90 2.42 4.31
0.20 0.40 0.71 0.99 1.54 2.58
6: use 1.4733483791351318 s
0.15 0.30 0.55 0.92 2.24 4.17
0.25 0.48 0.76 0.99 1.38 2.48
7: use 1.4162912368774414 s
0.16 0.31 0.57 0.92 2.19 4.14
0.26 0.49 0.80 0.99 1.32 2.43
8: use 1.4132881164550781 s
0.15 0.29 0.54 0.92 2.28 4.21
0.23 0.43 0.75 0.99 1.45 2.52
9: use 1.453324556350708 s
0.15 0.30 0.56 0.92 2.24 4.19
0.23 0.45 0.75 0.99 1.43 2.52
10: use 1.4087889194488525 s
0.17 0.33 0.58 0.92 2.17 4.18
0.26 0.49 0.79 0.99 1.35 2.48
11: use 1.5047438144683838 s
0.1

In [43]:
# xgboost on recvtime (single summed feature), packet size 9000: small improvement.
data=data_set[5]
for i in range(22):
    data_=transform2(data,i)
    ret,tmcost=simple_run(data_,param_xgboost,9000)
    print('{}: use {} s'.format(i,tmcost))
    print(list_formatter(ret[0],'{:.2f}'),list_formatter(ret[1],'{:.2f}'),sep='\n')

0: use 1.4687080383300781 s
0.14 0.26 0.45 0.80 3.00 5.31
0.16 0.33 0.63 0.97 1.84 3.05
1: use 1.4242987632751465 s
0.20 0.37 0.62 0.93 2.03 4.50
0.32 0.57 0.82 0.99 1.20 2.61
2: use 1.4903581142425537 s
0.21 0.39 0.61 0.93 2.02 4.56
0.37 0.57 0.81 0.99 1.19 2.65
3: use 1.402277946472168 s
0.35 0.56 0.75 0.92 1.61 4.43
0.47 0.69 0.86 0.99 0.99 2.57
4: use 1.4122872352600098 s
0.22 0.38 0.59 0.92 2.11 4.65
0.36 0.57 0.80 0.99 1.22 2.69
5: use 1.3812587261199951 s
0.23 0.42 0.64 0.92 1.96 4.60
0.38 0.60 0.82 0.99 1.16 2.69
6: use 1.3852758407592773 s
0.31 0.53 0.73 0.93 1.69 4.49
0.45 0.67 0.86 0.99 1.03 2.62
7: use 1.3926422595977783 s
0.23 0.37 0.57 0.92 2.15 4.72
0.36 0.56 0.79 0.99 1.24 2.74
8: use 1.4152891635894775 s
0.26 0.46 0.68 0.92 1.87 4.62
0.40 0.62 0.84 0.99 1.11 2.69
9: use 1.4022784233093262 s
0.25 0.45 0.68 0.92 1.88 4.62
0.39 0.62 0.84 0.99 1.12 2.69
10: use 1.4142885208129883 s
0.22 0.37 0.56 0.91 2.20 4.82
0.36 0.56 0.79 0.99 1.26 2.81
11: use 1.4623329639434814 s
0.3

In [37]:
# Linear method, packet size 1500: there is an improvement.
data=data_set[0]
for i in range(22):
    data_=transform(data,i,True)
    ret,tmcost=simple_run(data_,param_linear,1500)
    print('{}: use {} s'.format(i,tmcost))
    print(list_formatter(ret[0],'{:.2f}'),list_formatter(ret[1],'{:.2f}'),sep='\n')

0: use 0.0020101070404052734 s
0.05 0.11 0.23 0.56 5.11 6.88
0.01 0.03 0.07 0.54 4.81 5.37
1: use 0.002001523971557617 s
0.05 0.09 0.21 0.62 4.57 6.16
0.01 0.03 0.09 0.66 4.33 4.90
2: use 0.003003835678100586 s
0.06 0.13 0.26 0.67 4.10 5.73
0.04 0.07 0.18 0.77 3.74 4.41
3: use 0.002002716064453125 s
0.10 0.19 0.35 0.75 3.41 5.14
0.06 0.12 0.30 0.91 2.93 3.69
4: use 0.0020020008087158203 s
0.08 0.16 0.30 0.71 3.74 5.39
0.04 0.09 0.24 0.83 3.35 4.09
5: use 0.0020017623901367188 s
0.07 0.14 0.28 0.73 3.68 5.28
0.05 0.10 0.24 0.86 3.29 4.01
6: use 0.002002716064453125 s
0.07 0.15 0.30 0.77 3.42 5.03
0.07 0.14 0.31 0.90 2.97 3.75
7: use 0.002002239227294922 s
0.09 0.18 0.36 0.83 3.06 4.75
0.09 0.19 0.41 0.95 2.52 3.35
8: use 0.003004312515258789 s
0.08 0.16 0.32 0.77 3.41 5.06
0.07 0.14 0.32 0.88 2.97 3.78
9: use 0.001993894577026367 s
0.07 0.15 0.29 0.75 3.54 5.15
0.06 0.13 0.30 0.85 3.11 3.91
10: use 0.0020020008087158203 s
0.07 0.14 0.28 0.74 3.62 5.21
0.06 0.13 0.30 0.83 3.17 3.99
11: u

In [42]:
# Linear method, packet size 9000: there is an improvement.
data=data_set[5]
for i in range(22):
    data_=transform(data,i,True)
    ret,tmcost=simple_run(data_,param_linear,9000)
    print('{}: use {} s'.format(i,tmcost))
    print(list_formatter(ret[0],'{:.2f}'),list_formatter(ret[1],'{:.2f}'),sep='\n')

0: use 0.002010345458984375 s
0.18 0.31 0.45 0.79 3.16 7.45
0.16 0.30 0.60 0.92 2.16 4.42
1: use 0.0010013580322265625 s
0.20 0.38 0.61 0.89 2.27 6.92
0.20 0.41 0.70 0.97 1.72 4.15
2: use 0.0010018348693847656 s
0.21 0.37 0.59 0.93 2.11 6.82
0.34 0.56 0.79 0.99 1.30 3.96
3: use 0.002010822296142578 s
0.25 0.47 0.70 0.91 1.91 6.85
0.28 0.52 0.78 0.97 1.44 4.07
4: use 0.0020036697387695312 s
0.22 0.41 0.61 0.89 2.21 6.99
0.31 0.52 0.73 0.98 1.49 4.11
5: use 0.002002239227294922 s
0.19 0.35 0.56 0.91 2.28 7.01
0.26 0.47 0.71 0.97 1.61 4.20
6: use 0.0020017623901367188 s
0.33 0.53 0.71 0.90 1.83 6.95
0.37 0.58 0.79 0.98 1.33 4.10
7: use 0.0020017623901367188 s
0.22 0.39 0.58 0.91 2.21 7.05
0.35 0.54 0.77 0.99 1.37 4.10
8: use 0.0010004043579101562 s
0.22 0.40 0.65 0.93 2.00 7.01
0.36 0.58 0.82 0.98 1.24 4.08
9: use 0.0020017623901367188 s
0.29 0.51 0.70 0.91 1.90 7.08
0.40 0.60 0.81 0.98 1.24 4.12
10: use 0.0020020008087158203 s
0.23 0.37 0.55 0.91 2.25 7.20
0.35 0.55 0.77 0.98 1.36 4.18
1

In [50]:
# Best setting for each packet size.
# xgboost: for every packet size, try dropping 0..29 leading columns (with the
# row sum prepended) and keep the run with the highest first metric of the
# per-row result (ret[0][0]).
for indice,i in enumerate(range(1500,9001,1500)):
    data=data_set[indice]
    # Start with "nothing recorded" instead of a best value of 0, so a result
    # is always kept even when every run scores 0 and the prints below cannot
    # fail on an empty record.
    best_value,best_i,record=None,0,None
    for j in range(30):
        data_=transform(data,j,True)
        ret,tmcost=simple_run(data_,param_xgboost,i)
        if best_value is None or ret[0][0]>best_value:
            best_value=ret[0][0]
            best_i=j
            record=ret
    print('{} best at removing {}:'.format(i,best_i))
    print(list_formatter(record[0],'{:.2f}'))
    print(list_formatter(record[1],'{:.2f}'))

1500 best at removing 23:
0.27 0.49 0.78 0.98 1.40 3.62
0.41 0.70 0.92 1.00 0.86 2.12
3000 best at removing 15:
0.29 0.51 0.80 0.99 1.34 6.46
0.44 0.74 0.94 1.00 0.82 3.74
4500 best at removing 9:
0.31 0.56 0.84 0.99 1.17 2.26
0.48 0.78 0.95 1.00 0.71 1.32
6000 best at removing 13:
0.35 0.60 0.84 0.99 1.10 1.71
0.52 0.79 0.96 1.00 0.66 1.01
7500 best at removing 10:
0.42 0.68 0.88 0.99 0.94 1.42
0.60 0.83 0.97 1.00 0.58 0.86
9000 best at removing 3:
0.45 0.70 0.90 1.00 0.91 3.96
0.62 0.86 0.97 1.00 0.57 2.29


1500 best at removing 23:
0.27 0.49 0.78 0.98 1.40 3.62
0.41 0.70 0.92 1.00 0.86 2.12
3000 best at removing 15:
0.29 0.51 0.80 0.99 1.34 6.46
0.44 0.74 0.94 1.00 0.82 3.74
4500 best at removing 9:
0.31 0.56 0.84 0.99 1.17 2.26
0.48 0.78 0.95 1.00 0.71 1.32
6000 best at removing 13:
0.35 0.60 0.84 0.99 1.10 1.71
0.52 0.79 0.96 1.00 0.66 1.01
7500 best at removing 10:
0.42 0.68 0.88 0.99 0.94 1.42
0.60 0.83 0.97 1.00 0.58 0.86
9000 best at removing 3:
0.45 0.70 0.90 1.00 0.91 3.96
0.62 0.86 0.97 1.00 0.57 2.29