In [3]:
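# Collect per-order information from the training sets train_data12.csv and train_data13.csv
# (loaded below from train_wash12.csv and train_wash13.csv).
# The route data in this part may be incomplete; it is mainly used to collect the
# longitude/latitude coordinates of each order's destination port.
# Collected fields: cluster numbers of the route's start and end ports, start/end port
# coordinates, start port, intermediate ports, destination port, and total port-congestion time.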
import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')

# Port lookup table; the code below reads its TRANS_NODE_NAME, cluster,
# LONGITUDE and LATITUDE columns.
port_path = '../../dataset/test_data/ports_cluster.csv'
port_df = pd.read_csv(port_path)

# Process (CPU) time reference used by the elapsed-time prints further down.
start = time.process_time()

def distance(LonA, LatA, LonB, LatB):
    """Return the haversine distance between two points, in kilometres.

    Longitudes and latitudes are given in degrees.  A spherical Earth of
    radius 6371 km is used and the result is rounded to two decimals.
    """
    earth_radius_km = 6371

    def to_radians(degrees):
        return degrees * np.pi / 180.0

    lat_a = to_radians(LatA)
    lat_b = to_radians(LatB)
    half_dlat = (lat_a - lat_b) / 2
    half_dlon = (to_radians(LonA) - to_radians(LonB)) / 2

    # Haversine terms: latitude contribution plus the longitude contribution
    # scaled by the cosines of both latitudes.
    lat_term = np.power(np.sin(half_dlat), 2)
    lon_term = np.cos(lat_a) * np.cos(lat_b) * np.power(np.sin(half_dlon), 2)
    central_angle = 2 * np.arcsin(np.sqrt(lat_term + lon_term))

    # Keep two decimal places.
    return np.round(central_angle * earth_radius_km * 100) / 100


# Length of a route.
def get_distance(df):
    """Sum the distances between consecutive points of one track.

    Rows sharing a ``geo_bin`` value are first collapsed to their last
    occurrence; the remaining rows are then chained in order, calling
    ``distance`` on the ``longitude``/``latitude`` of each adjacent pair.
    Returns 0 when fewer than two rows remain.
    """
    track = df.drop_duplicates(['geo_bin'], keep='last').reset_index(drop=True)
    segment_count = len(track) - 1
    return sum(
        distance(track.loc[k, 'longitude'], track.loc[k, 'latitude'],
                 track.loc[k + 1, 'longitude'], track.loc[k + 1, 'latitude'])
        for k in range(segment_count)
    )


# Build the feature row of one order.
def order_feature(df, start_port_num, start_lon, start_lat, end_port_num, end_lon, end_lat, trace,  Dir, start_anchor_time, 
                  end_anchor_time, middle_anchor_time, anchor_time, dis):
    """Return a single feature row (``pd.Series``) describing one order.

    The identifying fields (``loadingOrder``, ``timestamp``, ``carrierName``,
    ``vesselMMSI``, ``TRANSPORT_TRACE``) and ``label`` are taken from the
    first row of ``df``; every other field comes from the arguments.
    ``aver_speed`` is ``dis`` divided by the ``label`` stored at index 0.

    Unlike the previous implementation, ``df`` is not modified: the row is
    assembled directly instead of broadcasting each scalar over a full column
    of the caller's frame and then slicing row 0.

    Parameters
    ----------
    df : DataFrame holding the order's records; must contain the columns
        listed above and an index label ``0`` (the caller resets the index).
    start_port_num, end_port_num : identifiers of the start/end ports.
    start_lon, start_lat, end_lon, end_lat : port coordinates.
    trace : route descriptor stored in the ``trace`` field.
    Dir : value stored in the ``Dir`` field.
    start_anchor_time, end_anchor_time, middle_anchor_time, anchor_time :
        waiting times stored under the fields of the same names.
    dis : route length stored in ``dis`` and used for ``aver_speed``.

    Returns
    -------
    pd.Series indexed by the 20 feature names, in a fixed order.
    """
    columns = ['loadingOrder', 'timestamp', 'carrierName', 'vesselMMSI', 'TRANSPORT_TRACE', 'start_port', 'start_lon', 'start_lat',
               'end_port', 'end_lon', 'end_lat', 'trace', 'Dir', 'start_anchor_time', 'end_anchor_time', 'middle_anchor_time',
               'anchor_time', 'aver_speed', 'dis', 'label']

    first = df.iloc[0]
    label = df.loc[0, 'label']

    # Divide with NumPy scalars so a zero label yields inf/nan (as it did with
    # the column-based computation) instead of raising ZeroDivisionError.
    with np.errstate(divide='ignore', invalid='ignore'):
        aver_speed = np.float64(dis) / np.float64(label)

    computed = {
        'start_port': start_port_num,
        'start_lon': start_lon,
        'start_lat': start_lat,
        'end_port': end_port_num,
        'end_lon': end_lon,
        'end_lat': end_lat,
        'trace': trace,
        'Dir': Dir,
        'start_anchor_time': start_anchor_time,
        'end_anchor_time': end_anchor_time,
        'middle_anchor_time': middle_anchor_time,
        'anchor_time': anchor_time,
        'aver_speed': aver_speed,
        'dis': dis,
    }

    values = [computed[name] if name in computed else first[name] for name in columns]
    return pd.Series(values, index=columns, name=first.name)


# Look up a port's cluster number and coordinates.
def get_port_num(port):
    """Return ``(cluster, LONGITUDE, LATITUDE)`` for ``port`` from ``port_df``.

    The table is scanned twice in row order: first for a row whose
    ``TRANS_NODE_NAME`` equals ``str(port)``, then, if none is found, for the
    first row whose name contains ``str(port)`` as a substring.  Returns
    ``None`` when neither pass finds a row.
    """
    wanted = str(port)
    fields = ('cluster', 'LONGITUDE', 'LATITUDE')

    # Exact match takes priority; substring match is the fallback.
    matchers = (
        lambda name: wanted == name,
        lambda name: wanted in name,
    )
    for is_match in matchers:
        for row in range(len(port_df)):
            if is_match(str(port_df.loc[row, 'TRANS_NODE_NAME'])):
                return tuple(port_df.loc[row, field] for field in fields)
    return None

        
def get_jam_info(train_df, start_lon, start_lat, end_lon, end_lat):
    """Estimate waiting times of one order and drop its end-port records.

    A record counts as "waiting" when it follows the previous record by more
    than 0 and less than 20 minutes and its ``speed`` is below 3.  A record is
    "at" a port when both its longitude and latitude differ from the port's
    by less than 0.25.

    Helper columns (port coordinates, absolute coordinate differences and
    ``minute_diff``) are added to ``train_df`` in place.

    Returns
    -------
    (frame, start_anchor_time, end_anchor_time, middle_anchor_time, anchor_time)
        ``frame`` is ``train_df`` without the records at the end port (index
        reset).  The times are in hours; ``anchor_time`` is the waiting time
        over ``frame``, ``middle_anchor_time`` is ``anchor_time`` minus the
        start-port waiting time, and any value below 0.1 is replaced by 0.
    """
    port_box = 0.25

    def waiting_hours(frame, near_port=None):
        # Sum, in hours, the gaps of the slow records (optionally only those
        # flagged by ``near_port``).
        waiting = (frame['minute_diff'] < 20) & (frame['minute_diff'] > 0) & (frame['speed'] < 3)
        if near_port is not None:
            waiting = waiting & near_port
        minutes = frame[waiting].reset_index(drop=True).loc[:, 'minute_diff']
        return np.sum(np.array(minutes)) / 60

    def drop_small(hours):
        return 0 if hours < 0.1 else hours

    # Port coordinates, broadcast onto every record.
    for column, value in (('start_lon', start_lon), ('start_lat', start_lat),
                          ('end_lon', end_lon), ('end_lat', end_lat)):
        train_df[column] = value

    # Absolute coordinate offsets from each port.
    train_df['lon_diff_start'] = (train_df['start_lon'] - train_df['longitude']).abs()
    train_df['lat_diff_start'] = (train_df['start_lat'] - train_df['latitude']).abs()
    train_df['lon_diff_end'] = (train_df['end_lon'] - train_df['longitude']).abs()
    train_df['lat_diff_end'] = (train_df['end_lat'] - train_df['latitude']).abs()

    # Minutes since the previous record; the first record gets 0.
    gap_minutes = train_df['timestamp'].diff(1).dt.total_seconds() / 60
    train_df['minute_diff'] = gap_minutes.fillna(0)

    at_start = (train_df['lon_diff_start'] < port_box) & (train_df['lat_diff_start'] < port_box)
    at_end = (train_df['lon_diff_end'] < port_box) & (train_df['lat_diff_end'] < port_box)

    # Waiting time at the start port and at the end port.
    start_anchor_time = waiting_hours(train_df, at_start)
    end_anchor_time = waiting_hours(train_df, at_end)

    # Remove the records at the end port.
    away_from_end = (train_df['lon_diff_end'] > port_box) | (train_df['lat_diff_end'] > port_box)
    train_df = train_df[away_from_end].reset_index(drop=True)

    # Total waiting time over what is left, and the part not spent at the start port.
    anchor_time = waiting_hours(train_df)
    middle_anchor_time = anchor_time - start_anchor_time

    return (train_df, drop_small(start_anchor_time), drop_small(end_anchor_time),
            drop_small(middle_anchor_time), drop_small(anchor_time))


def get_order_info():
    """Build one feature row per loading order from the washed training files.

    Reads ``train_wash12.csv`` and ``train_wash13.csv`` (no header row), and
    for every ``loadingOrder`` in each file:

    1. resolves the first and last entries of ``TRANSPORT_TRACE`` with
       ``get_port_num`` (an unresolvable port makes the tuple unpacking
       raise ``TypeError``);
    2. computes the waiting times with ``get_jam_info`` and keeps the frame
       it returns; orders with no rows left are skipped;
    3. sets ``label`` to the hours between the first and last remaining
       record, before per-cell de-duplication;
    4. de-duplicates records per ``geo_bin``, measures the route length with
       ``get_distance`` and averages ``direction``;
    5. assembles the row with ``order_feature``.

    Changes from the previous implementation:
    * rows are collected in a list and turned into a frame once, because
      ``DataFrame.append`` no longer exists in pandas 2.x and re-copied the
      whole frame on every call;
    * orders are iterated with ``groupby(sort=False)`` (first-appearance
      order, same as ``unique()``) instead of a full boolean scan per order;
      rows whose ``loadingOrder`` is missing are skipped by ``groupby``;
    * the duplicated ``geo_bin`` assignment was removed;
    * an input with no usable orders yields an empty frame with the expected
      columns instead of a ``KeyError``.

    Returns
    -------
    DataFrame with the 20 feature columns, sorted by ``TRANSPORT_TRACE``.
    """
    feature_columns = ['loadingOrder', 'timestamp', 'carrierName', 'vesselMMSI', 'TRANSPORT_TRACE', 'start_port',
                       'start_lon', 'start_lat', 'end_port', 'end_lon', 'end_lat', 'trace', 'Dir', 'start_anchor_time',
                       'end_anchor_time', 'middle_anchor_time', 'anchor_time', 'aver_speed', 'dis', 'label']
    rows = []

    for i in range(12, 14):
        print('i:{}'.format(i))
        train_df = pd.read_csv('../../dataset/train_data/train_wash{}.csv'.format(i), header=None)
        train_df.columns = ['loadingOrder', 'carrierName', 'timestamp', 'longitude',
                            'latitude', 'vesselMMSI', 'speed', 'direction', 'vesselNextport',
                            'vesselNextportETA', 'vesselStatus', 'vesselDatasource', 'TRANSPORT_TRACE']

        train_df['timestamp'] = pd.to_datetime(train_df['timestamp'], infer_datetime_format = True)
        train_df['direction'] = train_df['direction'].astype(float) / 100

        # Encode each GPS point as a grid-cell id.
        a = 0.25  # grid cell size used for the longitude/latitude binning
        lat_bin = ((train_df['latitude'] + 90) // a).astype(int)
        lon_bin = ((train_df['longitude'] + 360) // a).astype(int)
        train_df['geo_bin'] = int(360 // a) * lat_bin + lon_bin

        # Walk through the orders in order of first appearance.
        for j, (_, order_rows) in enumerate(train_df.groupby('loadingOrder', sort=False)):
            order_data = order_rows.reset_index(drop = True)

            # Route of this order.
            trace = order_data.loc[0, 'TRANSPORT_TRACE'].split('-')
            print('\nnum{}:{}'.format(j, order_data.loc[0, 'TRANSPORT_TRACE']))

            # Cluster numbers and coordinates of the first and last port.
            start_port = trace[0]
            end_port = trace[-1]
            start_port_num, start_lon, start_lat = get_port_num(start_port)
            end_port_num, end_lon, end_lat = get_port_num(end_port)
            trace = str(start_port_num) + '-' + str(end_port_num)
            print('start_port:{}\tend_port:{}'.format(start_port_num, end_port_num))

            # Waiting times; the returned frame has the end-port records removed.
            order_data, start_anchor_time, end_anchor_time, middle_anchor_time,\
            anchor_time = get_jam_info(order_data, start_lon, start_lat, end_lon, end_lat)
            if (len(order_data) < 1):
                continue

            # The label must be fixed before de-duplication, which drops records.
            order_data['label'] = (order_data.loc[len(order_data)-1, 'timestamp']- order_data.loc[0, 'timestamp']).total_seconds()/3600

            # Route length over one record per grid cell.
            order_data = order_data.drop_duplicates(['loadingOrder', 'geo_bin'], keep='last').reset_index(drop = True)
            dis = get_distance(order_data)

            # Mean of the direction column.
            Dir = np.mean(np.array(order_data.loc[:, 'direction']))

            # Assemble the feature row.
            rows.append(order_feature(order_data, start_port_num, start_lon, start_lat, end_port_num, end_lon, end_lat,
                                      trace, Dir, start_anchor_time, end_anchor_time, middle_anchor_time, anchor_time, dis))
        print('running file{}: {} seconds'.format(i, time.process_time() - start))

    # Every row is a Series named 0, so rebuild a clean 0..n-1 index
    # (what append(..., ignore_index=True) used to produce).
    order_info = pd.DataFrame(rows, columns=feature_columns).reset_index(drop = True)

    order_info = order_info.sort_values(['TRANSPORT_TRACE'], ascending = True)
    return order_info

# Compute the features for the training files and merge them into the
# existing order table.
order_info = get_order_info()
order_df1 = pd.read_csv('../../dataset/order_data/B_order_info.csv')

# pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
order_df = pd.concat([order_df1, order_info], ignore_index = True)

# Keep only the columns of the existing table, in its order.
order_df = order_df[order_df1.columns]

# The existing rows come first, so on a duplicated loadingOrder they win
# (drop_duplicates keeps the first occurrence).
order_df = order_df.drop_duplicates(['loadingOrder']).reset_index(drop = True)

order_df.to_csv('../../dataset/order_data/order_info_v2.csv', index = False)
print('running time: {} seconds'.format(time.process_time() - start))

i:12

num0:CNSHK-SGSIN
start_port:771	end_port:726

num1:CNSHK-SGSIN
start_port:771	end_port:726

num2:CNSHK-KRINC
start_port:771	end_port:829

num3:CNYTN-MXZLO
start_port:771	end_port:63

num4:CNSHK-IDJKT
start_port:771	end_port:743

num5:CNSHK-SGSIN
start_port:771	end_port:726

num6:CNSHK-SGSIN
start_port:771	end_port:726

num7:CNDCB-SGSIN
start_port:771	end_port:726

num8:CNDCB-SGSIN
start_port:771	end_port:726

num9:CNYTN-PAONX
start_port:771	end_port:175

num10:CNSHK-CLVAP
start_port:771	end_port:217

num11:CNYTN-BDCGP
start_port:771	end_port:705

num12:CNYTN-SGSIN-MTMLA-DZALG
start_port:771	end_port:346

num13:CNSHK-PKQCT
start_port:771	end_port:657

num14:CNYTN-PAONX
start_port:771	end_port:175

num15:CNYTN-MXZLO
start_port:771	end_port:63

num16:CNSHK-SGSIN
start_port:771	end_port:726

num17:CNYTN-GBFXT
start_port:771	end_port:338

num18:CNSHK-SGSIN
start_port:771	end_port:726

num19:CNSHK-SGSIN
start_port:771	end_port:726

num20:CNYTN-NZAKL
start_port:771	end_port:876

num21:C

start_port:769	end_port:642

num18:CNSHK-TRYAR
start_port:771	end_port:519

num19:CNYTN-CAVAN
start_port:771	end_port:19

num20:CNSHK-IDJKT
start_port:771	end_port:743

num21:CNYTN-VNVUT-SGSIN-FRLEH
start_port:771	end_port:333

num22:CNSHK-ESVAL
start_port:771	end_port:329

num23:CNYTN-NZAKL
start_port:771	end_port:876

num24:CNYTN-BRSSZ
start_port:771	end_port:279

num25:CNSHK-THLCH
start_port:771	end_port:715

num26:CNYTN-GRPIR
start_port:771	end_port:489

num27:CNYTN-NZAKL
start_port:771	end_port:876

num28:CNYTN-VNVUT-SGSIN-FRLEH
start_port:771	end_port:333

num29:CNYTN-GBFXT
start_port:771	end_port:338

num30:CNYTN-CAVAN
start_port:771	end_port:19

num31:CNYTN-GBFXT
start_port:771	end_port:338

num32:CNSHK-EGPSD
start_port:771	end_port:539

num33:CNSHK-CLSAI
start_port:771	end_port:218

num34:CNSHK-IDJKT
start_port:771	end_port:743

num35:CNSHK-TRYAR
start_port:771	end_port:519

num36:CNYTN-ESVAL
start_port:771	end_port:329

num37:CNSHK-KRINC
start_port:771	end_port:829

num38:CNY