In [2]:
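# Generate train_data11.csv.
# NOTE(review): the script below actually writes train_wash10.csv - confirm which file name is intended.
# Restore the data that went missing after data cleaning.
# Relaxed cleaning rule: an order only needs its track to match the origin and destination ports of its route,
# i.e. start/end within 500 km of them (usually under 200 km; 500 km covers inland ports such as Budapest).
# Step 2: add the orders to the training set.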

import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')

# Port reference table; the functions below read its TRANS_NODE_NAME, LONGITUDE and LATITUDE columns.
port_data_path = '../../../dataset/test_data/port.csv'
# Raw records of the additional orders that this script cleans (loaded into raw_data further down).
data_path = '../../../dataset/train_data/add_data1.csv'

# CPU-time reference point for the "total running time" printed at the end of the script.
start = time.process_time()
port_df = pd.read_csv(port_data_path)

# -------------------------- 出发港和目的港是否和路径匹配 ------------------------------- #
# 检查出发港和目的港是否正确
def check_OD(train_df, max_port_distance_km=500):
    """Keep only the orders whose track starts and ends near the ports of their route.

    For every ``loadingOrder`` the first row's ``TRANSPORT_TRACE`` (port codes
    joined by ``-``) gives the origin and destination ports.  The order is
    kept when its first recorded position lies within
    ``max_port_distance_km`` of the origin port and its last recorded position
    lies within the same distance of the destination port.

    Orders are skipped when the trace is not a string (``-1`` marks an order
    without a route; a missing value cannot be split either) or when one of
    the two ports cannot be found by ``get_port_position``.

    Args:
        train_df: records with at least the columns ``loadingOrder``,
            ``timestamp``, ``longitude``, ``latitude`` and ``TRANSPORT_TRACE``.
        max_port_distance_km: largest accepted distance, in kilometres,
            between a track end point and its port.

    Returns:
        A DataFrame with the columns of ``train_df`` holding the rows of the
        accepted orders, sorted by ``loadingOrder`` and ``timestamp``.  It is
        empty (same columns) when no order is accepted.
    """
    kept_orders = []

    orders = train_df.loadingOrder.unique()
    for i, order in enumerate(orders):
        order_data = train_df.loc[train_df['loadingOrder'] == order].reset_index(drop=True)
        trace = order_data.loc[0, 'TRANSPORT_TRACE']
        print('\n{}/{}\t{}'.format(i, len(orders), order))
        print(trace)

        if not isinstance(trace, str):
            continue

        ports = trace.split('-')
        start_position = get_port_position(ports[0])
        end_position = get_port_position(ports[-1])
        if start_position is None or end_position is None:
            # The port table has no entry for one of the ports, so the
            # track cannot be validated against it.
            continue
        start_lon, start_lat = start_position
        end_lon, end_lat = end_position

        # NOTE: truncating the track after it passes the destination port is
        # currently disabled:
        #     order_data = wash_order(order_data, end_lon, end_lat)

        trace_start_lon = order_data['longitude'].iloc[0]
        trace_start_lat = order_data['latitude'].iloc[0]
        trace_end_lon = order_data['longitude'].iloc[-1]
        trace_end_lat = order_data['latitude'].iloc[-1]

        start_dis = distance(start_lon, start_lat, trace_start_lon, trace_start_lat)
        end_dis = distance(end_lon, end_lat, trace_end_lon, trace_end_lat)
        print('start_dis:{}\tend_dis{}'.format(start_dis, end_dis))

        if (start_dis < max_port_distance_km) and (end_dis < max_port_distance_km):
            print('True')
            kept_orders.append(order_data)

    # DataFrame.append was removed in pandas 2.0; collecting the pieces and
    # concatenating once is the supported (and linear-time) replacement.
    if kept_orders:
        train_data = pd.concat(kept_orders, ignore_index=True)
    else:
        train_data = train_df.iloc[0:0].copy()

    train_data = train_data[train_df.columns]
    return train_data.sort_values(['loadingOrder', 'timestamp'], ascending=True)


def wash_order(df, end_lon, end_lat):
    """Cut a track off at its closest approach to the destination port.

    Args:
        df: track records with ``longitude`` and ``latitude`` columns, in
            travel order.
        end_lon: longitude of the destination port, in degrees.
        end_lat: latitude of the destination port, in degrees.

    Returns:
        The rows from the first one up to and including the first row with
        the smallest distance to the port, re-indexed from 0.  An empty
        input is returned as an empty, re-indexed frame.
    """
    if df.empty:
        # Nothing to measure; min() of an empty list would raise.
        return df.reset_index(drop=True)

    # Work on positional arrays so the result does not depend on the index
    # labels of ``df``; ``distance`` is built from NumPy ufuncs and therefore
    # handles whole columns at once.
    lons = df['longitude'].to_numpy(dtype=float)
    lats = df['latitude'].to_numpy(dtype=float)
    dis_list = distance(lons, lats, end_lon, end_lat).tolist()

    # Position of the first occurrence of the smallest distance.
    nearest = dis_list.index(min(dis_list))
    return df.iloc[:nearest + 1, :].reset_index(drop=True)

def get_port_position(port):
    """Look up the coordinates of a port in the module-level ``port_df`` table.

    A row matches when ``port`` occurs as a substring of its
    ``TRANS_NODE_NAME`` (an exact match is a special case of that).  Rows
    whose name is missing never match.

    Args:
        port: port code to search for.

    Returns:
        ``(longitude, latitude)`` taken from the ``LONGITUDE`` and
        ``LATITUDE`` columns of the first matching row, or ``None`` when no
        row matches.
    """
    names = port_df['TRANS_NODE_NAME']
    # regex=False keeps this a literal substring test; na=False makes missing
    # names count as "no match" instead of raising.
    matches = names.str.contains(port, regex=False, na=False).to_numpy(dtype=bool)

    hits = np.flatnonzero(matches)
    if hits.size == 0:
        return None

    first = hits[0]
    return port_df['LONGITUDE'].iloc[first], port_df['LATITUDE'].iloc[first]

            
def distance(LonA, LatA, LonB, LatB):
    """Return the haversine distance between points A and B in kilometres.

    Coordinates are given in degrees.  The result is computed on a sphere of
    radius 6378.137 km and rounded to two decimal places.
    """
    EARTH_RADIUS = 6378.137  # kilometres

    # Degrees -> radians.
    lat_a = LatA * np.pi / 180.0
    lat_b = LatB * np.pi / 180.0
    lon_a = LonA * np.pi / 180.0
    lon_b = LonB * np.pi / 180.0

    lat_diff = lat_a - lat_b
    lon_diff = lon_a - lon_b

    # Haversine term: sin^2(dlat/2) + cos(latA) * cos(latB) * sin^2(dlon/2)
    lat_term = np.power(np.sin(lat_diff / 2), 2)
    lon_term = np.cos(lat_a) * np.cos(lat_b) * np.power(np.sin(lon_diff / 2), 2)
    central_angle = 2 * np.arcsin(np.sqrt(lat_term + lon_term))

    kilometres = central_angle * EARTH_RADIUS

    # Keep two decimal places.
    return np.round(kilometres * 100) / 100


# 清洗换船数据
def process_raw(df):
    """Drop the records an order produced after it switched vessels.

    For every ``loadingOrder`` only the rows whose ``vesselMMSI`` equals the
    one on the order's first row are kept.  Orders on the route
    ``'CNSHK-BZBZE'`` are the exception: all of their rows are relabelled
    with the first row's ``vesselMMSI`` and therefore kept in full.

    Args:
        df: records with at least the columns ``loadingOrder``,
            ``timestamp``, ``vesselMMSI`` and ``TRANSPORT_TRACE``.

    Returns:
        A DataFrame with the columns of ``df`` sorted by ``loadingOrder`` and
        ``timestamp``; empty (same columns) when ``df`` has no rows.
    """
    kept_orders = []
    for order in df.loadingOrder.unique():
        order_df = df[df['loadingOrder'] == order].reset_index(drop=True)
        if order_df.empty:
            # A missing order id never compares equal to itself, so it
            # selects no rows; there is nothing to keep for it.
            continue

        start_vessel = order_df.loc[0, 'vesselMMSI']
        if order_df.loc[0, 'TRANSPORT_TRACE'] == 'CNSHK-BZBZE':
            print('sss')
            order_df['vesselMMSI'] = start_vessel

        kept_orders.append(order_df[order_df['vesselMMSI'] == start_vessel])

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    if kept_orders:
        train_df = pd.concat(kept_orders, ignore_index=True)
    else:
        train_df = df.iloc[0:0].copy()

    train_df = train_df[df.columns]
    return train_df.sort_values(['loadingOrder', 'timestamp'], ascending=True)


# 有些同一条船行驶轨迹，可以由几个订单拼接成相对完整的订单
def joint_order(df):
    """Stitch hand-picked partial orders of the same voyage into single orders.

    Each entry of the merge plan names the order id the merged track is
    filed under, the order ids whose rows are pulled from ``df``, and whether
    every row should also take the ``vesselMMSI`` of the group's first row.

    Args:
        df: records with at least the columns ``loadingOrder`` and
            ``vesselMMSI``.  It is not modified.

    Returns:
        The selected rows, relabelled and concatenated in plan order.  Groups
        with no rows in ``df`` are skipped; when nothing matches, an empty
        frame with the columns of ``df`` is returned.
    """
    # (target order id, source order ids, unify vesselMMSI across the group)
    merge_plan = (
        # 'CNNSA-MYTPP-SGSIN-ZACPT-CGPNR-GALBV-CMKBI'
        ('BB481216170682', ['BB481216170682', 'ZM269419926061'], False),
        # 'CNYTN-HRRIJ'
        ('JG899236270837', ['JG899236270837', 'OW246246833326'], True),
        ('EP333215739038', ['EP333215739038', 'KN790971115916'], False),
        # 'CNSHK-CLSAI'
        ('YL465830871055', ['YL465830871055'], False),
    )

    parts = []
    for target, sources, unify_vessel in merge_plan:
        # .copy() so the relabelling below never touches (or warns about) df.
        part = df[df['loadingOrder'].isin(sources)].copy()
        if part.empty:
            continue

        part['loadingOrder'] = target
        if unify_vessel:
            part = part.reset_index(drop=True)
            part['vesselMMSI'] = part['vesselMMSI'].iloc[0]
        parts.append(part)

    if not parts:
        return df.iloc[0:0].copy()
    return pd.concat(parts, axis=0)


# Load the raw records of the additional orders.
raw_data = pd.read_csv(data_path)

# Build the hand-stitched orders from the untouched raw data.
joint_df = joint_order(raw_data)
print(joint_df.loadingOrder.unique())

# Keep the orders with a complete track: drop records made after a vessel
# change, then keep orders that start and end near the ports of their route.
raw_data = process_raw(raw_data)
train_data = check_OD(raw_data)

# Add the stitched orders.  DataFrame.append was removed in pandas 2.0;
# pd.concat with ignore_index=True yields the same 0..n-1 index.
train_data = pd.concat([train_data, joint_df], ignore_index=True)
train_data = train_data.sort_values(['loadingOrder', 'timestamp'], ascending=True)

print('{}个快递运单'.format(train_data.loadingOrder.nunique()))
print('{}个路由'.format(train_data.TRANSPORT_TRACE.nunique()))
print('路由{}'.format(train_data.TRANSPORT_TRACE.unique()))
# The file is written without a header row and without the index column.
train_data.to_csv('../../../dataset/train_data/train_wash10.csv', index=False, header=False)

print('total running time: {} seconds'.format(time.process_time() - start))

['BB481216170682' 'JG899236270837' 'EP333215739038' 'YL465830871055']
sss
sss

0/47	AD630764258424
CNYTN-VNVUT-SGSIN-FRLEH
start_dis:2579.45	end_dis6781.38

1/47	AN816773314999
CNYTN-MTMLA
start_dis:1.6	end_dis8.67
True

2/47	BB481216170682
CNNSA-MYTPP-SGSIN-ZACPT-CGPNR-GALBV-CMKBI
start_dis:12169.36	end_dis24.17

3/47	BM229728180245
CNNSA-MYTPP-SGSIN-ZACPT-CGPNR-GALBV-CMKBI
start_dis:12169.36	end_dis24.17

4/47	CH960460352682
CNYTN-VNVUT-SGSIN-FRLEH
start_dis:7162.96	end_dis3361.0

5/47	CV204130576045
CNYTN-VNVUT-SGSIN-FRLEH
start_dis:2.1	end_dis10887.71

6/47	EN166717805171
CNYTN-MTMLA
start_dis:1.34	end_dis8.71
True

7/47	EP333215739038
CNYTN-HRRIJ
start_dis:4849.4	end_dis1119.81

8/47	FJ182986121226
CNSHK-HKHKG-TWKHH-CNNBG-CNSHA-CNTAO-KRPUS-MXZLO-PABLB-PAMIT-COCTG-JMKIN-DOCAU
start_dis:13.81	end_dis0.85
True

9/47	FU671574873350
CNYTN-HRRIJ
start_dis:5712.33	end_dis2437.38

10/47	GG443276479484
CNSHK-HKHKG-TWKHH-CNNBG-CNSHA-CNTAO-KRPUS-MXZLO-PABLB-PAMIT-COCTG-JMKIN-DOCAU
start_dis: