# In [None]:
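# Generates the training set train_data (train_data0.csv-train_data10.csv)
# (1) Drop orders whose track has too large a gap between two consecutive points: latitude/longitude differences must stay within 15; in the special case (180 -> -180, -180 -> 180) the difference is greater than 300
# (2) Check that each order's track matches the origin and destination of its route (distance under 50 km)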

import pandas as pd
import numpy as np
import time
import warnings
warnings.filterwarnings('ignore')

# Location of the port table that is loaded into port_df below.
port_data_path = '../../dataset/raw_data/port.csv'

# CPU-time reference point; the timing messages printed by the processing loop
# report time.process_time() relative to this value, so it is taken before any
# file is read.
start = time.process_time()
# Port table. check_position reads its TRANS_NODE_NAME, LONGITUDE and LATITUDE columns.
port_df = pd.read_csv(port_data_path)

# --------- Drop orders with too large a gap between two consecutive track points (longitude difference: 15; latitude difference: 15) --------- #
def clean_data(raw_data):
    """Drop every order whose track jumps too far between consecutive records.

    Records are grouped by ``loadingOrder`` and the absolute difference of
    ``latitude`` and ``longitude`` between consecutive rows of each order is
    computed. An order is kept only when

    * its largest latitude step is below 15, and
    * every longitude step is either below 15 or above 300. A step above 300
      is treated as a crossing of the +/-180 meridian (180 -> -180 or
      -180 -> 180), not as a real jump.

    The crossing exception is applied per step. Applying it to the per-order
    maximum would let a single meridian crossing hide any other oversized
    longitude jump in the same order.

    Orders with no comparable consecutive records (for example a single row)
    are dropped, because their maximum step is undefined.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain the columns ``loadingOrder``, ``latitude`` and
        ``longitude``. Rows are assumed to be in track order within each
        order -- TODO confirm against the data producer. The frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        The rows of the surviving orders with the same columns as the input.
        Rows without a ``loadingOrder`` are dropped and the index is
        renumbered before filtering.
    """
    # Work on a private, renumbered copy so the caller's frame never gains
    # helper columns. Rows without an order id cannot be grouped.
    frame = raw_data.dropna(subset=['loadingOrder']).reset_index(drop=True)
    by_order = frame.groupby('loadingOrder')

    # Absolute step between consecutive records of the same order; the first
    # record of each order has no predecessor and yields NaN.
    lat_diff = by_order['latitude'].diff(1).abs()
    lon_diff = by_order['longitude'].diff(1).abs()

    steps = pd.DataFrame({
        'loadingOrder': frame['loadingOrder'],
        'lat_diff': lat_diff,
        'lon_diff': lon_diff,
        # Longitude steps with meridian crossings (> 300) blanked out, so the
        # maximum below reflects only genuine movement.
        'lon_diff_plain': lon_diff.where(lon_diff <= 300),
    })
    stat_cols = ['lat_diff', 'lon_diff', 'lon_diff_plain']
    worst = steps.groupby('loadingOrder')[stat_cols].transform('max')

    keep = (
        (worst['lat_diff'] < 15)
        # At least one comparable longitude step must exist.
        & worst['lon_diff'].notna()
        # No non-crossing longitude step may reach 15 (NaN means "none exist").
        & ~(worst['lon_diff_plain'] >= 15)
    )
    return frame.loc[keep]

# -------------------------- Do the origin and destination ports match the track? ------------------------------- #
# Check that the origin and destination ports are correct
def check_OD(train_df):
    """Return the orders whose track starts and ends at the ports of their route.

    For each order the ``TRANSPORT_TRACE`` of its first row is split on ``'-'``.
    The first element is taken as the origin port and the last as the
    destination port. The first row's position is checked against the origin
    port and the last row's position against the destination port with
    ``check_position``. The order is accepted only when both checks succeed.

    Orders whose trace is not a string (the ``-1`` placeholder or a missing
    value) are skipped instead of raising.

    Progress and the trace of every order are printed while running.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain the columns ``loadingOrder``, ``TRANSPORT_TRACE``,
        ``longitude`` and ``latitude``. Rows are assumed to be in track order
        within each order -- TODO confirm against the data producer.

    Returns
    -------
    list
        The accepted ``loadingOrder`` values, in order of first appearance.
    """
    wash_Orders = []

    # One pass over the frame; sort=False keeps first-appearance order, which
    # is the order unique() would give.
    grouped = train_df.groupby('loadingOrder', sort=False)
    total = grouped.ngroups

    for i, (order, order_data) in enumerate(grouped):
        print('\n{}/{}'.format(i, total))

        raw_trace = order_data['TRANSPORT_TRACE'].iloc[0]
        print(raw_trace)

        # Only a textual trace can be split into ports; -1 and NaN cannot.
        if not isinstance(raw_trace, str):
            continue

        trace = raw_trace.split('-')
        trace_origin_port = trace[0]
        trace_desti_port = trace[-1]

        # First record of the track: longitude / latitude.
        trace_origin_lon = order_data['longitude'].iloc[0]
        trace_origin_lat = order_data['latitude'].iloc[0]

        # Last record of the track.
        trace_desti_lon = order_data['longitude'].iloc[-1]
        trace_desti_lat = order_data['latitude'].iloc[-1]

        flag1 = check_position(trace_origin_port, trace_origin_lon, trace_origin_lat)
        flag2 = check_position(trace_desti_port, trace_desti_lon, trace_desti_lat)

        if flag1 and flag2:
            wash_Orders.append(order)

    return wash_Orders

def check_position(port, LonA, LatA):
    """Tell whether a position lies within 50 km of the named port.

    The port table ``port_df`` is scanned in row order. The first row whose
    ``TRANS_NODE_NAME`` contains ``port`` as a substring (an exact match is a
    special case of this) decides the result. Later matching rows are not
    consulted.

    Parameters
    ----------
    port : str
        Port name taken from a transport trace.
    LonA, LatA : float
        Longitude and latitude of the position to test.

    Returns
    -------
    bool
        ``True`` when the distance to the first matching port is below 50 km.
        ``False`` when it is not, when no port matches, or when ``port`` is
        empty or not a string.
    """
    # An empty name is a substring of every port name and would silently
    # match the first table row, so reject it up front.
    if not isinstance(port, str) or not port:
        return False

    candidates = zip(
        port_df['TRANS_NODE_NAME'],
        port_df['LONGITUDE'],
        port_df['LATITUDE'],
    )
    for node_name, port_lon, port_lat in candidates:
        # Missing node names are read as NaN and cannot contain anything.
        if not isinstance(node_name, str) or port not in node_name:
            continue
        dis = distance(LonA, LatA, port_lon, port_lat)
        return bool(dis < 50)

    # No port in the table matched the requested name.
    return False


def distance(LonA, LatA, LonB, LatB):
    """Great-circle distance in kilometres between two lon/lat points (degrees).

    Uses the haversine formula with an Earth radius of 6378.137 km and rounds
    the result to two decimal places.
    """
    radius_km = 6378.137  # kilometres

    # Degrees -> radians, one coordinate at a time.
    lat_a = LatA * np.pi / 180.0
    lat_b = LatB * np.pi / 180.0
    lon_a = LonA * np.pi / 180.0
    lon_b = LonB * np.pi / 180.0

    d_lat = lat_a - lat_b
    d_lon = lon_a - lon_b

    # Haversine term: sin^2(dlat/2) + cos(latA) * cos(latB) * sin^2(dlon/2)
    lat_term = np.power(np.sin(d_lat / 2), 2)
    lon_term = np.cos(lat_a) * np.cos(lat_b) * np.power(np.sin(d_lon / 2), 2)
    central_angle = 2 * np.arcsin(np.sqrt(lat_term + lon_term))

    kilometres = central_angle * radius_km

    # Keep two decimal places.
    return np.round(kilometres * 100) / 100


# Process chunk files 0..9 one at a time: load, drop orders with oversized
# jumps, keep only orders whose track matches the route ports, write the result.
for i in range(10):
    print('i:{}'.format(i))

    chunk_path = '../../dataset/chunk_data/train_data{}.csv'.format(i)
    raw_data = pd.read_csv(chunk_path, header=None)
    # The chunk files carry no header row, so the column names are assigned here.
    raw_data.columns = [
        'loadingOrder', 'carrierName', 'timestamp', 'longitude',
        'latitude', 'vesselMMSI', 'speed', 'direction', 'vesselNextport',
        'vesselNextportETA', 'vesselStatus', 'vesselDatasource', 'TRANSPORT_TRACE',
    ]
    print('1: {}个快递运单'.format(raw_data['loadingOrder'].nunique()))

    # Step 1: remove orders with too large a gap between consecutive points.
    raw_data = clean_data(raw_data)
    print('2: {}个快递运单'.format(raw_data['loadingOrder'].nunique()))

    # Step 2: keep only orders whose track matches the route's origin and
    # destination (distance under 50 km).
    wash_Orders = check_OD(raw_data)
    matches_route = raw_data['loadingOrder'].isin(wash_Orders)
    raw_data = raw_data.loc[matches_route]
    print('3: {}个快递运单'.format(raw_data['loadingOrder'].nunique()))
    print('raw_data\n%s'%(raw_data))

    # Written without index and without a header row, like the input chunks.
    file_name = '../../dataset/train_data_new/train_wash{}.csv'.format(i)
    raw_data.to_csv(file_name, index=False, header=None)

    # CPU time measured from the script-level start, i.e. cumulative.
    print('process time: {} seconds'.format(time.process_time() - start))

print('total running time: {} seconds'.format(time.process_time() - start))