In [None]:
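# Generates the chunk_data files.
# Splits the data into 10 chunks by order number (loadingOrder), after:
# (1) dropping records whose direction is -1;
# (2) de-duplicating records that share the same loadingOrder, carrierName,
#     timestamp and vesselMMSI.
# Each chunk also has vesselStatus / vesselDatasource mapped to integer codes
# before it is written out.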
import pandas as pd
import time
import warnings
warnings.filterwarnings('ignore')

# Start the CPU-time clock; the elapsed value is printed at the very end of the script.
start = time.process_time()

# Location of the raw training data and the column names applied after loading.
data_path = '../../dataset/raw_data/train0711.csv'
COLUMNS = [
    "loadingOrder", "carrierName", "timestamp", "longitude", "latitude",
    "vesselMMSI", "speed", "direction", "vesselNextPort", "vesselNextportETA",
    "vesselStatus", "vesselDatasource", "transport_trace",
]

# NOTE(review): read_csv runs with its default header handling, so the first
# line of the file is consumed as a header and then overwritten by COLUMNS.
# If the raw file has no header row, that first record is lost - confirm
# against the data.
df = pd.read_csv(data_path)
df.columns = COLUMNS

# A missing transport trace is represented by the string '-1'.
df['transport_trace'] = df['transport_trace'].fillna('-1')

# Order the records by order id, then by timestamp within each order.
df = df.sort_values(by=['loadingOrder', 'timestamp'])

# Keep only the records whose direction is not -1.
df = df[df['direction'].ne(-1)]

# De-duplicate: keep the first record for each
# (loadingOrder, carrierName, timestamp, vesselMMSI) combination.
df = df.drop_duplicates(subset=["loadingOrder", "carrierName", "timestamp", "vesselMMSI"])

# Distinct order ids, in order of first appearance in the cleaned frame.
orders = df["loadingOrder"].unique().tolist()

# Split the orders into n_chunk groups and write one cleaned CSV file per group.
# Every record of an order lands in the same file because the split is done on
# the order ids, not on the rows.
n_chunk = 10
# Integer division: if there are fewer orders than n_chunk this is 0, so every
# file except the last one is written empty.
chunk_len = len(orders) // n_chunk

# Integer codes for the two textual columns. They do not depend on the chunk,
# so they are built once instead of on every iteration.
status_codes = {
    "moored": 0,
    "under way using engine": 1,
    "not under command": 2,
    "at anchor": 3,
    "under way sailing": 4,
    "constrained by her draught": 5,
}
status_missing = 9  # code given to records whose vesselStatus is missing
datasource_codes = {"Coastal AIS": 0, "Satellite": 1}
valid_status = list(status_codes.values()) + [status_missing]
valid_datasource = list(datasource_codes.values())

for chunk_index in range(n_chunk):
    print('chunk{}'.format(chunk_index))

    # The last chunk has an open upper bound so it also takes the remainder
    # left over by the integer division above.
    lower = chunk_index * chunk_len
    upper = lower + chunk_len if chunk_index < n_chunk - 1 else None
    file_order = orders[lower:upper]
    file_name = '../../dataset/chunk_data/train_data{}.csv'.format(chunk_index)

    chunk_data = df.loc[df["loadingOrder"].isin(file_order)]

    # Map the textual columns to their integer codes. Values that are not in
    # the tables are left as they are and removed by the mask below; a missing
    # vesselStatus becomes status_missing, a missing vesselDatasource is dropped.
    status = chunk_data["vesselStatus"].replace(status_codes).fillna(status_missing)
    datasource = chunk_data["vesselDatasource"].replace(datasource_codes)

    # Keep only rows where both columns hold a known code, and take an explicit
    # copy so the assignments below write to an independent frame rather than
    # to a selection derived from df (which pandas flags as chained assignment).
    keep = status.isin(valid_status) & datasource.isin(valid_datasource)
    chunk_data = chunk_data.loc[keep].copy()
    chunk_data["vesselStatus"] = status.loc[keep].astype(int)
    chunk_data["vesselDatasource"] = datasource.loc[keep].astype(int)

    # Written without index and without a header row.
    chunk_data.to_csv(file_name, index=False, header=False)
    print('chunk data num:{}'.format(len(chunk_data)))

# process_time() measures CPU time of this process, counted from `start`.
print('running time:{} secondes'.format(time.process_time() - start))