# All features

Bring all features together into a single dataset

In [1]:
# base libraries
import pandas as pd
import numpy as np
import math
import os
import json

In [2]:
# set variable from config file
config_path = os.path.abspath('..')

with open(config_path + '/config.json', 'r') as f:
    config = json.load(f)

processing_path = config['DEFAULT']['processing_path']
shipping_rot_filename = config['DEFAULT']['shipping_rot_filename']
segment_filename = config['DEFAULT']['segment_filename']
port_loading_filename = config['DEFAULT']['port_loading_filename']
seasonality_filename = config['DEFAULT']['seasonality_filename']
delay_filename = config['DEFAULT']['delay_filename']
weather_data_filename = config['DEFAULT']['weather_data_filename']
ship_dynamics_filename = config['DEFAULT']['ship_dynamics_filename']
features_filename = config['DEFAULT']['features_filename']

In [3]:
# import data
dtype_dic = {'MMSI':int,'dt':'str', 'lat':'float', 'long':'float','SOG':'float', 'rot':'float', 
             'Type':'str', 'gross_tonnage':'float','vessel_name':'str', 'ETA':'str', 'POC_LOCODE':'str',
             'last_port_LOCODE':'str', 'next_port_LOCODE':'str', 'status':'str','voyage_id':'float','tripid':int,
            'in_hazmat':'str','out_hazmat':'str'}
parse_dates = ['dt', 'ETA']

shipping_data = pd.read_csv(processing_path + shipping_rot_filename,header = 0,delimiter = ',',dtype = dtype_dic, parse_dates=parse_dates)

## Merge delays data

In [4]:
# import delays data
delays = pd.read_csv(processing_path + delay_filename, header = 0,delimiter = ',')
delays['dt'] = pd.to_datetime(delays['dt'])

In [5]:
shipping_data = shipping_data.merge(delays[['MMSI','dt','arrivalDelay','arrivalDelayMin','delay15','delay30',
                                            'delay60','delay90','delay120','previous_delays','ETA_new']], how = 'inner', 
                                    on = ['MMSI','dt'])

## Ship Type

In [6]:
shipping_data['Type'] = shipping_data['Type'].map({'tug':'tug', 'container ship':'container', 
                                                   'pilot':'pilot', 'ro-ro cargo ship':'cargo_ship', 
                                                   'hopper dredger':'dredger', 
                                                   'general cargo ship':'general_cargo', 
                                                   'passenger/ro-ro cargo ship':'passenger', 
                                                   'work/repair vessel':'work_vessel', 'sar':'sar', 
                                                   'unknown type':'unknown'})

# create one hot encoding for each ship type
one_hot = pd.get_dummies(shipping_data['Type'],prefix = 'type')

In [7]:
# merge dummy fields back on
shipping_data = shipping_data.merge(one_hot, left_index = True,right_index = True, how = 'inner') 

## Segments

In [8]:
segments = pd.read_csv(processing_path + segment_filename, header = 0,delimiter = ',')
segments['dt'] = pd.to_datetime(segments['dt'])

In [9]:
segments['seg_desc'] = segments['segment'].map({0:'Trans_general', 1:'Docking_terminal',2:'Docking_mid', 3:'Docked',
                                               4:'Trans_border', 5:'Docking_inital'})

# create one hot encoding for each segment
one_hot_seg = pd.get_dummies(segments['seg_desc'],prefix = 'seg')

In [10]:
# merge dummy fields back on
shipping_data = shipping_data.merge(one_hot_seg, left_index = True,right_index = True, how = 'inner') 

## Seasonality

In [11]:
seasonality = pd.read_csv(processing_path + seasonality_filename, header = 0,delimiter = ',')
seasonality['dt'] = pd.to_datetime(seasonality['dt'])

In [12]:
shipping_data = shipping_data.merge(seasonality, how = 'inner', on = ['MMSI','dt'])

## Merge port loading data

In [13]:
port_loading = pd.read_csv(processing_path + port_loading_filename, header = 0,delimiter = ',')
port_loading['start_window'] = pd.to_datetime(port_loading['start_window'])
port_loading['end_window'] = pd.to_datetime(port_loading['end_window'])

In [14]:
shipping_data['start_window'] = shipping_data['dt'].dt.floor('1min')
shipping_data['end_window'] = shipping_data['dt'].dt.ceil('1min')

In [15]:
shipping_data = shipping_data.merge(port_loading, on = ['start_window','end_window'], how = 'inner')

## Merge weather data

In [16]:
weather = pd.read_csv(processing_path + weather_data_filename, header = 0,delimiter = ',')
weather['YEARMODA'] = pd.to_datetime(weather['YEARMODA'])
weather['YEARMODA'] = weather['YEARMODA'].dt.date

In [17]:
# give some more meaningful names
rename_dict = {'TEMP':'temp',
               'DEWP':'dew_point',
               'SLP':'sea_lev_pressure',
               'STP':'station_pressure',
               'VISIB':'visibility',
               'WDSP':'wind_speed',
               'MXSPD':'max_wind_speed',
               'MAX':'max_temp',
               'MIN':'min_temp',
               'FRSHTT':'fog_rain_driz_snow_ice'}
weather.rename(columns = rename_dict, inplace = True)
weather.drop(['STN'],axis = 1,inplace = True)

In [18]:
# add the merge key to the features dataset
shipping_data['YEARMODA'] = shipping_data['dt'].dt.date

In [19]:
shipping_data = shipping_data.merge(weather, on = 'YEARMODA', how = 'inner')

## Ship dynamics

In [20]:
ship_dynamics = pd.read_csv(processing_path + ship_dynamics_filename, header = 0,delimiter = ',')
ship_dynamics['dt'] = pd.to_datetime(ship_dynamics['dt'])

In [21]:
shipping_data = shipping_data.merge(ship_dynamics[['MMSI','dt','accel','port_dist']], how = 'left', on = ['MMSI','dt'])

## Export Data

In [22]:
shipping_data.to_csv(processing_path + features_filename,header=True,index=False,sep=',')

In [23]:
shipping_data.describe()

Unnamed: 0,MMSI,SOG,gross_tonnage,lat,long,voyage_id,tripid,rot,arrivalDelayMin,previous_delays,...,sea_lev_pressure,station_pressure,visibility,wind_speed,max_wind_speed,max_temp,min_temp,fog_rain_driz_snow_ice,accel,port_dist
count,1131759.0,1131759.0,1131759.0,1131759.0,1131759.0,1131759.0,1131759.0,1128591.0,1131759.0,561732.0,...,1125808.0,1125808.0,1125808.0,1125808.0,1125808.0,1125808.0,1125808.0,1125808.0,1130161.0,1003960.0
mean,376310300.0,1.390241,48585.76,51.95493,1.30166,1088642.0,27.17542,5.388764,98.5513,2.762917,...,1019.017,1018.771,34.5048,8.663079,13.50797,14.62448,7.401772,15977.55,0.0001286545,1771.862
std,166618300.0,3.461391,58401.9,0.006938276,0.008656976,36079.77,64.7306,7.714307,1028.975,4.38449,...,9.430611,9.434266,138.0212,2.910903,3.776399,6.52729,5.751391,31209.08,0.1166457,2673.516
min,209322000.0,0.0,0.0,51.93531,1.281933,1001116.0,1.0,0.0,-12360.0,0.0,...,988.2,987.8,0.3,1.8,5.1,1.0,-5.0,0.0,-14.6,0.0
25%,229928000.0,0.0,6326.0,51.94953,1.294683,1064553.0,4.0,0.2330485,-57.0,0.0,...,1014.0,1013.7,9.4,6.5,11.1,10.0,3.0,0.0,0.0,250.621
50%,351819000.0,0.0,24196.0,51.95705,1.302205,1090350.0,8.0,1.930199,11.0,1.0,...,1019.3,1019.0,16.4,8.2,13.0,14.0,7.0,10000.0,0.0,605.919
75%,563108000.0,1.4,72884.0,51.96029,1.30936,1116384.0,20.0,7.910323,126.0,3.0,...,1025.3,1025.0,20.9,10.2,15.9,21.0,12.0,10000.0,0.0,2187.71
max,636092600.0,102.3,210000.0,51.96582,1.32085,1163320.0,431.0,346.9213,43025.0,24.0,...,1043.3,1043.0,999.9,20.3,30.9,30.0,19.0,110000.0,12.7875,13367.53
