In [15]:
import pandas as pd
import numpy as np
import datetime
from pytz import timezone

In [16]:
data_dir = r"./transport/"

In [41]:
transport = pd.read_csv(data_dir + r"transport.csv", encoding = "ISO-8859-1", sep=";")
temperatures = pd.read_csv(data_dir + r"temperatures.csv", encoding = "ISO-8859-1", sep=";")

### 1. Transport

#### a. Inspect data

In [18]:
transport.shape
# We have data on 3790 transports

(3790, 16)

In [19]:
# Show first 3 rows, 4 random samples, last 3 rows
pd.concat([transport.head(3), 
           transport.sample(8, random_state=0).sort_values(by='Transport ID'), 
           transport.tail(3)])

Unnamed: 0,Transport ID,Container ID,Container Type,Forwarder,Trucking Time Start,Ground Handler 1,Airport 1,Ground Handler 2,Airport 2,Ground Handler 3,Airport 3,Ground Handler 4,Airport 4,Ground Handler 5,Airport 5,Trucking Time End
0,5559,000-10081,770C,,24.07.2013,,,,,##,##,##,##,##,##,24.07.2013
1,5560,000-10081,770C,,26.11.2013,,,,,##,##,##,##,##,##,26.11.2013
2,5561,000-10080,770C,,24.07.2013,,,,,##,##,##,##,##,##,24.07.2013
45,5604,000-10096,770C,,25.11.2013,,,,,##,##,##,##,##,##,25.11.2013
142,5701,000-10147,770C,,11.09.2013,,,,,##,##,##,##,##,##,11.09.2013
192,5751,000-10154,770C,,27.11.2013,,,,,##,##,##,##,##,##,27.11.2013
298,5857,000-10178,770C,,11.11.2013,,,,,##,##,##,##,##,##,11.11.2013
768,6422,000-10240,770C,1524,11.08.2014 22:40 +0200,,BSL,,GRU,##,##,##,##,##,##,12.08.2014 05:50
896,6677,999-10010,Prototype,Bischof,23.01.2015,,,,,##,##,##,##,##,##,23.01.2015
2382,8116,011-10007,1500C,Yusen Logistics,01.12.2016 10:00 +0000,,,,,##,##,##,##,##,##,01.12.2016 15:00 +0100


Issues:
- Trucking time date format looks good (assume UTC is implied) 
- Initially: no time recorded. Add a flag to indicate whether time was included (otherwise the time estimates from start to end will suggest a precision that is not there)
- "##"'s to indicate missing values (replace this)

#### b. Clean-up data

_ Replace ## with np.nan's _

In [20]:
## Replace "  ##" with np.nan's
transport.replace(" ##", np.nan, inplace=True)

_ Convert datetime's _

In [21]:
## Noodling around how to Convert times to datetime objects
transp_dt_10 = transport['Trucking Time Start'][10]
transp_dt_1000 = transport['Trucking Time Start'][1000]
print(transp_dt_10)
print(transp_dt_1000)
transp_dt_10_datetime = pd.to_datetime(transp_dt_10)
transp_dt_1000_datetime = pd.to_datetime(transp_dt_1000)
print('-----------------------')
print(transp_dt_10_datetime)
print(transp_dt_1000_datetime)

26.11.2013
15.07.2015 07:00 +0200
-----------------------
2013-11-26 00:00:00
2015-07-15 05:00:00


In [22]:
## Noodling around: how to make the flag
# Different options: on length, splitting
# Chosen route for now: split on colon. Time should always be indicated by a colon, and will result in a 
# split of at least two elements
print(transp_dt_10.split(':'))
print(transp_dt_1000.split(':'))

['26.11.2013']
['15.07.2015 07', '00 +0200']


In [23]:
transport['full_timestamp_flag'] = transport['Trucking Time Start'].apply(lambda x: len(x.split(':')) >1)

In [24]:
## Conversion to datetime, UTC seems to work out-of-the-box 
transport['Trucking Time Start'] = pd.to_datetime(transport['Trucking Time Start'])
transport['Trucking Time End'] = pd.to_datetime(transport['Trucking Time End'])

_ Inspect converted DataFrame _

In [25]:
pd.concat([transport.head(3), 
           transport.sample(8, random_state=0).sort_values(by='Transport ID'), 
           transport.tail(3)])

Unnamed: 0,Transport ID,Container ID,Container Type,Forwarder,Trucking Time Start,Ground Handler 1,Airport 1,Ground Handler 2,Airport 2,Ground Handler 3,Airport 3,Ground Handler 4,Airport 4,Ground Handler 5,Airport 5,Trucking Time End,full_timestamp_flag
0,5559,000-10081,770C,,2013-07-24 00:00:00,,,,,,,,,,,2013-07-24 00:00:00,False
1,5560,000-10081,770C,,2013-11-26 00:00:00,,,,,,,,,,,2013-11-26 00:00:00,False
2,5561,000-10080,770C,,2013-07-24 00:00:00,,,,,,,,,,,2013-07-24 00:00:00,False
45,5604,000-10096,770C,,2013-11-25 00:00:00,,,,,,,,,,,2013-11-25 00:00:00,False
142,5701,000-10147,770C,,2013-11-09 00:00:00,,,,,,,,,,,2013-11-09 00:00:00,False
192,5751,000-10154,770C,,2013-11-27 00:00:00,,,,,,,,,,,2013-11-27 00:00:00,False
298,5857,000-10178,770C,,2013-11-11 00:00:00,,,,,,,,,,,2013-11-11 00:00:00,False
768,6422,000-10240,770C,1524,2014-11-08 20:40:00,,BSL,,GRU,,,,,,,2014-12-08 05:50:00,True
896,6677,999-10010,Prototype,Bischof,2015-01-23 00:00:00,,,,,,,,,,,2015-01-23 00:00:00,False
2382,8116,011-10007,1500C,Yusen Logistics,2016-01-12 10:00:00,,,,,,,,,,,2016-01-12 14:00:00,True


_ That looks good for now _

### 2. Temperatures

#### a. Inspect data

In [42]:
?temperatures.sort_values

In [43]:
# Show first 3 rows, 4 random samples, last 3 rows
pd.concat([temperatures.head(3), 
           temperatures.sample(6, random_state=0), 
           temperatures.tail(3)])

Unnamed: 0,Transport ID,Time,Temperature (Internal),Temperature (External)
0,5559.0,,,
1,5560.0,,,
2,5561.0,,,
195558,,Mon Jan 08 21:56:20 CET 2018,13.2,17.8
191072,,Mon Jan 08 00:54:19 CET 2018,6.9,3.5
64821,,Wed Jan 25 16:48:26 CET 2017,5.0,4.5
61729,,Sat Jan 28 00:50:43 CET 2017,3.1,2.6
107305,,Sun Jun 18 06:58:27 CEST 2017,21.6,19.6
159006,,Wed Oct 25 23:25:47 CEST 2017,22.6,21.6
233358,,Tue Mar 06 09:32:07 CET 2018,16.6,15.4


Issues:
- It seems (after inspection in .csv) that Transport ID is shown only once, after which the timeseries follows
- Mix of Central European Time and Central European Summer Time 

https://www.timeanddate.com/time/zones/cet

NB: CET = UTC + 1. CEST = UTC + 2

#### b. Clean-up data

In [47]:
print(temperatures.Time[2000])
print('-----------------------')
print(pd.to_datetime(temperatures.Time[2000]))

Thu Sep 18 20:26:09 CEST 2014
-----------------------
2014-09-18 18:26:09


In [51]:
?temperatures.rename

Also here, pd.to_datetime does a good job converting to UTC

- Rename Time to Time_raw (raw data) in case this is needed somehow
- New column: Time



In [57]:
temperatures.rename(columns={'Time': 'Time_raw'}, inplace=True)
temperatures['Time'] = pd.to_datetime(temperatures.Time_raw)

Fill the NaN's in Transport ID with the previous values, convert to int

In [58]:
temperatures['Transport ID'] = temperatures['Transport ID'].fillna(method='ffill').astype('int')

In [59]:
pd.concat([temperatures.head(3), temperatures.sample(4), temperatures.tail(3)])

Unnamed: 0,Transport ID,Time_raw,Temperature (Internal),Temperature (External),Time
0,5559,,,,NaT
1,5560,,,,NaT
2,5561,,,,NaT
116970,8540,Sat Jul 15 05:04:45 CEST 2017,3.9,6.3,2017-07-15 03:04:45
195318,9122,Sun Jan 07 05:57:10 CET 2018,9.3,16.9,2018-01-07 04:57:10
229728,9339,Tue Feb 06 19:11:49 CET 2018,6.5,17.2,2018-02-06 18:11:49
119289,8557,Thu Jul 27 08:20:41 CEST 2017,18.9,18.3,2017-07-27 06:20:41
233358,9351,Tue Mar 06 09:32:07 CET 2018,16.6,15.4,2018-03-06 08:32:07
233359,9351,Tue Mar 06 09:42:07 CET 2018,16.7,16.1,2018-03-06 08:42:07
233360,9351,Tue Mar 06 09:52:07 CET 2018,16.7,,2018-03-06 08:52:07


 ### <font color='red'> NB: verify that forward-fill is indeed the correct approach </font>

In [31]:
# Let's not join here. Put into DB instead. 
# df = transport.join(other=temperatures.set_index('Transport ID'), on = 'Transport ID')

In [32]:
temperatures.tail(100)

Unnamed: 0,Transport ID,Time,Temperature (Internal),Temperature (External)
233261,9351,Mon Mar 05 17:22:07 CET 2018,18.7,16.4
233262,9351,Mon Mar 05 17:32:07 CET 2018,18.6,16.2
233263,9351,Mon Mar 05 17:42:07 CET 2018,18.6,16.1
233264,9351,Mon Mar 05 17:52:07 CET 2018,18.6,15.9
233265,9351,Mon Mar 05 18:02:07 CET 2018,18.6,15.8
233266,9351,Mon Mar 05 18:12:07 CET 2018,18.6,15.7
233267,9351,Mon Mar 05 18:22:07 CET 2018,18.5,15.6
233268,9351,Mon Mar 05 18:32:07 CET 2018,18.5,15.5
233269,9351,Mon Mar 05 18:42:07 CET 2018,18.5,15.4
233270,9351,Mon Mar 05 18:52:07 CET 2018,18.5,15.3
