## Updated data

In [1]:
import pandas as pd
import numpy as np

In [2]:
#Read file and remove extra columns
df = pd.read_csv("Chelsea Bridge 2.csv", encoding= 'unicode_escape',skiprows=1)
df = df.loc[:, ['Vehicle  ID', 'Lift ID', 'Vessel(s)', 'ETA Bridge', 'Direction',
       'Email Sent', 'Advanced Notice', '2 Hours Notice?', 'Notes',
       'Start Time', 'End Time', 'Duration', 'Direction.1', 'Vessel(s).1']].copy()           

In [3]:
print(f'{df.shape} \n\n {df.dtypes}')
#An extra 641 data points were added

(4479, 14) 

 Vehicle  ID        object
Lift ID            object
Vessel(s)          object
ETA Bridge         object
Direction          object
Email Sent         object
Advanced Notice    object
2 Hours Notice?    object
Notes              object
Start Time         object
End Time           object
Duration           object
Direction.1        object
Vessel(s).1        object
dtype: object


Data Cleaning

In [5]:
df.isnull().sum

<bound method NDFrame._add_numeric_operations.<locals>.sum of       Vehicle  ID  Lift ID  Vessel(s)  ETA Bridge  Direction  Email Sent  \
0           False     True      False       False      False       False   
1           False     True      False       False      False       False   
2           False     True      False       False      False       False   
3           False     True      False       False      False       False   
4           False     True      False       False      False       False   
...           ...      ...        ...         ...        ...         ...   
4474        False    False      False       False      False       False   
4475        False    False      False       False      False       False   
4476        False    False      False       False      False       False   
4477        False    False      False       False      False       False   
4478        False    False      False       False      False       False   

      Advanced Notice  2 

The column Notes is entire useless, therefore should be dropped. Otherwise, in general, expected to drop around 500 columns. 

In [6]:
df.drop(columns=['Notes'],axis=1,inplace = True)

In [7]:
pd.set_option("display.max_rows", 100)
for col in ['Vessel(s)', 'Direction','Email Sent', 'Advanced Notice', '2 Hours Notice?', 'Duration', 'Direction.1', 'Vessel(s).1']:
    print(df[col].value_counts(),"\n")

1 TUG                             991
1 TUG / BARGE                     942
3 TUGS / TANKER                   582
3 TUG                             486
2 TUG                             153
1 TUG / 1 BARGE                   143
1 TUG / BARGE                     125
3 TUGS                            107
3 TUG / TANKER                     59
1 TUG/ BARGE                       58
2 TUGS                             40
WORK BARGE / BOAT                  33
WORK BARGE/BOAT                    27
1 TUG                              20
2 TUG / BARGE                      14
3 TUG / BARGE                      12
3 TUG                               8
3 TUG / 1 BARGE                     7
2 TUGS / BARGE                      7
3 TUGS / BARGE                      5
4 TUG                               4
2 TUGS / 2 BARGES                   4
BARGE / 3 TUGS                      4
1 TUG/BARGE                         3
2 TUGS / TANKER                     3
I TUG                               2
2 TUGS\n1 TU

The Duration column has variable incomplete data which correspond to either Start Time or End Time being null according to the Excel formula. Removing rows with start time being null because it is the target variable. And completely empty rows or column

In [8]:
#Remove Incomplete data
df = df.dropna(how='all',axis=0).copy()
remove_index = df[df['Duration']=='Incomplete Data'].index
df = df.drop(labels=remove_index, axis = 0).copy()
df

#Extra 527 after the removal

Unnamed: 0,Vehicle ID,Lift ID,Vessel(s),ETA Bridge,Direction,Email Sent,Advanced Notice,2 Hours Notice?,Start Time,End Time,Duration,Direction.1,Vessel(s).1
1,19002,,1 TUG / BARGE,5/6/2019 15:30,IN,5/6/2019 5:56,9:34,ACCEPTABLE,5/6/2019 15:12,5/6/2019 15:30,0:18,IN,Freedom / Christian Reiner / RTC 145
2,19002,,1 TUG,5/6/2019 16:30,OUT,5/6/2019 5:56,10:34,ACCEPTABLE,5/6/2019 16:02,5/6/2019 16:20,0:18,OUT,Freedom
3,19002,,1 TUG,5/7/2019 18:30,IN,5/7/2019 4:08,14:22,ACCEPTABLE,5/7/2019 17:52,5/7/2019 18:06,0:14,IN,Freedom / DBL 104 / Denali
4,19002,,1 TUG / BARGE,5/7/2019 19:00,OUT,5/7/2019 4:08,14:52,ACCEPTABLE,5/7/2019 18:25,5/7/2019 18:38,0:13,OUT,Freedom / DBL 104 / Denali
5,19003,,1 TUG / BARGE,5/7/2019 19:45,IN,5/7/2019 4:08,15:37,ACCEPTABLE,5/7/2019 18:55,5/7/2019 19:10,0:15,IN,Freedom
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4474,20641,3605,1 TUG,3/7/2022 7:15,OUT,3/7/2022 5:20,1:55,ACCEPTABLE,,,,,
4475,20641,3606,1 TUG,3/7/2022 18:10,IN,3/7/2022 17:24,0:46,TOO LATE,,,,,
4476,20641,3607,1 TUG/ BARGE,3/7/2022 18:50,OUT,3/7/2022 17:24,1:26,ACCEPTABLE,,,,,
4477,20642,3608,1 TUG/ BARGE,3/8/2022 7:25,IN,3/8/2022 5:12,2:13,ACCEPTABLE,,,,,


In [10]:
#Removing Null start time
remove_index = df[df['Start Time'].isnull()].index
df = df.drop(labels=remove_index, axis = 0).copy()
df.isnull().sum()

Vehicle  ID        506
Lift ID            692
Vessel(s)          506
ETA Bridge         506
Direction          536
Email Sent         513
Advanced Notice    515
2 Hours Notice?    515
Start Time           0
End Time             0
Duration             7
Direction.1         63
Vessel(s).1          1
dtype: int64

Furthermore, since the ETA bridge is one of the important indicator as such null values will be removed.

In [11]:
remove_index = df[df['ETA Bridge'].isnull()].index
df = df.drop(labels=remove_index, axis = 0).copy()
df.isnull().sum()

#At this point the majority of the null values and incomplete data has been removed with 2871 rows of data left. 
#However there are still some stuff left to clean such as Vessel, Direction
#But not a lot of value so not a big issue for now

Vehicle  ID          6
Lift ID            199
Vessel(s)            4
ETA Bridge           0
Direction           30
Email Sent          11
Advanced Notice      9
2 Hours Notice?      9
Start Time           0
End Time             0
Duration             0
Direction.1         25
Vessel(s).1          0
dtype: int64

In [12]:
df.shape
#Still extra 453 data rows

(3324, 13)

In [14]:
#Export the cleaned CSV 
#df.to_csv("Chelsea Bridge Final 2.csv",index = False)

Convert three times columns to datetime data type

In [10]:
df['Start Time'] = pd.to_datetime(df['Start Time'])
df['End Time'] = pd.to_datetime(df['End Time'])
df['ETA Bridge'] = pd.to_datetime(df['ETA Bridge'])

Finding the list of all vessel name

In [11]:
def unique_str(col):
    import re
    res = []
    for cell in col: 
        if type(cell) != float('NaN'):
            str = cell.lower()
            str = re.sub("[0-9]","",str) 
            for e in str.split("/"):
                e = e.strip()
                if e not in res:
                    res.append(e)
    return res

unique_str(df['Vessel(s).1'])
        

['freedom',
 'christian reiner',
 'rtc',
 'dbl',
 'denali',
 'elens bouchard',
 'b.no.',
 'vincent d. tibbestts',
 'elens bachard',
 'herald reinauer',
 'liberty',
 'great eastern',
 'first responder',
 'herald reiaver',
 'vincent d. tibbestts, jr',
 'ruth m. reinauer',
 'b no',
 'morton s. boushard jr',
 'ruth reinauer',
 "nor'easter",
 'harold a reinauer',
 'morton s. bouchard',
 'austin reinuaer',
 'vincent d tibbetls jr',
 'vincent d tibbetls',
 'fredrick basehard',
 'bn',
 'gm',
 'genesis eagle',
 'acadian',
 'harold a reinvaer',
 'harold a reinauer ii',
 'vincent d tibbets',
 'herald a. reinauer',
 'gracie m. reinauer',
 'b. no.',
 '',
 'evening star',
 'nicole leigh reinaue',
 'great easter',
 'vincent d. tibbets, jr',
 'nicole leigh reinauer',
 'harold a. reinauer ii',
 'austin reinauer',
 'harold a. reinauer',
 'christian reinvaer',
 'bouchard',
 'evening mist',
 'christian reinauer',
 'vincent d tibbets jr',
 'new england',
 'vincent d tibetts jr',
 'rhea i bouchard',
 'dean 

There a lot of spellings errors and the name for different vessels.

In [19]:
df.describe

<bound method NDFrame.describe of      Vehicle  ID Lift ID        Vessel(s)          ETA Bridge Direction  \
1          19002     NaN    1 TUG / BARGE 2019-05-06 15:30:00        IN   
2          19002     NaN            1 TUG 2019-05-06 16:30:00       OUT   
3          19002     NaN            1 TUG 2019-05-07 18:30:00        IN   
4          19002     NaN    1 TUG / BARGE 2019-05-07 19:00:00       OUT   
5          19003     NaN    1 TUG / BARGE 2019-05-07 19:45:00        IN   
...          ...     ...              ...                 ...       ...   
3622       20440    2876  3 TUGS / TANKER 2021-08-10 05:35:00       OUT   
3623       20441    2877    1 TUG / BARGE 2021-08-10 07:05:00        IN   
3624       20441    2878    1 TUG / BARGE 2021-08-10 07:40:00       OUT   
3629       20442    2880    1 TUG / BARGE 2021-08-11 20:00:00       OUT   
3631       20443    2881    1 TUG / BARGE 2021-08-11 22:30:00        IN   

           Email Sent Advanced Notice 2 Hours Notice?          St