# AIS Data Ingestion 

Vessel location data is ingested from the Automatic Identification System (AIS) data available from the federal [Marine Cadastre website](https://hub.marinecadastre.gov/pages/vesseltraffic), and is processed in the following steps:
- read data from csv urls 
- drop unnecessary columns
- filter to only include cargo and tanker vessels
- cast datatypes appropriately
- append to a monthly file for storage
- save monthly files to parquet

Descriptions of each column of the raw data are available at the [AIS Data Dictionary](https://coast.noaa.gov/data/marinecadastre/ais/data-dictionary.pdf).

In [None]:
#preliminaries
import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime 

#pandas display: never truncate the column list when showing a dataframe
pd.options.display.max_columns = None
#polars: turn on the string cache needed for categorical columns
pl.enable_string_cache()

In [None]:
#configuration for the ingestion run

#vessel type codes to keep: 70 up to (but not including) 90,
#which covers both the cargo and the tanker types
cargo_types = pl.arange(70, 90, eager=True)

#first and last day to ingest, written as yyyy_mm_dd
start_date, end_date = '2018_01_01', '2024_03_31'

In [None]:
#ingest data
#
#For every day between start_date and end_date: download that day's zipped
#AIS csv, keep only the vessel types listed in cargo_types, keep and rename
#the columns of interest, clean and cast them, deduplicate, and write one
#parquet file per day. If processing fails, the raw download is saved as csv
#so the day can be retried without downloading it again.
from pathlib import Path

#output locations: cleaned daily parquet files, and raw csv fallbacks for
#days whose processing failed
clean_dir = Path('../data/ais_clean')
error_dir = Path('../data/ais_processing_errors')
#create both up front - otherwise a missing folder makes the parquet write
#AND the csv fallback fail, and every downloaded day is lost
for out_dir in (clean_dir, error_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

#raw AIS column name -> pythonic name; the keys also define which columns
#are kept
column_names = {
    'MMSI':'mmsi',
    'BaseDateTime':'time',
    'LAT':'lat',
    'LON':'lon',
    'SOG':'speed',
    'COG':'course',
    'Heading':'heading',
    'Status':'status',
    'VesselName':'vessel_name',
    'VesselType':'vessel_type',
    'IMO':'imo',
    'Length':'length',
    'Width':'width',
    'Draft':'draft',
    'Cargo':'cargo'
}

#init dates
days = pl.date_range(datetime.strptime(start_date, '%Y_%m_%d'),
                  datetime.strptime(end_date, '%Y_%m_%d'),
                  eager=True)

#loop through days in range
for day in days:
    #get year
    year = day.year
    #convert day to string
    day = day.strftime('%Y_%m_%d')

    #load from url to pandas df
    try:
        day_raw = pd.read_csv(
            f'https://coast.noaa.gov/htdata/CMSP/AISDataHandler/{year}/AIS_{day}.zip',
            low_memory=False
        )
    except Exception as err:
        #best effort: report the real cause and move on to the next day.
        #Exception (not a bare except) so Ctrl-C still stops the loop.
        print(f'Invalid URL for {day} - Date may be invalid or file may not exist. '
              f'({type(err).__name__}: {err})')
        continue
    print(f'Download complete for {day}.')

    try:
        #convert to polars
        day_df = pl.DataFrame(day_raw,infer_schema_length=0)
        #process data
        day_df = (
            day_df
            #keep only cargo vessels
            .filter(pl.col('VesselType').is_in(cargo_types))
            #keep cols of interest
            .select(list(column_names))
            #give pythonic names
            .rename(column_names)
            #clean cols
            .with_columns(
                #strip IMO prefix and cast to int
                imo = pl.col('imo').str.strip_prefix('IMO').cast(pl.Int64),
                #clean course and heading: 360.0 / 511.0 are replaced by null
                course = pl.col('course').replace(360.0,None),
                heading = pl.col('heading').replace(511.0,None)
            )
            #cast
            .cast({
                'time':pl.Datetime,
                'vessel_name':pl.Categorical
            })
            #deduplicate
            .unique()
        )
        #write to parquet
        day_df.write_parquet(clean_dir / f'ais_{day}.parquet')
        print(f'{day} successfully processed and saved to parquet.')
    except (KeyboardInterrupt, SystemExit):
        #never swallow a user interrupt or interpreter exit
        raise
    except BaseException as err:
        #BaseException on purpose: a panic inside polars is raised as a
        #PanicException, which does not derive from Exception, and the raw
        #download should still be rescued in that case
        print(f'Processing failed for {day}: {type(err).__name__}: {err}')
        try:
            day_raw.to_csv(error_dir / f'ais_{day}.csv')
            print(f'WARNING: Error in processing {day} data. Raw file saved to CSV instead.' )
        except Exception as save_err:
            print(f'WARNING: Error in processing {day} data. FILE LOST. '
                  f'({type(save_err).__name__}: {save_err})')