# AIS Data Ingestion 

Vessel location data is ingested from the Automatic Identification System (AIS) data published on the federal [Marine Cadastre website](https://hub.marinecadastre.gov/pages/vesseltraffic) and is processed in the following steps:
- read data from csv urls 
- drop unnecessary columns
- filter to only include cargo vessels
- cast datatypes appropriately
- append to a monthly file for storage
- save monthly files to parquet

In [1]:
#preliminaries
import numpy as np
import pandas as pd
import polars as pl

#polars: turn on the global string cache used by Categorical columns
pl.enable_string_cache()
#pandas: put no limit on the number of columns shown when displaying a frame
pd.options.display.max_columns = None

In [2]:
#global values shared by the ingestion loop

#calendar ranges to iterate over (the end value is exclusive)
years = pl.arange(start=2015, end=2025, eager=True)
months = pl.arange(start=1, end=13, eager=True)
days = pl.arange(start=1, end=32, eager=True)

#vessel type codes kept as cargo vessels: 70 through 79
cargo_types = pl.arange(start=70, end=80, eager=True)

#empty frame that the cleaned daily data is appended to
month_df = pl.DataFrame()

In [3]:
#Download, clean and store the cargo-vessel AIS data, one parquet file per month.
#Every daily file is read from its url, filtered to cargo vessels, reduced to the
#columns of interest and cleaned; the days of a month are then combined,
#deduplicated and written to disk.
#loop through years
for year in [2024]:
    #loop through months
    for month in months:
        #start every month with an empty collection, so a monthly file only holds
        #that month's rows (one accumulator carried across months would write all
        #earlier months again into every later file)
        day_frames = []
        #loop through days
        for day in days:
            url = f'https://coast.noaa.gov/htdata/CMSP/AISDataHandler/{year}/AIS_{year}_{month:02d}_{day:02d}.zip'
            #load from url to pandas df
            try:
                day_df = pd.read_csv(url)
            except Exception as err:
                #best effort: days is always 1-31, so some dates do not exist, and
                #a file may be missing - report and move on to the next day.
                #Exception instead of a bare except lets Ctrl-C stop the run.
                print(f'Invalid URL for {year}_{month}_{day} - Date may be invalid or file may not exist.')
                print(f'  reason: {type(err).__name__}: {err}')
                continue
            print(f'Download complete for {year}_{month}_{day}.')
            #convert to polars ;)
            day_df = pl.DataFrame(day_df)
            #process data
            day_df = (
                day_df
                #keep only cargo vessels
                .filter(pl.col('VesselType').is_in(cargo_types))
                #keep cols of interest
                .select('MMSI', 'BaseDateTime','LAT', 'LON', 'SOG', 'COG', 
                        'Heading', 'VesselName', 'IMO')
                #give pythonic names
                .rename({
                    'MMSI':'mmsi',
                    'BaseDateTime':'time',
                    'LAT':'lat',
                    'LON':'lon',
                    'SOG':'speed',
                    'COG':'course',
                    'Heading':'heading',
                    'VesselName':'vessel_name',
                    'IMO':'imo'
                })
                #clean cols
                .with_columns(
                    #strip IMO prefix and cast to int
                    imo = pl.col('imo').str.strip_prefix('IMO').cast(pl.Int64),
                    #clean course and heading: 360.0 and 511.0 become null
                    course = pl.col('course').replace(360.0,None),
                    heading = pl.col('heading').replace(511.0,None)
                )
                #cast
                .cast({
                    'time':pl.Datetime,
                    'vessel_name':pl.Categorical
                })
            )
            day_frames.append(day_df)
        #no day could be downloaded for this month, so there is nothing to save
        if not day_frames:
            print(f'No data downloaded for {year}_{month} - no file saved.')
            continue
        #concat and deduplicate once per month instead of once per day
        month_df = pl.concat(day_frames, how='diagonal').unique()
        #save monthly data
        month_df.write_parquet(f'data/clean parquet/{year}_{month}.parquet')
        print(f'{year}_{month} file saved to parquet.')