In [76]:
import polars as pl
import pandas as pd

#enable string cache for polars categoricals
pl.enable_string_cache()
#display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pl.Config(tbl_rows=200)

<polars.config.Config at 0x31fb5ee70>

In [77]:
#AIS keys

#status codes - from Spire maritime docs
status_dict = ({
    0:'Under way using its engine',
    1:'Anchored',
    2:'Not under command',
    3:'Has restricted maneuverability',
    4:'Ship draught is limiting its movement',
    5:'Moored (tied to another object to limit free movement)',
    6:'Aground',
    7:'Engaged in fishing',
    8:'Under way sailing',
    9:'(Number reserved for modifying reported status of ships carrying dangerous goods/harmful substances/marine pollutants)',
    10:'(Number reserved for modifying reported status of ships carrying dangerous goods/harmful substances/marine pollutants)',
    11:'Power-driven vessel towing astern',
    12:'Power-driven vessel pushing ahead/towing alongside',
    13:'(Reserved for future use)',
    14:'Any of the following are active: AIS-SART (Search and Rescue Transmitter), AIS-MOB (Man Overboard), AIS-EPIRB (Emergency Position Indicating Radio Beacon)',
    15:'Undefined (default)'
})

In [78]:
lf = pl.scan_parquet('data/ais_clean/*.parquet')
lf.describe()

statistic,mmsi,time,lat,lon,speed,course,heading,status,vessel_name,vessel_type,imo,length,width,draft,cargo
str,f64,str,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64
"""count""",126162658.0,"""126162658""",126162658.0,126162658.0,126162658.0,125858380.0,102435467.0,126159218.0,"""126162658""",126162658.0,119975288.0,126162658.0,126162658.0,126162658.0,126162658.0
"""null_count""",0.0,"""0""",0.0,0.0,0.0,304278.0,23727191.0,3440.0,"""0""",0.0,6187370.0,0.0,0.0,0.0,0.0
"""mean""",388360000.0,"""2015-04-02 08:39:02.192253""",32.946635,-93.192081,5.055873,183.664976,181.837463,1.471392,,74.324693,19909000.0,164.390266,25.078355,5.817973,73.966959
"""std""",139230000.0,,9.322893,24.746577,6.568342,103.833621,104.555711,2.694564,,5.739697,78016000.0,96.866701,12.827492,5.441894,7.176486
"""min""",102810.0,"""2015-01-01 00:00:00""",9.95039,-179.99931,0.0,0.0,0.0,0.0,,70.0,1.0,0.0,0.0,-12.8,0.0
"""25%""",316003770.0,"""2015-02-06 10:22:34""",28.60272,-96.55374,0.0,101.0,91.0,0.0,,70.0,9063665.0,66.0,14.0,3.4,70.0
"""50%""",367018710.0,"""2015-03-20 22:56:52""",29.89593,-90.13038,0.2,185.1,182.0,0.0,,70.0,9266231.0,178.0,27.0,7.0,70.0
"""75%""",477261200.0,"""2015-05-17 10:15:14""",38.90618,-79.02804,11.0,271.8,271.0,1.0,,79.0,9363950.0,228.0,32.0,9.3,79.0
"""max""",942316566.0,"""2015-09-30 23:59:59""",89.4922,148.35282,102.3,399.0,510.0,15.0,,89.0,999000111.0,655.0,60.0,12.7,89.0


In [79]:
status = (
    lf.group_by('status')
    .agg(
        count = pl.col('status').count(),
        percent = pl.col('status').count()/lf.select(pl.len()).collect().item()
    )
    .collect()
)
status.sort('status')

status,count,percent
f64,u32,f64
,0,0.0
0.0,79621301,0.6311
1.0,16428126,0.130214
2.0,1167625,0.009255
3.0,1040813,0.00825
4.0,36066,0.000286
5.0,24395710,0.193367
6.0,17522,0.000139
7.0,869680,0.006893
8.0,471820,0.00374


In [81]:
monthly_df = (
    lf
    .with_columns(
        month = pl.col('time').dt.month_start().dt.date()
    )
    .group_by('month')
    .agg(
        imo_count = pl.col('imo').n_unique(),
        vessel_count = pl.col('mmsi').n_unique(),
        pings_count = pl.col('mmsi').count()
    )
    .with_columns(
        pings_per_vessel = pl.col('pings_count')/pl.col('vessel_count')
    )
    .sort('month')
    .collect()
)

monthly_df.head(10)

month,imo_count,vessel_count,pings_count,pings_per_vessel
date,u32,u32,u32,f64
2015-01-01,500,511,27377631,53576.577299
2015-02-01,451,460,21271144,46241.617391
2015-03-01,460,473,22283009,47109.955603
2015-04-01,442,454,16347708,36008.167401
2015-05-01,455,467,13633050,29192.826552
2015-06-01,399,411,10285592,25025.77129
2015-07-01,401,416,7796229,18740.935096
2015-08-01,380,394,5098984,12941.583756
2015-09-01,369,381,2069311,5431.262467


In [82]:
monthly_df.describe()

statistic,month,imo_count,vessel_count,pings_count,pings_per_vessel
str,str,f64,f64,f64,f64
"""count""","""9""",9.0,9.0,9.0,9.0
"""null_count""","""0""",0.0,0.0,0.0,0.0
"""mean""","""2015-05-01 18:40:00""",428.555556,440.777778,14018000.0,30474.299651
"""std""",,43.321793,42.51993,8514200.0,16557.636452
"""min""","""2015-01-01""",369.0,381.0,2069311.0,5431.262467
"""25%""","""2015-03-01""",399.0,411.0,7796229.0,18740.935096
"""50%""","""2015-05-01""",442.0,454.0,13633050.0,29192.826552
"""75%""","""2015-07-01""",455.0,467.0,21271144.0,46241.617391
"""max""","""2015-09-01""",500.0,511.0,27377631.0,53576.577299


Identify changes in status for each vessel and drop other observations

In [92]:
status_df = (
    lf
    #drop smaller vessels
    .filter(pl.col('length')>100)
    #select cols of interest
    .select('mmsi', 'imo', 'time', 'status')
    #sort by vessel and time
    .sort(['mmsi', 'time'])
    #indicate whether status is the same as previous row (Fill value needed to avoid status 0 evaluating as equal to false)
    .with_columns(
        status_change = (
            pl.col('status').ne(pl.col('status').shift(fill_value=20))
            .over('mmsi')
        )
    )
    #keep only new status pings
    .filter(pl.col('status_change')==True)
    #drop change col
    .drop('status_change')
    .collect()
)

In [93]:
status_df.head(100)

mmsi,imo,time,status
i64,i64,datetime[μs],f64
102810,4711,2015-07-26 02:13:42,0.0
30474600,9291975,2015-02-15 19:28:50,0.0
30474600,9291975,2015-02-16 13:10:16,1.0
30474600,9291975,2015-02-16 21:44:44,0.0
30474600,9291975,2015-02-17 05:18:24,5.0
30474600,9291975,2015-02-19 06:55:37,0.0
30474600,9291975,2015-02-20 02:21:10,5.0
30474600,9291975,2015-02-22 05:55:04,0.0
30474600,9291975,2015-02-24 02:43:45,1.0
30474600,9291975,2015-02-25 00:33:36,0.0


In [51]:
df = pl.DataFrame(
    {
        "a": ['a', 'a', 'b', 'b'],
        "b": [5, 6, 7, 8],
    }
)
df

a,b
str,i64
"""a""",5
"""a""",6
"""b""",7
"""b""",8


In [54]:
df2 = (
    df
    .with_columns(
        change = pl.col('a').ne(pl.col('a').shift())
    )
)
df2

a,b,change
str,i64,bool
"""a""",5,
"""a""",6,False
"""b""",7,True
"""b""",8,False
