# Processing data files

Starting with a CSV file from NOAA:

1. Filter the records to a latitude/longitude bounding box around San Francisco Bay and save the resulting (much smaller) file.
2. Build one line per vessel per day from the position reports and collect the lines in a GeoDataFrame.
3. Save the results.

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, LineString, shape

#### 1. Filtering

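Don't run this unless you absolutely need to: it reads the full `AIS_2015_01_Zone10.csv` line by line, which takes a while.  The code in the cell below is wrapped in a string literal so that it does not execute.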

In [2]:
"""
with open("AIS_2015_01_Zone10.csv") as f:
    with open("AIS_2015_01_SF_Bay.csv", "w") as t:
        t.write(f.readline())
        while True:
            line = f.readline()
            split_line = line.split(',')
            if (float(split_line[2]) >= 37.4) & (float(split_line[2]) <= 38.15) & (float(split_line[3]) >= -122.56) & (float(split_line[3]) <= -121.9):
                t.write(line)
"""

'\nwith open("AIS_2015_01_Zone10.csv") as f:\n    with open("AIS_2015_01_SF_Bay.csv", "w") as t:\n        t.write(f.readline())\n        while True:\n            line = f.readline()\n            split_line = line.split(\',\')\n            if (float(split_line[2]) >= 37.4) & (float(split_line[2]) <= 38.15) & (float(split_line[3]) >= -122.56) & (float(split_line[3]) <= -121.9):\n                t.write(line)\n'

In [3]:
# Load the pre-filtered AIS_2015_01_SF_Bay.csv extract into a DataFrame.
bay_traffic = pd.read_csv(
    filepath_or_buffer='~/MIDS/W209/ais/data/AIS_2015_01_SF_Bay.csv',
)

In [4]:
# Parse the timestamp column into datetimes
bay_traffic['BaseDateTime'] = pd.to_datetime(bay_traffic['BaseDateTime'])

# Sort so that each vessel's reports are contiguous and in chronological order;
# the line-building step relies on this row order.
bay_traffic = bay_traffic.sort_values(by=['MMSI', 'BaseDateTime'], ignore_index=True)

# Remove thousands of useless near-zero speed points.  Take an explicit copy so
# the column assignments below write to an independent frame rather than to a
# slice of the sorted one (avoids pandas' SettingWithCopyWarning).
bay_traffic = bay_traffic[bay_traffic['SOG'] >= 0.5].copy()

# Capture the calendar date and the weekday (Monday == 0).
# TODO:  Time is in UTC.  Fix.
bay_traffic['Date'] = bay_traffic['BaseDateTime'].dt.date
bay_traffic['Weekday'] = bay_traffic['BaseDateTime'].dt.weekday

# Index on the individual vessel and the date, formatted "<MMSI>-<YYYY-MM-DD>".
# TODO:  Again with the UTC.  Fix.
bay_traffic['VoyageIndex'] = (
    bay_traffic['MMSI'].astype(str) + '-' + bay_traffic['Date'].astype(str)
)

# Encode each report as a shapely Point in (x, y) = (LON, LAT) order
bay_traffic['geometry'] = [
    Point(lon, lat) for lon, lat in zip(bay_traffic['LON'], bay_traffic['LAT'])
]

bay_traffic.head()

Unnamed: 0,MMSI,BaseDateTime,LAT,LON,SOG,COG,Heading,VesselName,IMO,CallSign,VesselType,Status,Length,Width,Draft,Cargo,Date,Weekday,VoyageIndex,geometry
0,981261,2015-01-27 01:07:03,37.81452,-122.55984,34.4,126.7,511.0,,,,,under way using engine,,,,,2015-01-27,1,981261-2015-01-27,POINT (-122.55984 37.81452)
1,981261,2015-01-27 01:08:04,37.80865,-122.55,33.0,129.4,511.0,,,,,under way using engine,,,,,2015-01-27,1,981261-2015-01-27,POINT (-122.55 37.80865)
2,981261,2015-01-27 01:09:07,37.80309,-122.5406,32.0,129.0,511.0,,,,,under way using engine,,,,,2015-01-27,1,981261-2015-01-27,POINT (-122.5406 37.80309)
3,981261,2015-01-27 01:10:09,37.79632,-122.53245,31.6,136.0,511.0,,,,,under way using engine,,,,,2015-01-27,1,981261-2015-01-27,POINT (-122.53245 37.79632)
4,981261,2015-01-27 01:11:11,37.79527,-122.52351,31.0,57.2,511.0,,,,,under way using engine,,,,,2015-01-27,1,981261-2015-01-27,POINT (-122.52351 37.79527)


#### 2. Generate lines

In [6]:
# Make sure that we have enough points in each date-MMSI index: keep only the
# rows whose VoyageIndex occurs more than `min_points` times.  Mapping each row
# to its group size is a single vectorised lookup, instead of testing every row
# for membership in a Python list.
min_points = 10
points_per_voyage = bay_traffic['VoyageIndex'].value_counts()
bay_traffic = bay_traffic[bay_traffic['VoyageIndex'].map(points_per_voyage) > min_points]

# Build the lines in a GeoDataFrame: one LineString per VoyageIndex, with the
# points taken in the frame's current row order within each group.
bay_traffic_geo = bay_traffic.groupby(['VoyageIndex'])['geometry'].apply(
    lambda voyage_points: LineString(voyage_points.tolist())
)
bay_traffic_geo = gpd.GeoDataFrame(bay_traffic_geo)
bay_traffic_geo.head()

Unnamed: 0_level_0,geometry
VoyageIndex,Unnamed: 1_level_1
209047000-2015-01-22,"LINESTRING (-122.55998 37.78507, -122.55550 37..."
209946000-2015-01-02,"LINESTRING (-122.41077 37.92217, -122.41115 37..."
209946000-2015-01-03,"LINESTRING (-122.36487 37.91588, -122.36448 37..."
210161000-2015-01-03,"LINESTRING (-122.31862 37.79689, -122.31870 37..."
210242000-2015-01-07,"LINESTRING (-122.55986 37.78521, -122.55390 37..."


In [7]:
# Attach per-voyage summary columns to the line GeoDataFrame.  Each aggregate is
# indexed by VoyageIndex, so assignment aligns on the GeoDataFrame's index.
by_index = bay_traffic.groupby('VoyageIndex')
voyage_metadata = {
    'max_SOG': by_index['SOG'].max(),     # highest SOG value seen in the group
    'LOA': by_index['Length'].median(),   # median of the group's Length values
}
for column_name, per_voyage_values in voyage_metadata.items():
    bay_traffic_geo[column_name] = per_voyage_values

bay_traffic_geo.head()

Unnamed: 0_level_0,geometry,max_SOG,LOA
VoyageIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
209047000-2015-01-22,"LINESTRING (-122.55998 37.78507, -122.55550 37...",13.4,160.87
209946000-2015-01-02,"LINESTRING (-122.41077 37.92217, -122.41115 37...",11.6,182.83
209946000-2015-01-03,"LINESTRING (-122.36487 37.91588, -122.36448 37...",14.7,182.83
210161000-2015-01-03,"LINESTRING (-122.31862 37.79689, -122.31870 37...",16.4,246.8
210242000-2015-01-07,"LINESTRING (-122.55986 37.78521, -122.55390 37...",17.5,246.87


#### 3. Save results

In [None]:
# The export is currently disabled (commented out).  Uncomment the line below to
# write bay_traffic_geo, including its VoyageIndex index, to the CSV path shown.
# bay_traffic_geo.to_csv('../project_flask/static/data/bay_traffic.csv')