In [1]:
# One-time setup: copy the AIS CSV into the /shared-data folder.
# Only needs to be run the first time; `mkdir -p` makes the target
# directory creation safe to repeat.
!mkdir -p /shared-data/datasets/AISdata
!cp datasets/aisdk_20181102.csv /shared-data/datasets/AISdata

In [2]:
import sys
import warnings

# Put the sample archive and the XGBoost4J jars at the front of the module
# search path. Each entry is inserted at position 0, so the last archive
# listed here ends up first on sys.path.
for archive in (
    '/usr/local/spark/jars/samples.zip',
    '/usr/local/spark/jars/xgboost4j-spark_3.0-1.0.0-0.2.0.jar',
    '/usr/local/spark/jars/xgboost4j-1.0.0-0.2.0.jar',
):
    sys.path.insert(0, archive)

# Silence all Python warnings for the remainder of the notebook.
warnings.filterwarnings('ignore')

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, IntegerType, StructField, StructType
from time import time

# Create (or reuse) the Spark session used by the rest of the notebook.
#
# The driver memory must be set through the 'spark.driver.memory' property;
# 'driver-memory' is only the name of the spark-submit command-line flag and
# is not a recognized configuration key, so the 10G setting was not applied.
# NOTE(review): the setting only takes effect when this call is the one that
# launches the driver JVM; if a session already exists, getOrCreate() reuses
# it with its original memory settings.
spark = (
    SparkSession.builder
    .config('spark.executor.cores', '1')
    .config('spark.driver.memory', '10G')
    .getOrCreate()
)

In [4]:
# Display the session object as the cell output to confirm it was created.
spark

In [6]:
# Load the raw AIS CSV, treating the first line as column names.
# No schema is supplied and inference is not enabled, so every column is
# read as a string.
df = spark.read.csv(
    '/shared-data/datasets/AISdata/aisdk_20181102.csv',
    header=True,
)

In [7]:
# Print the first 20 rows of the raw data for a quick visual check.
df.show(n=20)

+------------+--------------+---------+---------+---------+--------------------+----+----+-----+-------+-------+--------+------------------+---------+----------+-----+------+------------------------------+-------+-----------+----+----------------+----+----+----+----+
| # Timestamp|Type of mobile|     MMSI| Latitude|Longitude| Navigational status| ROT| SOG|  COG|Heading|    IMO|Callsign|              Name|Ship type|Cargo type|Width|Length|Type of position fixing device|Draught|Destination| ETA|Data source type|   A|   B|   C|   D|
+------------+--------------+---------+---------+---------+--------------------+----+----+-----+-------+-------+--------+------------------+---------+----------+-----+------+------------------------------+-------+-----------+----+----------------+----+----+----+----+
|2/11/18 0:00|       Class A|219001182| 55.82256|10.063892|Under way using e...|   0|   0|  272|    272|Unknown|    null|              null|Undefined|      null| null|  null|                     U

In [8]:
# MMSI identifiers of the five vessels to keep (strings, matching the
# string-typed MMSI column).
MMSI_list = [
    '212963000',
    '219000751',
    '219015591',
    '305837000',
    '311646000',
]

# Keep only rows for those vessels and give the timestamp column a name
# without the leading '# ' so it can be referenced easily later.
df = (
    df
    .where(df.MMSI.isin(MMSI_list))
    .withColumnRenamed('# Timestamp', 'Timestamp')
)

In [9]:
# Positions (latitude/longitude) of the first vessel in MMSI_list only.
df_plot = (
    df
    .where(df.MMSI == MMSI_list[0])
    .select('Latitude', 'Longitude')
)
df_plot.show(n=10)

+---------+---------+
| Latitude|Longitude|
+---------+---------+
| 58.27342|  10.9595|
|58.273038|10.959695|
|58.272593|10.959918|
|58.272277|10.960082|
|58.271895|10.960285|
| 58.27145|10.960507|
|58.271133| 10.96067|
| 58.27069| 10.96089|
|58.270312|10.961085|
|58.269995|10.961245|
+---------+---------+
only showing top 10 rows



In [10]:
# Project the filtered data down to the four columns kept in the result.
df_simplified = df.select('Timestamp', 'MMSI', 'Latitude', 'Longitude')

In [11]:
# Print the first 20 rows of the reduced frame.
df_simplified.show(n=20)

+------------+---------+---------+---------+
|   Timestamp|     MMSI| Latitude|Longitude|
+------------+---------+---------+---------+
|2/11/18 0:00|212963000| 58.27342|  10.9595|
|2/11/18 0:00|311646000| 58.04611|10.192865|
|2/11/18 0:00|219000751|55.248292|14.836152|
|2/11/18 0:00|305837000|54.597698|12.253178|
|2/11/18 0:00|305837000|54.597563|12.253113|
|2/11/18 0:00|305837000| 54.59743|12.253045|
|2/11/18 0:00|212963000|58.273038|10.959695|
|2/11/18 0:00|311646000|58.045953|10.192062|
|2/11/18 0:00|305837000| 54.59723|12.252943|
|2/11/18 0:00|219015591|57.213833|   9.6867|
|2/11/18 0:00|305837000|54.597163| 12.25291|
|2/11/18 0:00|219000751|55.248307| 14.83615|
|2/11/18 0:00|305837000|54.597028|12.252842|
|2/11/18 0:00|212963000|58.272593|10.959918|
|2/11/18 0:00|305837000|54.596895|12.252775|
|2/11/18 0:00|219000751|55.248307| 14.83615|
|2/11/18 0:00|305837000|54.596762| 12.25271|
|2/11/18 0:00|305837000|54.596627|12.252645|
|2/11/18 0:00|212963000|58.272277|10.960082|
|2/11/18 0

In [12]:
# Save the reduced frame as CSV with a header row, replacing any previous
# output at this location. Spark writes the target path as a directory of
# part files rather than a single file, despite the '.csv' suffix.
df_simplified.write.csv(
    '/shared-data/datasets/AISdata/spark_results.csv',
    mode='overwrite',
    header=True,
)

In [13]:
# Stop the Spark session; DataFrames created from it cannot be used afterwards.
spark.stop()