In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.window import Window

# Spark bootstrap for the notebook.
# Only the application name is configured explicitly.
conf = SparkConf().setAppName("msds697_project")

# getOrCreate() returns the already-running SparkContext when there is one,
# so re-executing this cell in a live kernel no longer fails the way a second
# direct SparkContext(...) construction does. On a fresh kernel it builds the
# context from `conf` exactly as before.
sc = SparkContext.getOrCreate(conf=conf)

# Turn off Spark's log output for this context.
sc.setLogLevel("OFF")

# Create (or reuse) the SparkSession.
ss = SparkSession.builder.getOrCreate()

In [2]:
# Schema applied to the parsed input records: four columns, all nullable.
# The column order here must match the tuple order produced when parsing lines.
schema = StructType(
    fields=[
        StructField(name="lat", dataType=DoubleType(), nullable=True),
        StructField(name="lon", dataType=DoubleType(), nullable=True),
        StructField(name="p", dataType=IntegerType(), nullable=True),
        StructField(name="timestamp", dataType=IntegerType(), nullable=True),
    ]
)

In [3]:
%%time 
# Input text file; each record is expected to hold four whitespace-separated
# fields in the order lat, lon, p, timestamp (matching `schema`).
filename = "new_abgibo.txt"

# Pipeline:
#   1. split every line on whitespace,
#   2. drop lines that yield no tokens (blank / whitespace-only lines), which
#      would otherwise raise IndexError on the first field access,
#   3. coerce the four fields to (float, float, int, int),
#   4. build a DataFrame with `schema` and sort it by ascending timestamp.
# Lines that are non-empty but have fewer than four fields, or fields that do
# not parse as numbers, still raise when the job runs -- they are not skipped.
sf_df = (
    sc.textFile(filename)
    .map(lambda line: line.split())
    .filter(lambda fields: len(fields) > 0)
    .map(lambda fields: (float(fields[0]), float(fields[1]), int(fields[2]), int(fields[3])))
    .toDF(schema)
    .orderBy("timestamp", ascending=True)
)

CPU times: user 236 ms, sys: 12 ms, total: 248 ms
Wall time: 2.9 s


In [4]:
%%time 
# Derive one row per pickup, annotated with the coordinates of its dropoff.
#
# A single ascending-timestamp window is used for both the look-behind (lag)
# and the look-ahead (lead). Previously the look-ahead was a lag over a second,
# descending window; two independent sorts can disagree on the order of rows
# sharing a timestamp and cost an extra sort.
# There is no partitionBy: all rows are ordered together in one window.
w = Window.orderBy("timestamp")

# Value of `p` on the chronologically previous / next row (null at the ends).
sf_df = sf_df.withColumn("prev_p", lag("p", 1).over(w))
sf_df = sf_df.withColumn("next_p", lead("p", 1).over(w))

# A pickup is a row with p == 1 whose predecessor is missing or has p == 0;
# a dropoff is a row with p == 1 whose successor is missing or has p == 0.
p_is_one = sf_df.p == 1
pickup_cond = p_is_one & (sf_df.prev_p.isNull() | (sf_df.prev_p == 0))
dropoff_cond = p_is_one & (sf_df.next_p.isNull() | (sf_df.next_p == 0))

sf_df = sf_df.withColumn("is_pickup", when(pickup_cond, 1).otherwise(0))
sf_df = sf_df.withColumn("is_dropoff", when(dropoff_cond, 1).otherwise(0))

# Exclusive or: keep rows that are a pickup or a dropoff but not both.
# This discards interior rows (neither flag) and isolated single-row runs
# (both flags), leaving rows that alternate pickup, dropoff, pickup, ...
sf_df = sf_df.filter(sf_df.is_pickup != sf_df.is_dropoff)

# Sanity check: every remaining pickup must have a matching dropoff.
n_pickups = sf_df.filter(sf_df.is_pickup == 1).count()
n_dropoffs = sf_df.filter(sf_df.is_dropoff == 1).count()
assert n_pickups == n_dropoffs, (
    "pickup/dropoff count mismatch: %d pickups vs %d dropoffs" % (n_pickups, n_dropoffs)
)

# After the filter, the row following each pickup in time is its dropoff, so
# the next row's coordinates are the dropoff location.
sf_df = sf_df.withColumn("dropoff_lat", lead("lat", 1).over(w))
sf_df = sf_df.withColumn("dropoff_lon", lead("lon", 1).over(w))

# Keep only the pickup rows, each now carrying its dropoff coordinates.
sf_df = sf_df.filter(sf_df.is_pickup == 1)


CPU times: user 76 ms, sys: 24 ms, total: 100 ms
Wall time: 9.78 s


In [5]:
%%time
# Write the result as CSV to a path derived from the input name:
# ".txt" is removed from `filename` and "_pyspark.csv" is appended.
# mode="overwrite" replaces output left by an earlier run; with the default
# save mode the write fails when the target path already exists, which
# prevents the notebook from being re-run end to end.
sf_df.write.csv(filename.replace(".txt", "") + "_pyspark.csv", mode="overwrite")

CPU times: user 24 ms, sys: 4 ms, total: 28 ms
Wall time: 2.21 s
