In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier


# Reuse the already-running SparkContext when this cell is executed again:
# constructing a second SparkContext in the same kernel raises
# "Cannot run multiple SparkContexts at once".
conf = SparkConf().setAppName("msds697_project")
sc = SparkContext.getOrCreate(conf=conf)
sc.setLogLevel("OFF")  # turn off Spark log output for this context
ss = SparkSession.builder.getOrCreate()

In [10]:
def toFloatSafe(v):
    """Convert `v` to a float when possible, otherwise return it unchanged.

    Args:
        v: Value to convert, e.g. one raw field of a split CSV line.

    Returns:
        ``float(v)`` if the conversion succeeds; otherwise `v` itself.
    """
    try:
        return float(v)
    except (TypeError, ValueError):
        # ValueError: text that is not a number (e.g. a date string).
        # TypeError: values float() cannot accept at all, such as None.
        return v

In [51]:
# Column names for the trip file, in file order.
nyc_header = "vendor_name,Trip_Pickup_DateTime,Trip_Dropoff_DateTime,Passenger_Count,Trip_Distance,Start_Lon,Start_Lat,Rate_Code,store_and_forward,End_Lon,End_Lat,Payment_Type,Fare_Amt,surcharge,mta_tax,Tip_Amt,Tolls_Amt,Total_Amt".split(",")

# Read the raw text file, split each line on commas, keep only rows that have
# exactly one field per header column and are not the header row itself, then
# convert every numeric-looking field to float before building the DataFrame.
# NOTE(review): the path below is machine-specific; adjust it to the local copy of the file.
nyc_df = (
    sc.textFile("/home/julia/Downloads/yellow_tripdata_2009-01.csv")
    .map(lambda line: line.split(","))
    # Logical `and` (not bitwise `&`) is the correct operator for plain Python booleans.
    .filter(lambda fields: len(fields) == len(nyc_header) and fields != nyc_header)
    .map(lambda fields: [toFloatSafe(v) for v in fields])
    .toDF(nyc_header)
)

# Only the trip timestamps, distance and start/end coordinates are used downstream.
keep_cols = ["Trip_Pickup_DateTime", "Trip_Dropoff_DateTime", "Trip_Distance", "Start_Lon", "Start_Lat", "End_Lon", "End_Lat"]
nyc_df = nyc_df.select(keep_cols)

Convert the pickup and drop-off datetime strings to timestamps

In [52]:
def toDatetime(df, col_name, fmt='yyyy-MM-dd HH:mm:ss'):
    """Replace a string column of `df` with its parsed timestamp.

    The parsed values are written to a temporary ``<col_name>_2`` column, the
    original column is dropped, and the temporary column is renamed back to
    `col_name`. As a result the converted column ends up as the last column
    of the returned DataFrame.

    Args:
        df: DataFrame containing `col_name`.
        col_name: Name of the string column to convert.
        fmt: Datetime pattern passed to ``to_timestamp``.

    Returns:
        A new DataFrame in which `col_name` holds timestamps.
    """
    parsed_name = col_name + "_2"
    # Read the column from the `df` argument, not from the notebook-global
    # `nyc_df`, so the function works for whichever DataFrame is passed in.
    df = df.withColumn(parsed_name, to_timestamp(df[col_name], fmt))
    df = df.drop(col_name).withColumnRenamed(parsed_name, col_name)
    return df

In [53]:
# Parse both trip timestamp columns, pickup first and then drop-off.
for ts_col in ("Trip_Pickup_DateTime", "Trip_Dropoff_DateTime"):
    nyc_df = toDatetime(nyc_df, ts_col)

Extract the day of week and hour of pickup and drop-off, then add an "is_Weekend" flag set to 1 when the trip starts OR ends on a weekend day (Saturday or Sunday)

In [65]:
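# Date format pattern letters (Java SimpleDateFormat):
#   'u' = day number of week (1 = Monday, ..., 7 = Sunday)
#   'E' = day name in week, 'H' = hour in day (0-23)
# https://docs.oracle.com/javase/7/docs/api/java/text/SimpleDateFormat.html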

In [66]:
def _iso_day_of_week(ts_col):
    """Return a Column holding the weekday number of `ts_col` (1 = Monday ... 7 = Sunday).

    dayofweek() numbers days 1 = Sunday ... 7 = Saturday, so the result is
    shifted to the Monday-first numbering that the 'u' date pattern produced.
    Unlike date_format(..., 'u'), this does not depend on the legacy
    SimpleDateFormat pattern letters, which Spark 3.x rejects as week-based
    patterns. Requires Spark 2.3+ for dayofweek().
    """
    return ((dayofweek(ts_col) + 5) % 7) + 1


# Derive weekday number, weekday name and hour of day for both ends of the trip.
nyc_df = (
    nyc_df
    .withColumn('dow_Dropoff', _iso_day_of_week('Trip_Dropoff_DateTime'))
    .withColumn('strdow_Dropoff', date_format('Trip_Dropoff_DateTime', 'E'))
    .withColumn('dow_Pickup', _iso_day_of_week('Trip_Pickup_DateTime'))
    .withColumn('strdow_Pickup', date_format('Trip_Pickup_DateTime', 'E'))
    .withColumn('hour_Pickup', hour('Trip_Pickup_DateTime'))
    .withColumn('hour_Dropoff', hour('Trip_Dropoff_DateTime'))
)

# Weekday numbers 6 and 7 are Saturday and Sunday; flag the trip if either end falls on one.
nyc_df = nyc_df.withColumn("is_Weekend", when((col('dow_Pickup') >= 6) | (col('dow_Dropoff') >= 6), 1).otherwise(0))

nyc_df.show(10)

+-------------+----------+---------+----------+---------+--------------------+---------------------+-----------+--------------+----------+-------------+----------+-----------+------------+
|Trip_Distance| Start_Lon|Start_Lat|   End_Lon|  End_Lat|Trip_Pickup_DateTime|Trip_Dropoff_DateTime|dow_Dropoff|strdow_Dropoff|dow_Pickup|strdow_Pickup|is_Weekend|hour_Pickup|hour_Dropoff|
+-------------+----------+---------+----------+---------+--------------------+---------------------+-----------+--------------+----------+-------------+----------+-----------+------------+
|         2.63|-73.991957|40.721567|-73.993803|40.695922| 2009-01-04 02:52:00|  2009-01-04 03:02:00|          7|           Sun|         7|          Sun|         1|          2|           3|
|         4.55|-73.982102| 40.73629| -73.95585| 40.76803| 2009-01-04 03:31:00|  2009-01-04 03:38:00|          7|           Sun|         7|          Sun|         1|          3|           3|
|        10.35|-74.002587|40.739748|-73.869983|40.77022

In [67]:
# Print column names and types to check the parsed timestamps and the derived integer fields.
nyc_df.printSchema()

root
 |-- Trip_Distance: double (nullable = true)
 |-- Start_Lon: double (nullable = true)
 |-- Start_Lat: double (nullable = true)
 |-- End_Lon: double (nullable = true)
 |-- End_Lat: double (nullable = true)
 |-- Trip_Pickup_DateTime: timestamp (nullable = true)
 |-- Trip_Dropoff_DateTime: timestamp (nullable = true)
 |-- dow_Dropoff: integer (nullable = true)
 |-- strdow_Dropoff: string (nullable = true)
 |-- dow_Pickup: integer (nullable = true)
 |-- strdow_Pickup: string (nullable = true)
 |-- is_Weekend: integer (nullable = false)
 |-- hour_Pickup: integer (nullable = true)
 |-- hour_Dropoff: integer (nullable = true)

