# Spark Playground

## Preparation

In [4]:
import math
import datetime

import pyspark.sql.functions as sqlfunctions

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StringType, BooleanType
from pyspark.sql.functions import udf
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

In [5]:
# Relative path of the CSV file that the "Read CSV" step loads.
CSV_FILE_PATH = '../data/yellow_sample_001.csv'

### Configure Spark

In [6]:
# Run Spark locally ("local[*]" master) under the application name 'pyspark'.
conf = SparkConf().setMaster("local[*]").setAppName('pyspark')
# getOrCreate() hands back the context that is already running when this cell
# is executed again; calling SparkContext(conf=conf) a second time in the same
# kernel raises because only one active context is allowed.
# NOTE: when a context already exists, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)
sql_context = SQLContext(sc)

### Read CSV

In [7]:
# Load the CSV through the spark-csv data source, passing header='true' and
# inferschema='true' as reader options.
df = (
    sql_context.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load(CSV_FILE_PATH)
)

### Clean Data

In [8]:
# Keep only trips whose pickup point lies inside the bounding box
# -80 <= Start_Lon <= -70 and 40 <= Start_Lat <= 50 (bounds inclusive).
lon_in_box = (df.Start_Lon >= -80) & (df.Start_Lon <= -70)
lat_in_box = (df.Start_Lat >= 40) & (df.Start_Lat <= 50)
df = df.filter(lon_in_box & lat_in_box)

def is_float(value):
    """Return True when ``value`` can be converted with ``float()``.

    Values that ``float()`` rejects with a ``ValueError`` (e.g. ``'abc'``)
    or with a ``TypeError`` (e.g. ``None``) yield False instead of raising,
    so a null cell does not make the predicate blow up when it is applied
    row by row.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        return False
    return True
# Expose the Python predicate as a boolean UDF, then keep only the rows whose
# pickup longitude and latitude both pass it.
is_float_udf = udf(is_float, BooleanType())
lon_is_float = is_float_udf(df.Start_Lon)
lat_is_float = is_float_udf(df.Start_Lat)
df = df.filter(lon_is_float & lat_is_float)

### Discretize Coordinates

In [25]:
# Snap the pickup coordinates onto a grid by rounding them to two decimals,
# and mark the result for caching because later cells reuse it.
discretized_df = (
    df
    .withColumn("Discretized_Lon", sqlfunctions.round(df.Start_Lon, 2))
    .withColumn("Discretized_Lat", sqlfunctions.round(df.Start_Lat, 2))
    .cache()
)
# The row count is the cell's displayed result.
discretized_df.count()

1156329

In [26]:
# Same grid as for the pickups, but built from the drop-off coordinates
# (End_Lon / End_Lat) rounded to two decimals.
discretized_dropoffs_df = (
    df
    .withColumn("Discretized_Lon", sqlfunctions.round(df.End_Lon, 2))
    .withColumn("Discretized_Lat", sqlfunctions.round(df.End_Lat, 2))
    .cache()
)
# The row count is the cell's displayed result.
discretized_dropoffs_df.count()

1156329

## Accumulation of district pickups and dropoffs per hour

In [27]:
def extract_pickups(row):
    """Reduce a trip row to its pickup grid cell and pickup time parts.

    Reads ``Discretized_Lat``, ``Discretized_Lon`` and
    ``Trip_Pickup_DateTime`` from ``row`` and returns a ``Row`` with the
    fields ``dis_lat``, ``dis_lon``, ``hour``, ``day``, ``month``, ``year``.
    """
    picked_up_at = row.Trip_Pickup_DateTime
    return Row(
        dis_lat=row.Discretized_Lat,
        dis_lon=row.Discretized_Lon,
        hour=picked_up_at.hour,
        day=picked_up_at.day,
        month=picked_up_at.month,
        year=picked_up_at.year,
    )

# Map every trip to its (grid cell, hour, day, month, year) key and count the
# pickups per key.
# The mapping goes through `.rdd` explicitly: `DataFrame.map` only exists in
# Spark 1.x, whereas `DataFrame.rdd.map` works on 1.x and later releases.
pickup_df = discretized_df.rdd.map(extract_pickups).toDF()
grouped_pickup_df = (
    pickup_df
    .groupby('hour', 'day', 'month', 'year', 'dis_lat', 'dis_lon')
    .count()
    .withColumnRenamed("count", "pickup_count")
)
grouped_pickup_df.show()
grouped_pickup_df.count()


+----+---+-----+----+-------+-------+------------+
|hour|day|month|year|dis_lat|dis_lon|pickup_count|
+----+---+-----+----+-------+-------+------------+
|  20| 21|    1|2009|  40.71| -74.01|           2|
|   8| 30|    1|2009|  40.75| -73.98|           4|
|  15| 22|    1|2009|  40.79| -73.96|           2|
|  19|  8|    1|2009|  40.77| -73.97|           2|
|  14| 13|    1|2009|  40.75| -73.97|           2|
|  16| 17|    1|2009|  40.73| -73.99|           2|
|  14|  4|    1|2009|  40.73| -74.01|           1|
|  14|  1|    1|2009|  40.74| -73.99|           1|
|  10| 21|    1|2009|  40.78| -73.98|           1|
|  10|  6|    1|2009|  40.77| -73.95|           1|
|  12| 17|    1|2009|  40.78| -73.95|           3|
|  10| 26|    1|2009|  40.77| -73.96|           2|
|  11| 19|    1|2009|  40.74|  -74.0|           1|
|  21| 18|    1|2009|   40.8| -73.97|           1|
|  19| 23|    1|2009|  40.75| -73.98|           2|
|  19| 18|    1|2009|  40.72|  -74.0|           1|
|  10|  6|    1|2009|  40.77| -

845844

In [29]:
def extract_dropoffs(row):
    """Reduce a trip row to its drop-off grid cell and drop-off time parts.

    Reads ``Discretized_Lat``, ``Discretized_Lon`` and
    ``Trip_Dropoff_DateTime`` from ``row`` and returns a ``Row`` with the
    fields ``dis_lat``, ``dis_lon``, ``hour``, ``day``, ``month``, ``year``.
    """
    dropped_off_at = row.Trip_Dropoff_DateTime
    return Row(
        dis_lat=row.Discretized_Lat,
        dis_lon=row.Discretized_Lon,
        hour=dropped_off_at.hour,
        day=dropped_off_at.day,
        month=dropped_off_at.month,
        year=dropped_off_at.year,
    )

# Map every trip to its (grid cell, hour, day, month, year) key and count the
# drop-offs per key.
# The mapping goes through `.rdd` explicitly: `DataFrame.map` only exists in
# Spark 1.x, whereas `DataFrame.rdd.map` works on 1.x and later releases.
dropoff_df = discretized_dropoffs_df.rdd.map(extract_dropoffs).toDF()
grouped_dropoff_df = (
    dropoff_df
    .groupby('hour', 'day', 'month', 'year', 'dis_lat', 'dis_lon')
    .count()
    .withColumnRenamed("count", "dropoff_count")
)
grouped_dropoff_df.show()
grouped_dropoff_df.count()

+----+---+-----+----+-------+-------+-------------+
|hour|day|month|year|dis_lat|dis_lon|dropoff_count|
+----+---+-----+----+-------+-------+-------------+
|  23| 10|    1|2009|  40.78| -73.95|            4|
|   8| 30|    1|2009|  40.75| -73.98|            4|
|   0| 29|    1|2009|   40.7| -73.95|            1|
|  19| 16|    1|2009|  40.74| -73.99|            1|
|  14|  3|    1|2009|  40.72|  -74.0|            2|
|   0|  9|    1|2009|  40.73| -73.99|            1|
|   8| 29|    1|2009|  40.76| -73.97|            5|
|  19| 28|    1|2009|  40.75| -73.97|            1|
|   8| 20|    1|2009|   40.7| -74.01|            2|
|  11|  9|    1|2009|  40.75| -73.96|            1|
|  19| 17|    1|2009|  40.76| -73.98|            5|
|  18|  9|    1|2009|  40.74| -73.98|            2|
|  13| 25|    1|2009|  40.75| -73.99|            2|
|   7| 12|    1|2009|  40.76|  -74.0|            1|
|  16| 15|    1|2009|  40.76|  -74.0|            2|
|  16| 17|    1|2009|  40.73| -73.99|            1|
|   0| 10|  

877926

In [35]:
# Write both aggregates out as text, each to its own output path.
for _frame, _out_path in (
        (grouped_pickup_df, '../data/grouped_pickups_test_yellow_sample_001'),
        (grouped_dropoff_df, '../data/grouped_dropoffs_test_yellow_sample_001'),
):
    _frame.rdd.saveAsTextFile(_out_path)

## Regression

In [10]:
def extract_feature(row):
    """Reduce a trip row to the grid cell and time parts used as features.

    Reads ``Discretized_Lat``, ``Discretized_Lon`` and
    ``Trip_Pickup_DateTime`` from ``row`` and returns a ``Row`` with the
    fields ``dis_lat``, ``dis_lon``, ``hour``, ``day``, ``month``, ``year``.

    NOTE(review): the body matches ``extract_pickups`` field for field.
    """
    picked_up_at = row.Trip_Pickup_DateTime
    return Row(
        dis_lat=row.Discretized_Lat,
        dis_lon=row.Discretized_Lon,
        hour=picked_up_at.hour,
        day=picked_up_at.day,
        month=picked_up_at.month,
        year=picked_up_at.year,
    )

# Count pickups per (hour, day, month, year, grid cell); the count becomes the
# regression target.
# The mapping goes through `.rdd` explicitly: `DataFrame.map` only exists in
# Spark 1.x, whereas `DataFrame.rdd.map` works on 1.x and later releases.
feature_df = discretized_df.rdd.map(extract_feature).toDF()
grouped_feature_df = (
    feature_df
    .groupby('hour', 'day', 'month', 'year', 'dis_lat', 'dis_lon')
    .count()
    .withColumnRenamed("count", "pickup_count")
)

In [11]:
def create_point(row):
    """Turn one aggregated row into a ``LabeledPoint``.

    The label is ``row.pickup_count``. The feature vector is
    ``[dis_lat, dis_lon, hour, day, month, year, day_of_week]``, where
    ``day_of_week`` comes from ``datetime.date(...).weekday()``
    (Monday is 0, Sunday is 6).
    """
    features = [row.dis_lat, row.dis_lon]
    weekday = datetime.date(row.year, row.month, row.day).weekday()
    features += [row.hour, row.day, row.month, row.year, weekday]
    return LabeledPoint(row.pickup_count, features)

# Build the labeled points. The mapping goes through `.rdd` explicitly because
# `DataFrame.map` only exists in Spark 1.x.
points = grouped_feature_df.rdd.map(create_point)
points = points.cache()
# 70/30 train/test split. The fixed seed keeps the split (and therefore the
# reported errors) the same from one run of the notebook to the next.
(training_data, test_data) = points.randomSplit([0.7, 0.3], seed=42)

In [12]:
def print_evaluation(test_data, model):
    """Print the MSE and RMSE of ``model`` on ``test_data``.

    ``test_data`` is expected to hold items with ``features`` and ``label``
    attributes. ``model.predict`` is called on the features, the predictions
    are zipped with the labels, and the squared differences are averaged
    over ``test_data.count()``. Nothing is returned.
    """
    def squared_error(pair):
        diff = pair[0] - pair[1]
        return diff * diff

    features = test_data.map(lambda point: point.features)
    predicted = model.predict(features)
    actual = test_data.map(lambda point: point.label)
    total_squared_error = actual.zip(predicted).map(squared_error).sum()
    mse = total_squared_error / float(test_data.count())
    rmse = math.sqrt(mse)

    print('Test Mean Squared Error = ' + str(mse))
    print('Test Root Mean Squared Error = ' + str(rmse))

### Linear regression

In [13]:
# Fit a linear model with SGD (100 iterations, step size 1e-8) and print its
# error on the held-out test set.
lr_model = LinearRegressionWithSGD.train(
    training_data,
    iterations=100,
    step=1e-8,
)
print_evaluation(test_data, lr_model)

Test Mean Squared Error = 2.1456520209
Test Root Mean Squared Error = 1.46480443094


### Random Forest Regression

In [17]:
# Train a small random-forest regressor (3 trees, depth <= 4) and print its
# error on the held-out test set. The fixed seed makes the trained forest
# repeatable from one run to the next.
rf_model = RandomForest.trainRegressor(training_data, categoricalFeaturesInfo={},
                                       numTrees=3, featureSubsetStrategy="auto",
                                       impurity='variance', maxDepth=4, maxBins=32,
                                       seed=42)
# The model used to be stored under the misleading name `knn_model` (it is a
# random forest, not k-nearest neighbours); the old name is kept as an alias
# so any cell that still refers to it keeps working.
knn_model = rf_model
print_evaluation(test_data, rf_model)

NameError: name 'training_data' is not defined

### *TODO* K-Nearest Neighbors regression