# Spark Playground

## Preparation

In [1]:
import math
import datetime

import pyspark.sql.functions as sqlfunctions

from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import StringType, BooleanType
from pyspark.sql.functions import udf
from pyspark.mllib.tree import RandomForest
from pyspark.mllib.regression import LabeledPoint, LinearRegressionWithSGD

In [2]:
# Location of the CSV file that the "Read CSV" cell loads (relative path).
CSV_FILE_PATH = '../data/yellow_sample_001.csv'

### Configure Spark

In [3]:
# Local Spark using all available cores. getOrCreate reuses the already-running
# SparkContext when this cell is executed again, instead of failing because a
# second context cannot be started in the same kernel.
conf = SparkConf().setMaster("local[*]").setAppName('pyspark')
sc = SparkContext.getOrCreate(conf=conf)
sql_context = SQLContext(sc)

### Read CSV

In [4]:
# Read the CSV through the spark-csv data source, using the first line as the
# header and letting the reader infer column types.
csv_reader = (
    sql_context.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
)
df = csv_reader.load(CSV_FILE_PATH)

### Clean Data

In [5]:
# Keep only pickups whose start coordinates fall inside the bounding box
# lon in [-80, -70], lat in [40, 50] (bounds inclusive).
lon_in_range = (df.Start_Lon >= -80) & (df.Start_Lon <= -70)
lat_in_range = (df.Start_Lat >= 40) & (df.Start_Lat <= 50)
df = df.filter(lon_in_range & lat_in_range)

def is_float(value):
    """Return True when ``value`` can be converted with ``float()``.

    Args:
        value: Any object; typically a column cell handed over by a Spark UDF,
            which may be ``None`` for null cells.

    Returns:
        bool: True if ``float(value)`` succeeds, False otherwise.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: unparsable text such as "abc" or "".
        # TypeError: None (null cell) or another non-numeric object; without
        # catching it, a single null would make the whole UDF fail.
        return False
    return True
# Wrap is_float as a boolean Spark UDF and keep only rows where both start
# coordinates pass the check.
is_float_udf = udf(is_float, BooleanType())
lon_is_float = is_float_udf(df.Start_Lon)
lat_is_float = is_float_udf(df.Start_Lat)
df = df.filter(lon_is_float & lat_is_float)

### Discretize Coordinates

In [6]:
# Round both start coordinates to two decimal places, which groups pickups
# into grid cells, and cache the result because later cells reuse it.
discretized_df = (
    df
    .withColumn("Discretized_Lon", sqlfunctions.round(df.Start_Lon, 2))
    .withColumn("Discretized_Lat", sqlfunctions.round(df.Start_Lat, 2))
    .cache()
)

# Trailing expression: shows the row count as the cell output.
discretized_df.count()

1156329

## Regression

In [7]:
def extract_feature(row):
    """Reduce one trip row to the fields used for aggregation.

    Args:
        row: A row exposing ``Trip_Pickup_DateTime`` (an object with ``hour``,
            ``day``, ``month`` and ``year`` attributes), ``Discretized_Lat``
            and ``Discretized_Lon``.

    Returns:
        Row: Fields ``dis_lat``, ``dis_lon``, ``hour``, ``day``, ``month``
        and ``year``.
    """
    pickup = row.Trip_Pickup_DateTime
    fields = dict(
        dis_lat=row.Discretized_Lat,
        dis_lon=row.Discretized_Lon,
        hour=pickup.hour,
        day=pickup.day,
        month=pickup.month,
        year=pickup.year,
    )
    return Row(**fields)

# Map over the underlying RDD explicitly: DataFrame.map only exists in
# Spark 1.x, while DataFrame.rdd.map works on both 1.x and 2.x+.
feature_df = discretized_df.rdd.map(extract_feature).toDF()

# Count pickups per (hour, day, month, year, grid cell); the count becomes the
# regression target "pickup_count".
grouped_feature_df = (
    feature_df
    .groupby('hour', 'day', 'month', 'year', 'dis_lat', 'dis_lon')
    .count()
    .withColumnRenamed("count", "pickup_count")
)

In [8]:
def create_point(row):
    """Turn one aggregated row into a LabeledPoint for MLlib.

    Args:
        row: A row exposing ``dis_lat``, ``dis_lon``, ``hour``, ``day``,
            ``month``, ``year`` and ``pickup_count``.

    Returns:
        LabeledPoint: Label is ``pickup_count``; features are
        ``[lat, lon, hour, day, month, year, day_of_week]`` where
        ``day_of_week`` is ``date.weekday()`` (Monday == 0 ... Sunday == 6).
    """
    weekday = datetime.date(row.year, row.month, row.day).weekday()
    features = [
        row.dis_lat,
        row.dis_lon,
        row.hour,
        row.day,
        row.month,
        row.year,
        weekday,
    ]
    return LabeledPoint(row.pickup_count, features)

# Map over the underlying RDD explicitly: DataFrame.map only exists in
# Spark 1.x, while DataFrame.rdd.map works on both 1.x and 2.x+.
points = grouped_feature_df.rdd.map(create_point).cache()

# 70/30 train/test split with a fixed seed so the split (and therefore the
# reported error metrics) can be repeated across runs.
(training_data, test_data) = points.randomSplit([0.7, 0.3], seed=42)

In [9]:
def print_evaluation(test_data, model):
    """Print the MSE and RMSE of ``model`` on ``test_data``.

    Args:
        test_data: RDD-like collection (``map`` and ``count`` are used) whose
            elements expose ``features`` and ``label``.
        model: Object whose ``predict`` accepts the mapped features collection
            and returns a collection that can be zipped with the labels.

    Returns:
        None. The two metrics are written to standard output.
    """
    def squared_error(pair):
        # pair is (label, prediction)
        diff = pair[0] - pair[1]
        return diff * diff

    predicted = model.predict(test_data.map(lambda point: point.features))
    actual = test_data.map(lambda point: point.label)
    total_squared_error = actual.zip(predicted).map(squared_error).sum()

    mse = total_squared_error / float(test_data.count())
    rmse = math.sqrt(mse)

    for caption, value in (('Test Mean Squared Error = ', mse),
                           ('Test Root Mean Squared Error = ', rmse)):
        print(caption + str(value))

### Linear regression

In [10]:
# Linear regression trained with SGD: 100 iterations, step size 1e-8.
lr_model = LinearRegressionWithSGD.train(
    training_data,
    iterations=100,
    step=1e-8,
)
print_evaluation(test_data, lr_model)

Test Mean Squared Error = 2.14955164422
Test Root Mean Squared Error = 1.46613493384


### Random Forest Regression

In [11]:
# Random forest regressor: 3 trees, depth <= 4, all features treated as
# continuous (empty categoricalFeaturesInfo). A fixed seed makes the trained
# forest, and hence the printed metrics, repeatable.
rf_model = RandomForest.trainRegressor(
    training_data,
    categoricalFeaturesInfo={},
    numTrees=3,
    featureSubsetStrategy="auto",
    impurity='variance',
    maxDepth=4,
    maxBins=32,
    seed=42,
)
# The model is a random forest, not k-nearest neighbours; the original name
# is kept as an alias for any code still referring to it.
knn_model = rf_model
print_evaluation(test_data, rf_model)

Test Mean Squared Error = 0.487229182888
Test Root Mean Squared Error = 0.698018039085


### *TODO* K-Nearest Neighbors regression