In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession 
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator 

from pyspark.ml.feature import VectorAssembler

In [2]:
spark = SparkSession.builder.appName('taxis_fare').getOrCreate()

In [3]:
from pyspark.ml.regression import LinearRegression
from pyspark.sql.types import FloatType
import pandas as pd

In [4]:
# Load training data
df = spark.read.format("csv").option("header", "true").load("data/train_2.csv")

In [5]:
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

In [6]:
df.dtypes

[('key', 'string'),
 ('fare_amount', 'string'),
 ('pickup_datetime', 'string'),
 ('pickup_longitude', 'string'),
 ('pickup_latitude', 'string'),
 ('dropoff_longitude', 'string'),
 ('dropoff_latitude', 'string'),
 ('passenger_count', 'string')]

In [7]:
df = df.select(df['fare_amount'].cast("float").alias('fare_amount'),
               df['pickup_longitude'].cast("float").alias('pickup_longitude'),
               df['pickup_latitude'].cast("float").alias('pickup_latitude'),
               df['dropoff_longitude'].cast("float").alias('dropoff_longitude'),
               df['dropoff_latitude'].cast("float").alias('dropoff_latitude'),
               df['passenger_count'].cast("float").alias('passenger_count'),)


In [8]:
df = df.selectExpr("fare_amount as label",'pickup_longitude','pickup_latitude',
                  'dropoff_longitude','dropoff_latitude','passenger_count')

In [9]:
df.show()

+-----+----------------+---------------+-----------------+----------------+---------------+
|label|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|
+-----+----------------+---------------+-----------------+----------------+---------------+
|  4.5|      -73.844315|      40.721317|        -73.84161|       40.712276|            1.0|
| 16.9|      -74.016045|      40.711304|        -73.97927|       40.782005|            1.0|
|  5.7|      -73.982735|       40.76127|        -73.99124|        40.75056|            2.0|
|  7.7|       -73.98713|      40.733143|        -73.99157|        40.75809|            1.0|
|  5.3|      -73.968094|       40.76801|        -73.95666|       40.783764|            1.0|
| 12.1|       -74.00096|       40.73163|        -73.97289|       40.758232|            1.0|
|  7.5|          -73.98|      40.751663|         -73.9738|       40.764843|            1.0|
| 16.5|        -73.9513|       40.77414|         -73.9901|        40.75105|     

In [11]:
#colum features
vecAssembler = VectorAssembler(inputCols=["pickup_longitude", "pickup_latitude",
                                          "dropoff_longitude", "dropoff_latitude",
                                          "passenger_count"], outputCol="features")
new_df = vecAssembler.transform(df)
new_df.count()

1048575

In [12]:
#Delete null rows
new_df = vecAssembler.setHandleInvalid("skip").transform(df)
new_df.show()

+-----+----------------+---------------+-----------------+----------------+---------------+--------------------+
|label|pickup_longitude|pickup_latitude|dropoff_longitude|dropoff_latitude|passenger_count|            features|
+-----+----------------+---------------+-----------------+----------------+---------------+--------------------+
|  4.5|      -73.844315|      40.721317|        -73.84161|       40.712276|            1.0|[-73.844314575195...|
| 16.9|      -74.016045|      40.711304|        -73.97927|       40.782005|            1.0|[-74.016044616699...|
|  5.7|      -73.982735|       40.76127|        -73.99124|        40.75056|            2.0|[-73.982734680175...|
|  7.7|       -73.98713|      40.733143|        -73.99157|        40.75809|            1.0|[-73.987129211425...|
|  5.3|      -73.968094|       40.76801|        -73.95666|       40.783764|            1.0|[-73.968093872070...|
| 12.1|       -74.00096|       40.73163|        -73.97289|       40.758232|            1.0|[-74.

In [13]:
new_df.count()

1048565

In [14]:
# Fit the model
lrModel = lr.fit(new_df.select('label','features'))

In [15]:
# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

Coefficients: [0.0,0.0,0.0,0.0,0.0]
Intercept: 11.345360319738068


In [16]:
#Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredErr)

numIterations: 1
objectiveHistory: [0.49999999999999967]
+-------------------+
|          residuals|
+-------------------+
| -6.845360319738068|
|  5.554639298792205|
| -5.645360510472932|
|-3.6453605104729316|
| -6.045360129003205|
| 0.7546400617316582|
|-3.8453603197380684|
|  5.154639680261932|
|-2.3453603197380684|
| -2.445360701207795|
| -6.045360129003205|
| -5.845360319738068|
|   -7.2453604151055|
| -4.345360319738068|
|-3.6453605104729316|
| -6.345360319738068|
| 1.1546396802619316|
| -6.045360129003205|
| -6.045360129003205|
| -7.345360319738068|
+-------------------+
only showing top 20 rows



AttributeError: 'LinearRegressionTrainingSummary' object has no attribute 'rootMeanSquaredErr'

In [None]:
spark.stop()