In [1]:
# Section must be included at the beginning of each new notebook. Remember to change the app name.
# The Spark installation directory is read from the SPARK_HOME environment variable when it is
# set and non-empty; otherwise the default below is used. If you're using VirtualBox, either
# export SPARK_HOME or change the default to '/home/user/spark-2.1.1-bin-hadoop2.7'.
import os
import findspark
DEFAULT_SPARK_HOME = '/home/ubuntu/spark-2.1.1-bin-hadoop2.7'
# findspark.init() has to run before the pyspark imports below so that pyspark is importable.
findspark.init(os.environ.get('SPARK_HOME') or DEFAULT_SPARK_HOME)
import pyspark
from pyspark.sql import SparkSession
# getOrCreate() reuses an already running session if there is one.
spark = SparkSession.builder.appName('demo').getOrCreate()

In [2]:
# Load machine learning library
from pyspark.ml.regression import LinearRegression

In [3]:
# Read the CSV with its header row as column names and let Spark infer the column types.
df = spark.read.csv(
    "data.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Print the first 10 rows of the loaded frame as a quick sanity check.
df.show(10)

+--------+----------------+--------------------+----+---------+-----+------+-------+-------------+-------------+----------+--------+--------+
|Visitors| UniquePageviews|                Date|Time|DayOfWeek|Month|Season|Weekend|SchoolHoliday|PublicHoliday|Cruiseship|EventInt|EventExt|
+--------+----------------+--------------------+----+---------+-----+------+-------+-------------+-------------+----------+--------+--------+
|    4364|           910.0|2016-03-24 00:00:...|   1|      Thu|  Mar|Autumn|      0|            0|            0|         0|       0|       0|
|    8116|970.571428571429|2016-03-25 00:00:...|   2|      Fri|  Mar|Autumn|      0|            0|            1|         0|       0|       0|
|    9268|972.714285714286|2016-03-26 00:00:...|   3|      Sat|  Mar|Autumn|      1|            0|            0|         0|       0|       0|
|    8360|1009.28571428571|2016-03-27 00:00:...|   4|      Sun|  Mar|Autumn|      1|            0|            0|         0|       0|       0|
|    6

In [5]:
# Print summary statistics (count, mean, ...) for the columns of the loaded frame.
df.describe().show()

+-------+------------------+------------------+-----------------+---------+-----+------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+
|summary|          Visitors|   UniquePageviews|             Time|DayOfWeek|Month|Season|            Weekend|      SchoolHoliday|       PublicHoliday|         Cruiseship|            EventInt|            EventExt|
+-------+------------------+------------------+-----------------+---------+-----+------+-------------------+-------------------+--------------------+-------------------+--------------------+--------------------+
|  count|               741|               741|              741|      741|  741|   741|                741|                741|                 741|                741|                 741|                 741|
|   mean| 4335.429149797571| 959.9820705610164|            371.0|     null| null|  null|0.28609986504723345|  0.252361673414305|0.033738191632928474|0.1

In [6]:
# Print the column names and the types produced by inferSchema=True when the CSV was read.
df.printSchema()

root
 |-- Visitors: integer (nullable = true)
 |-- UniquePageviews: double (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Time: integer (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Weekend: integer (nullable = true)
 |-- SchoolHoliday: integer (nullable = true)
 |-- PublicHoliday: integer (nullable = true)
 |-- Cruiseship: integer (nullable = true)
 |-- EventInt: integer (nullable = true)
 |-- EventExt: integer (nullable = true)



In [8]:
from pyspark.ml.feature import VectorAssembler

# Predictor columns that are packed into the single 'features' vector column.
feature_columns = [
    'UniquePageviews',
    'Weekend',
    'SchoolHoliday',
    'PublicHoliday',
    'Cruiseship',
    'EventInt',
    'EventExt',
]
vector_assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)

# transform() returns a new frame with the 'features' column appended; df itself is unchanged.
vector_output = vector_assembler.transform(df)
vector_output.printSchema()
vector_output.head(1)

root
 |-- Visitors: integer (nullable = true)
 |-- UniquePageviews: double (nullable = true)
 |-- Date: timestamp (nullable = true)
 |-- Time: integer (nullable = true)
 |-- DayOfWeek: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Season: string (nullable = true)
 |-- Weekend: integer (nullable = true)
 |-- SchoolHoliday: integer (nullable = true)
 |-- PublicHoliday: integer (nullable = true)
 |-- Cruiseship: integer (nullable = true)
 |-- EventInt: integer (nullable = true)
 |-- EventExt: integer (nullable = true)
 |-- features: vector (nullable = true)



[Row(Visitors=4364, UniquePageviews=910.0, Date=datetime.datetime(2016, 3, 24, 0, 0), Time=1, DayOfWeek='Thu', Month='Mar', Season='Autumn', Weekend=0, SchoolHoliday=0, PublicHoliday=0, Cruiseship=0, EventInt=0, EventExt=0, features=SparseVector(7, {0: 910.0}))]

In [10]:
# Keep only the assembled feature vector and the label column used for the regression.
vector_output = vector_output.select('features', 'Visitors')
print(vector_output.head(1))
vector_output.show(n=3)

[Row(features=SparseVector(7, {0: 910.0}), Visitors=4364)]
+--------------------+--------+
|            features|Visitors|
+--------------------+--------+
|     (7,[0],[910.0])|    4364|
|(7,[0,3],[970.571...|    8116|
|(7,[0,1],[972.714...|    9268|
+--------------------+--------+
only showing top 3 rows



In [11]:
# Randomly split the rows into roughly 70% training and 30% test data. The fixed seed keeps
# the split, and every metric computed from it, repeatable across kernel restarts; without
# it each run produces different train/test sets and different scores.
train_data,test_data = vector_output.randomSplit([0.7,0.3], seed=42)
# Compare the label distribution of both splits.
train_data.describe().show()
test_data.describe().show()

+-------+------------------+
|summary|          Visitors|
+-------+------------------+
|  count|               505|
|   mean| 4326.487128712872|
| stddev|1605.5404372260846|
|    min|              1742|
|    max|             10488|
+-------+------------------+

+-------+------------------+
|summary|          Visitors|
+-------+------------------+
|  count|               236|
|   mean| 4354.563559322034|
| stddev|1681.8945624941175|
|    min|              1793|
|    max|              9786|
+-------+------------------+



In [12]:
# Fit a linear regression that predicts 'Visitors' from the assembled 'features' vector.
lr = LinearRegression(labelCol='Visitors', featuresCol='features')
lr_model = lr.fit(train_data)

# Report the fitted parameters; the intercept line ends with an extra blank line.
coefficients_text = str(lr_model.coefficients)
intercept_text = str(lr_model.intercept)
print("Coefficients: " + coefficients_text)
print("Intercept: " + intercept_text + "\n")

# Metrics computed on the training data itself.
training_summary = lr_model.summary
rmse_text = str(training_summary.rootMeanSquaredError)
r2_text = str(training_summary.r2)
print("RMSE: " + rmse_text)
print("R2: " + r2_text)

Coefficients: [1.5508237688515891,1678.7956823043933,948.2250467503707,1611.580140243226,1016.3187930137825,365.3362676879877,271.07443412730163]
Intercept: 1857.5846109211857

RMSE: 940.6778778763676
R2: 0.6560461248673443


In [13]:
# Print summary statistics of the training split again (same call as right after the split),
# so the label's spread is visible next to the RMSE reported above.
train_data.describe().show()

+-------+------------------+
|summary|          Visitors|
+-------+------------------+
|  count|               505|
|   mean| 4326.487128712872|
| stddev|1605.5404372260846|
|    min|              1742|
|    max|             10488|
+-------+------------------+



In [14]:
# Score the fitted model on the held-out test split and report the same metrics as for training.
test_results = lr_model.evaluate(test_data)
test_rmse = test_results.rootMeanSquaredError
test_r2 = test_results.r2
print("RMSE on test data: " + str(test_rmse))
print("R2 on test data: " + str(test_r2))

RMSE on test data: 949.2495001522431
R2 on test data: 0.6801050663948449
