In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("bike-sharing-demand").getOrCreate()

In [2]:
trainDF = spark.read.csv("./data/train.csv", header=True, inferSchema=True)
testDF = spark.read.csv("./data/test.csv", header=True, inferSchema=True)

In [3]:
trainDF.printSchema()

root
 |-- datetime: string (nullable = true)
 |-- season: integer (nullable = true)
 |-- holiday: integer (nullable = true)
 |-- workingday: integer (nullable = true)
 |-- weather: integer (nullable = true)
 |-- temp: double (nullable = true)
 |-- atemp: double (nullable = true)
 |-- humidity: integer (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- casual: integer (nullable = true)
 |-- registered: integer (nullable = true)
 |-- count: integer (nullable = true)



In [4]:
trainDF.count()

10886

In [5]:
trainDF = trainDF.na.drop()

In [6]:
trainDF.count()

10886

In [7]:
trainDF.show()

+-------------------+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+
|           datetime|season|holiday|workingday|weather| temp| atemp|humidity|windspeed|casual|registered|count|
+-------------------+------+-------+----------+-------+-----+------+--------+---------+------+----------+-----+
|2011-01-01 00:00:00|     1|      0|         0|      1| 9.84|14.395|      81|      0.0|     3|        13|   16|
|2011-01-01 01:00:00|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     8|        32|   40|
|2011-01-01 02:00:00|     1|      0|         0|      1| 9.02|13.635|      80|      0.0|     5|        27|   32|
|2011-01-01 03:00:00|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     3|        10|   13|
|2011-01-01 04:00:00|     1|      0|         0|      1| 9.84|14.395|      75|      0.0|     0|         1|    1|
|2011-01-01 05:00:00|     1|      0|         0|      2| 9.84| 12.88|      75|   6.0032|     0|         1

In [8]:
from pyspark.sql.functions import when, lit
trainDF = trainDF.withColumn('season_1', when(trainDF['season']==1, lit(1)).otherwise(lit(0))) \
                .withColumn('season_2', when(trainDF['season']==2, lit(1)).otherwise(lit(0))) \
                .withColumn('season_3', when(trainDF['season']==3, lit(1)).otherwise(lit(0))) \
                .withColumn('season_4', when(trainDF['season']==4, lit(1)).otherwise(lit(0))).drop(trainDF['season'])

In [9]:
trainDF.columns

['datetime',
 'holiday',
 'workingday',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed',
 'casual',
 'registered',
 'count',
 'season_1',
 'season_2',
 'season_3',
 'season_4']

In [10]:
trainDF = trainDF.withColumn('weather_1', when(trainDF['weather']==1, lit(1)).otherwise(lit(0))) \
                .withColumn('weather_2', when(trainDF['weather']==2, lit(1)).otherwise(lit(0))) \
                .withColumn('weather_3', when(trainDF['weather']==3, lit(1)).otherwise(lit(0))) \
                .withColumn('weather_4', when(trainDF['weather']==4, lit(1)).otherwise(lit(0))).drop(trainDF['weather'])

In [11]:
trainDF.columns

['datetime',
 'holiday',
 'workingday',
 'temp',
 'atemp',
 'humidity',
 'windspeed',
 'casual',
 'registered',
 'count',
 'season_1',
 'season_2',
 'season_3',
 'season_4',
 'weather_1',
 'weather_2',
 'weather_3',
 'weather_4']

In [12]:
from pyspark.sql.functions import split

trainDF = trainDF.withColumn('year', split(split(trainDF['datetime'], ' ')[0], '-')[0].cast('int'))
trainDF = trainDF.withColumn('month', split(split(trainDF['datetime'], ' ')[0], '-')[1].cast('int'))
trainDF = trainDF.withColumn('day', split(split(trainDF['datetime'], ' ')[0], '-')[2].cast('int'))
trainDF = trainDF.withColumn('hour', split(split(trainDF['datetime'], ' ')[1], ':')[0].cast('int'))

In [13]:
trainDF.select('year', 'month', 'day', 'hour').show()
trainDF = trainDF.drop('datetime')

+----+-----+---+----+
|year|month|day|hour|
+----+-----+---+----+
|2011|    1|  1|   0|
|2011|    1|  1|   1|
|2011|    1|  1|   2|
|2011|    1|  1|   3|
|2011|    1|  1|   4|
|2011|    1|  1|   5|
|2011|    1|  1|   6|
|2011|    1|  1|   7|
|2011|    1|  1|   8|
|2011|    1|  1|   9|
|2011|    1|  1|  10|
|2011|    1|  1|  11|
|2011|    1|  1|  12|
|2011|    1|  1|  13|
|2011|    1|  1|  14|
|2011|    1|  1|  15|
|2011|    1|  1|  16|
|2011|    1|  1|  17|
|2011|    1|  1|  18|
|2011|    1|  1|  19|
+----+-----+---+----+
only showing top 20 rows



In [14]:
trainDF.groupBy('month').sum('count').show()

+-----+----------+
|month|sum(count)|
+-----+----------+
|   12|    160160|
|    1|     79884|
|    6|    220733|
|    3|    133501|
|    5|    200147|
|    9|    212529|
|    4|    167402|
|    8|    213516|
|    7|    214617|
|   10|    207434|
|   11|    176440|
|    2|     99113|
+-----+----------+



In [15]:
## gathering all the features into one array using VectorAssembler
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['holiday', \
                                         'workingday', \
                                         'temp', \
                                         'atemp', \
                                         'humidity', \
                                         'windspeed', \
                                         'casual', \
                                         'registered', \
                                         'season_1', \
                                         'season_2', \
                                         'season_3', \
                                         'season_4', \
                                         'weather_1', \
                                         'weather_2', \
                                         'weather_3', \
                                         'weather_4', \
                                         'year', \
                                         'month', \
                                         'day', \
                                         'hour'], outputCol="features")

In [16]:
output = assembler.transform(trainDF)
output.show()

+-------+----------+-----+------+--------+---------+------+----------+-----+--------+--------+--------+--------+---------+---------+---------+---------+----+-----+---+----+--------------------+
|holiday|workingday| temp| atemp|humidity|windspeed|casual|registered|count|season_1|season_2|season_3|season_4|weather_1|weather_2|weather_3|weather_4|year|month|day|hour|            features|
+-------+----------+-----+------+--------+---------+------+----------+-----+--------+--------+--------+--------+---------+---------+---------+---------+----+-----+---+----+--------------------+
|      0|         0| 9.84|14.395|      81|      0.0|     3|        13|   16|       1|       0|       0|       0|        1|        0|        0|        0|2011|    1|  1|   0|(20,[2,3,4,6,7,8,...|
|      0|         0| 9.02|13.635|      80|      0.0|     8|        32|   40|       1|       0|       0|       0|        1|        0|        0|        0|2011|    1|  1|   1|(20,[2,3,4,6,7,8,...|
|      0|         0| 9.02|13.6

In [17]:
final_df = output.select('features', 'count')

In [18]:
final_df.show()

+--------------------+-----+
|            features|count|
+--------------------+-----+
|(20,[2,3,4,6,7,8,...|   16|
|(20,[2,3,4,6,7,8,...|   40|
|(20,[2,3,4,6,7,8,...|   32|
|(20,[2,3,4,6,7,8,...|   13|
|(20,[2,3,4,7,8,12...|    1|
|(20,[2,3,4,5,7,8,...|    1|
|(20,[2,3,4,6,8,12...|    2|
|(20,[2,3,4,6,7,8,...|    3|
|(20,[2,3,4,6,7,8,...|    8|
|(20,[2,3,4,6,7,8,...|   14|
|(20,[2,3,4,5,6,7,...|   36|
|(20,[2,3,4,5,6,7,...|   56|
|(20,[2,3,4,5,6,7,...|   84|
|(20,[2,3,4,5,6,7,...|   94|
|(20,[2,3,4,5,6,7,...|  106|
|(20,[2,3,4,5,6,7,...|  110|
|(20,[2,3,4,5,6,7,...|   93|
|(20,[2,3,4,5,6,7,...|   67|
|(20,[2,3,4,5,6,7,...|   35|
|(20,[2,3,4,5,6,7,...|   37|
+--------------------+-----+
only showing top 20 rows



In [19]:
from pyspark.ml.regression import LinearRegression
train_data, test_data = final_df.randomSplit([0.80, 0.20])
regressor = LinearRegression(featuresCol='features', labelCol='count')
regressor = regressor.fit(train_data)

In [20]:
regressor.coefficients

DenseVector([0.0, -0.0, 0.0, -0.0, -0.0, -0.0, 1.0, 1.0, 0.0002, 0.0001, -0.0001, -0.0002, -0.0, 0.0, 0.0, 0.0001, -0.0, 0.0, -0.0, -0.0])

In [21]:
regressor.intercept

0.006838714531440304

In [22]:
pred_results = regressor.evaluate(test_data)

In [32]:
pred_results.predictions.show()

+--------------------+-----+------------------+
|            features|count|        prediction|
+--------------------+-----+------------------+
|(20,[0,2,3,4,5,6,...|  159|158.99998931870277|
|(20,[0,2,3,4,5,6,...|   97| 97.00009236053812|
|(20,[0,2,3,4,5,6,...|   51| 50.99997458118856|
|(20,[0,2,3,4,5,7,...|   13|13.000037975123519|
|(20,[0,2,3,4,6,7,...|   68| 68.00001546121321|
|(20,[0,2,3,4,6,7,...|    4| 4.000077397148832|
|(20,[0,2,3,4,6,7,...|   19|18.999993386642714|
|(20,[0,2,3,4,6,7,...|  408| 407.9999176762049|
|(20,[0,2,3,4,6,7,...|  104|104.00007347954352|
|(20,[0,2,3,4,6,7,...|   21| 20.99997845013404|
|(20,[0,2,3,4,6,7,...|   34| 33.99999093738806|
|(20,[0,2,3,4,6,7,...|  365|364.99998513124837|
|(20,[1,2,3,4,5,6,...|   14|14.000009320582507|
|(20,[1,2,3,4,5,6,...|    7| 6.999948807338305|
|(20,[1,2,3,4,5,6,...|   25| 24.99994971844662|
|(20,[1,2,3,4,5,6,...|   23| 23.00001765689744|
|(20,[1,2,3,4,5,6,...|   23|23.000007876221044|
|(20,[1,2,3,4,5,6,...|   17|17.000028042

In [24]:
## Root Mean Square Error and Mean Absolute Error
pred_results.meanAbsoluteError

3.8017385166379496e-05

In [25]:
pred_results.meanSquaredError

2.1049854215815446e-09