In [None]:
# (1) Import the required Python dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# (2) Read the daily Bike Sharing CSV (data/bike-sharing-data/day.csv) into a Pandas DataFrame
#     and preview its first rows
bike_sharing_raw_df = pd.read_csv('./data/bike-sharing-data/day.csv', sep=',')
bike_sharing_raw_df.head()

In [None]:
# (3.1) Raw Data Exploration - scatter of the 'temp' column (normalized temperature, celsius)
#       on the x-axis against the 'cnt' column (count of total rental bikes) on the y-axis
bike_sharing_raw_df.plot(kind='scatter', x='temp', y='cnt')

In [None]:
# (3.2) Raw Data Exploration - scatter of the 'atemp' column (normalized feeling temperature, celsius)
#       on the x-axis against the 'cnt' column (count of total rental bikes) on the y-axis
bike_sharing_raw_df.plot(kind='scatter', x='atemp', y='cnt')

In [None]:
# (3.3) Raw Data Exploration - scatter of the 'hum' column (normalized humidity)
#       on the x-axis against the 'cnt' column (count of total rental bikes) on the y-axis
bike_sharing_raw_df.plot(kind='scatter', x='hum', y='cnt')

In [None]:
# (3.4) Raw Data Exploration - scatter of the 'windspeed' column (normalized wind speed)
#       on the x-axis against the 'cnt' column (count of total rental bikes) on the y-axis
bike_sharing_raw_df.plot(kind='scatter', x='windspeed', y='cnt')

In [None]:
# (4) Instantiate a Spark Context (or reuse the one already active in this kernel)
# Calling SparkContext(conf=conf) directly raises a ValueError when this cell is executed
# again while a context is still running. getOrCreate returns the active context instead,
# so the cell can safely be re-run. Note that when an existing context is reused, the
# master / app name configured below are not re-applied to it.
conf = SparkConf().setMaster("local").setAppName("Univariate Linear Regression - Bike Sharing")
sc = SparkContext.getOrCreate(conf=conf)
# SQL entry point used below to read the CSV into a Spark DataFrame
sqlContext = SQLContext(sc)

In [None]:
# (5) Load the Bike Sharing dataset into a Spark DataFrame
# The reader is configured with header = 'true' and inferschema = 'true', i.e. the first
# CSV line supplies the column names and column types are inferred from the data.
bike_sharing_df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header = 'true', inferschema = 'true')
    .load('./data/bike-sharing-data/day.csv')
)
# A notebook cell only renders its LAST expression, so a bare head(10) placed before
# printSchema() was evaluated and silently discarded. show(10) prints the preview explicitly.
bike_sharing_df.show(10)
bike_sharing_df.printSchema()

In [None]:
# (6) Compute descriptive statistics for the raw Bike Sharing Spark DataFrame, convert the
#     result to Pandas and transpose it for display
bike_sharing_stats_pdf = bike_sharing_df.describe().toPandas()
bike_sharing_stats_pdf.T

In [None]:
# (7) Explore a Spark DataFrame - convert it to Pandas, then scatter 'atemp' against 'cnt'
bike_sharing_pdf = bike_sharing_df.toPandas()
bike_sharing_pdf.plot(kind='scatter', x='atemp', y='cnt')

In [None]:
# (8) Build the input feature vectors from the raw Spark DataFrame
# A single column ('atemp') is assembled into the 'features' vector column; only that vector
# and the label column ('cnt') are kept.
univariate_feature_column = 'atemp'
univariate_label_column = 'cnt'
vector_assembler = VectorAssembler(outputCol='features', inputCols=[univariate_feature_column])
assembled_df = vector_assembler.transform(bike_sharing_df)
bike_sharing_features_df = assembled_df.select('features', univariate_label_column)
bike_sharing_features_df.head(10)

In [None]:
# (9) Randomly split the feature DataFrame into Training and Test DataFrames using
#     weights 0.75 / 0.25 and a fixed seed, then display the row count of each part
split_weights = [0.75, 0.25]
train_df, test_df = bike_sharing_features_df.randomSplit(split_weights, seed=12345)
(train_df.count(), test_df.count())

In [None]:
# (10) Fit a Linear Regression estimator on the Training DataFrame
#      ('features' is the input vector column, the label column is 'cnt' as configured above)
linear_regression = LinearRegression(labelCol=univariate_label_column, featuresCol='features')
linear_regression_model = linear_regression.fit(train_df)

In [None]:
# (11) Report the fitted parameters and the training summary statistics of the model
training_summary = linear_regression_model.summary
print("Model Coefficients: {!s}".format(linear_regression_model.coefficients))
print("Intercept: {!s}".format(linear_regression_model.intercept))
print("RMSE: {:f}".format(training_summary.rootMeanSquaredError))
print("R-SQUARED: {:f}".format(training_summary.r2))

# Descriptive statistics of the data the model was trained on
print("TRAINING DATASET DESCRIPTIVE SUMMARY: ")
train_df.describe().show()

# Residuals reported by the training summary
print("TRAINING DATASET RESIDUALS: ")
training_summary.residuals.show()

In [None]:
# (12) Run the trained model over the Test DataFrame and print the first 10 predictions
#      next to the actual label and the input features
test_linear_regression_predictions_df = linear_regression_model.transform(test_df)
print("TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: ")
preview_columns = ["prediction", univariate_label_column, "features"]
test_linear_regression_predictions_df.select(preview_columns).show(10)

In [None]:
# (13) Score the Test DataFrame predictions with two Regression Evaluators (RMSE and R-squared)
# Both evaluators compare the same prediction / label columns and differ only in the metric.
evaluator_columns = dict(predictionCol="prediction", labelCol=univariate_label_column)
linear_regression_evaluator_rmse = RegressionEvaluator(metricName="rmse", **evaluator_columns)
linear_regression_evaluator_r2 = RegressionEvaluator(metricName="r2", **evaluator_columns)

evaluator_rmse_value = linear_regression_evaluator_rmse.evaluate(test_linear_regression_predictions_df)
print("RMSE on Test Data = %g" % evaluator_rmse_value)
evaluator_r2_value = linear_regression_evaluator_r2.evaluate(test_linear_regression_predictions_df)
print("R-SQUARED on Test Data = %g" % evaluator_r2_value)

In [None]:
# (14) Alternative to the evaluators: let the model evaluate the Test DataFrame itself and
#      read RMSE / R-squared from the returned summary
test_summary = linear_regression_model.evaluate(test_df)
summary_rmse_value = test_summary.rootMeanSquaredError
summary_r2_value = test_summary.r2
print("RMSE on Test Data = %g" % summary_rmse_value)
print("R-SQUARED on Test Data = %g" % summary_r2_value)

In [None]:
# (15) Stop the Spark Context
# After this call the objects created from `sc` above (sqlContext and the Spark DataFrames)
# can no longer be used; re-run the notebook from step (4) to continue working.
sc.stop()