In [None]:
# (1) Import the required Python dependencies
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# (2) Instantiate a Spark Context
# Runs Spark in local mode under a descriptive application name.
conf = SparkConf().setMaster("local").setAppName("Multivariate Linear Regression - Bike Sharing")
# getOrCreate() returns the already-running context when this cell is re-executed
# in a live kernel, instead of failing because only one SparkContext may be active
# per process. Note: when a context already exists, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)
# SQL entry point used below to read the CSV file into a DataFrame.
sqlContext = SQLContext(sc)

In [None]:
# (3) Load the Bike Sharing dataset into a Spark DataFrame
# The reader is configured step by step: CSV data source, first row used as the
# header, column types inferred from the data, then the daily-aggregated file is loaded.
bike_sharing_df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load('./data/bike-sharing-data/day.csv')
)
# Preview the first five rows to sanity-check the load.
bike_sharing_df.show(5)

In [None]:
# (4) Calculate the level of Correlation between the relevant Independent Variables and the Dependent Variable
independent_variables = ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed']
dependent_variable = ['cnt']
# Keep only the candidate predictors plus the label; later cells read from this narrowed DataFrame.
bike_sharing_df = bike_sharing_df.select(independent_variables + dependent_variable)
label_column = dependent_variable[0]
# Iterate over the predictors only: looping over every selected column would also
# report the label's correlation with itself, which carries no information.
for column_name in independent_variables:
    print("Correlation to CNT for ", column_name, bike_sharing_df.stat.corr(label_column, column_name))

In [None]:
# (5) Generate Input Feature Vectors from the Raw Spark DataFrame
# Predictor columns that are packed into the model's feature vector, and the label column.
multivariate_feature_columns = ['season', 'yr', 'mnth', 'temp', 'atemp']
multivariate_label_column = 'cnt'
# VectorAssembler combines the listed columns into a single vector column named 'features'.
vector_assembler = VectorAssembler(
    inputCols=multivariate_feature_columns,
    outputCol='features',
)
# Keep just the assembled vector and the label for modelling.
bike_sharing_features_df = (
    vector_assembler
    .transform(bike_sharing_df)
    .select('features', multivariate_label_column)
)
# Show the first ten assembled rows as the cell output.
bike_sharing_features_df.head(10)

In [None]:
# (6) Split the Raw DataFrame into a Training DataFrame and a Test DataFrame
# 75% / 25% random split; the fixed seed is supplied so the split can be repeated.
train_df, test_df = bike_sharing_features_df.randomSplit(
    weights=[0.75, 0.25],
    seed=12345,
)
# Display the resulting row counts as (training, test).
(train_df.count(), test_df.count())

In [None]:
# (7) Train a Multivariate Linear Regression Model on the Training DataFrame
# Configure the estimator with the assembled feature vector and the label column.
linear_regression = LinearRegression(
    featuresCol='features',
    labelCol=multivariate_label_column,
)
# Fitting on the training split yields the trained model used by the following cells.
linear_regression_model = linear_regression.fit(train_df)

In [None]:
# (8) Output Multivariate Linear Regression Model Summary Statistics to evaluate the Training Model

# Fitted parameters: one coefficient per assembled feature, plus the intercept.
print("Model Coefficients: " + str(linear_regression_model.coefficients))
print("Intercept: " + str(linear_regression_model.intercept))

# Goodness-of-fit metrics taken from the summary attached to the fitted model.
training_summary = linear_regression_model.summary
print("RMSE: %f" % (training_summary.rootMeanSquaredError,))
print("R-SQUARED: %f" % (training_summary.r2,))

# Descriptive statistics of the training split, for context when reading the RMSE.
print("TRAINING DATASET DESCRIPTIVE SUMMARY: ")
train_df.describe().show()

# Residuals reported by the training summary.
print("TRAINING DATASET RESIDUALS: ")
training_summary.residuals.show()

In [None]:
# (9) Apply the Trained Multivariate Linear Regression Model to the Test DataFrame to make predictions
# transform() returns the test rows with the model's 'prediction' column added.
test_linear_regression_predictions_df = linear_regression_model.transform(test_df)
print("TEST DATASET PREDICTIONS AGAINST ACTUAL LABEL: ")
# Show predicted vs. actual label side by side for the first ten test rows.
(
    test_linear_regression_predictions_df
    .select("prediction", multivariate_label_column, "features")
    .show(10)
)

In [None]:
# (10) Evaluate the performance of our Linear Regression Model on the Test DataFrame
# evaluate() produces a summary of the fitted model's metrics on the held-out test split.
test_summary = linear_regression_model.evaluate(test_df)
print("RMSE on Test Data = %g" % (test_summary.rootMeanSquaredError,))
print("R-SQUARED on Test Data = %g" % (test_summary.r2,))

In [None]:
# (11) Stop the Spark Context
# Shuts down the SparkContext created in step (2). Keep this as the final cell:
# the DataFrames and model built above rely on that context.
sc.stop()