In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName('myproj')
    .getOrCreate()
)

# Read the bike-share CSV: the first row supplies column names and column
# types are inferred from the data rather than defaulting to strings.
data = spark.read.csv(
    '/FileStore/tables/Bike_Share.csv',
    header=True,
    inferSchema=True,
)
data.printSchema()

In [2]:
# Summary statistics of the `cnt` column.
data.select('cnt').describe().show()

In [3]:
# Drop columns that add no independent information; the author's rationale
# for each group is noted inline.
data = data.drop(
    "casual", "registered",  # cnt = casual + registered
    "dteday", "season",      # covered by yr, mnth, and weekday
    "holiday", "weekday",    # highly correlated with workingday
    "instant",               # row index column
    "atemp",                 # highly correlated with temp
)
data.printSchema()

In [4]:
from pyspark.mllib.stat import Statistics
import pandas as pd
# Pairwise Pearson correlation between every column still present in `data`.
col_names = data.columns

# Statistics.corr works on an RDD of row vectors, so turn each Row into a
# plain tuple of its field values.
features = data.rdd.map(lambda row: tuple(row))
corr_mat = Statistics.corr(features, method="pearson")

# Label both axes with the column names so the printed matrix is readable.
corr_df = pd.DataFrame(corr_mat, index=col_names, columns=col_names)
print(corr_df.to_string())

In [5]:
from pyspark.ml.feature import VectorAssembler
# Pack the predictor columns into the single vector column that Spark ML
# estimators consume.
assembler = VectorAssembler(
    inputCols=[
        'yr',
        'mnth',
        'hr',
        'workingday',
        'weathersit',
        'temp',
        'hum',
        'windspeed',
    ],
    outputCol='features',
)
output = assembler.transform(data)

# Keep only the assembled predictors and the `cnt` label.
final_data = output.select(['features', 'cnt'])

In [6]:
from pyspark.ml.regression import GeneralizedLinearRegression
# Gaussian family with identity link, no regularisation, intercept fitted
# (fitIntercept is left at its default here).
glr = GeneralizedLinearRegression(
    labelCol='cnt',
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.0,
)
model = glr.fit(final_data)
summary = model.summary

# Fitted parameters followed by the inference statistics from the training summary.
print(f"Coefficients: {model.coefficients}")
print(f"Intercept: {model.intercept}")
print(f"Coefficient Standard Errors: {summary.coefficientStandardErrors}")
print(f"T Values: {summary.tValues}")
print(f"P Values: {summary.pValues}")
print(f"AIC: {summary.aic}")

In [7]:
from pyspark.ml.regression import GeneralizedLinearRegression
# Same Gaussian/identity specification, but with the intercept term disabled.
# NOTE: this rebinds `glr`, `model` and `summary`, so any later cell that uses
# `model` sees this no-intercept fit.
glr = GeneralizedLinearRegression(
    labelCol='cnt',
    fitIntercept=False,
    family="gaussian",
    link="identity",
    maxIter=10,
    regParam=0.0,
)
model = glr.fit(final_data)
summary = model.summary

# Report the same parameters and statistics as for the previous fit.
print("Coefficients: {}".format(model.coefficients))
print("Intercept: {}".format(model.intercept))
print("Coefficient Standard Errors: {}".format(summary.coefficientStandardErrors))
print("T Values: {}".format(summary.tValues))
print("P Values: {}".format(summary.pValues))
print("AIC: {}".format(summary.aic))

In [8]:
from pyspark.ml.evaluation import RegressionEvaluator
# Evaluator that compares the `prediction` column against the `cnt` label.
my_eval = RegressionEvaluator(labelCol="cnt")

# Score the data with whichever model was fitted most recently. These are
# in-sample predictions: `final_data` is also the data the model was fitted on.
results = model.transform(final_data)
results.select('prediction').describe().show()

In [9]:
# Evaluate the predictions with the evaluator's default metric; no metricName
# is set on `my_eval` (presumably "rmse", the documented Spark default — confirm
# for the Spark version in use).
RMSE = my_eval.evaluate(results)
# Bare expression so the notebook displays the value as the cell output.
RMSE