# Linear Regression - Spark

## Import Python Libraries

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import warnings
# Ignore every Python warning raised from this point on
warnings.filterwarnings('ignore')

## Spark Setup and Libraries

In [None]:
# Set up the environment for using pyspark.
# NOTE(review): findspark.init() runs before any pyspark import in this
# notebook — presumably so the pyspark package can be located; confirm
# against the findspark documentation.
import findspark

findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
# Application context: obtain the SparkSession for this notebook
# (getOrCreate returns an existing session if there is one)
spark = (
    SparkSession.builder
    .appName("Linear Regression Example")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and set its log level
sc = spark.sparkContext
sc.setLogLevel("Error")

## Read and explore the dataset with Python

In [None]:
# Read the dataset using Pandas
# (the CSV path is relative to the notebook's working directory)
dataset = pd.read_csv("../datasets/Salary_Data.csv")

In [None]:
# Preview the first 10 rows of the pandas DataFrame
dataset.head(10)

In [None]:
# Dimensions of the dataset as (rows, columns)
dataset.shape

In [None]:
# Summary statistics of the dataset's columns
dataset.describe()

In [None]:
# Scatter plot: years of experience on the x-axis, salary on the y-axis
plt.scatter(
    x=dataset['YearsExperience'],
    y=dataset['Salary'],
)
plt.show()

## Spark Processing

In [None]:
# Create Spark Dataframe from Pandas Dataframe
# (passes the pandas `dataset` to the active Spark session)
df = spark.createDataFrame(dataset)

In [None]:
# Print the leading rows of the Spark DataFrame (show() defaults to 20 rows)
df.show()

## Create Features and Target

In [None]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the input column(s) into a single 'features' vector
vassemb = VectorAssembler(
    inputCols=['YearsExperience'],
    outputCol='features',
)

# Apply the assembler to the original dataframe and keep only the
# feature vector and the target column
ndf = vassemb.transform(df).select(['features', 'Salary'])
ndf.show(3)

In [None]:
# Randomly split the rows into a training set (weight 0.7) and a
# test set (weight 0.3); the seed is fixed so the random split can be repeated
train, test = ndf.randomSplit([0.7, 0.3], seed=2345)

# Report the number of rows in each split
print(
    "train: {} test: {}".format(
        train.count(),
        test.count(),
    )
)

## Linear Regression

In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# Configure the linear regression estimator (not yet trained)
lr = LinearRegression(
    featuresCol='features',
    labelCol='Salary',
    maxIter=5,
    regParam=0.3,
)

In [None]:
# Train our regressor: fit the estimator on the training split and keep
# the fitted model for inspection and prediction
lr_model = lr.fit(train)

In [None]:
# Show the fitted slope(s) and intercept of the regression line
print(f"Coefficients: {lr_model.coefficients!s}")
print(f"Intercept: {lr_model.intercept!s}")

## RMSE measures the differences between the predicted values and the actual values.

In [None]:
# Metrics computed on the training data during fitting
trainingSummary = lr_model.summary

print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))
print("R Squared: %f" % (trainingSummary.r2,))

In [None]:
# For pandas dataframes, display floating-point values with 2 decimal places.
# The fully-qualified option name is required: on pandas >= 1.4 the bare
# 'precision' pattern also matches 'styler.format.precision', so
# pd.set_option('precision', ...) raises OptionError (multiple keys matched).
pd.set_option('display.precision', 2)

# Score the test set with the fitted model
lr_predictions = lr_model.transform(test)

# Convert the columns of interest to pandas and preview the first 10 rows
(
    lr_predictions
    .select("prediction", "Salary", "features")
    .toPandas()
    .head(10)
)

In [None]:
# Print the leading rows of the predictions DataFrame (show() defaults to 20 rows)
lr_predictions.show()

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate the model: R2 of the 'prediction' column against the 'Salary' label
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Salary",
    metricName="r2",
)
print(
    "R Squared (R2) on test data = %g"
    % lr_evaluator.evaluate(lr_predictions)
)

In [None]:
# Evaluate the fitted model on the test set and report its RMSE
test_result = lr_model.evaluate(test)
print(
    "Root Mean Squared Error (RMSE) on test data = %g"
    % (test_result.rootMeanSquaredError,)
)