In [1]:
# Section must be included at the beginning of each new notebook. Remember to change the app name.
# If you're using VirtualBox, change the below to '/home/user/spark-2.1.1-bin-hadoop2.7'
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('linear_regression_docs').getOrCreate()

# If you're getting an error with numpy, please type 'sudo pip3 install numpy --user' into the console.
# If you're getting an error with another package, type 'sudo pip3 install PACKAGENAME --user'. 
# Replace PACKAGENAME with the relevant package (such as pandas, etc).
from pyspark.ml.regression import LinearRegression

In [10]:
# Load model training data. Location of the data may be different.
df = spark.read.format("csv").load("Datasets/forestfires.csv", inferSchema=True, header=True)
#training2 = spark.read.csv("Datasets/forestfires.csv")

In [12]:
df.head()
df.printSchema()

root
 |-- X: integer (nullable = true)
 |-- Y: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- FFMC: double (nullable = true)
 |-- DMC: double (nullable = true)
 |-- DC: double (nullable = true)
 |-- ISI: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- RH: integer (nullable = true)
 |-- wind: double (nullable = true)
 |-- rain: double (nullable = true)
 |-- area: double (nullable = true)



In [39]:
import pandas as pd
from pyspark.ml.feature import StringIndexer

#if you run StringIndexer multiple times for same input and output column, gives an error
#this is because you already have the output column in the data
#indexer = StringIndexer(inputCol="month", outputCol="monthIndex")
#df = indexer.fit(df).transform(df)
#indexerDay = StringIndexer(inputCol='day', outputCol='dayIndex')
#df = indexerDay.fit(df).transform(df)
df.show()
df.describe().toPandas().transpose()

+---+---+-----+---+----+-----+-----+----+----+---+----+----+----+----------+--------+
|  X|  Y|month|day|FFMC|  DMC|   DC| ISI|temp| RH|wind|rain|area|monthIndex|dayIndex|
+---+---+-----+---+----+-----+-----+----+----+---+----+----+----+----------+--------+
|  7|  5|  mar|fri|86.2| 26.2| 94.3| 5.1| 8.2| 51| 6.7| 0.0| 0.0|       2.0|     1.0|
|  7|  4|  oct|tue|90.6| 35.4|669.1| 6.7|18.0| 33| 0.9| 0.0| 0.0|       6.0|     4.0|
|  7|  4|  oct|sat|90.6| 43.7|686.9| 6.7|14.6| 33| 1.3| 0.0| 0.0|       6.0|     2.0|
|  8|  6|  mar|fri|91.7| 33.3| 77.5| 9.0| 8.3| 97| 4.0| 0.2| 0.0|       2.0|     1.0|
|  8|  6|  mar|sun|89.3| 51.3|102.2| 9.6|11.4| 99| 1.8| 0.0| 0.0|       2.0|     0.0|
|  8|  6|  aug|sun|92.3| 85.3|488.0|14.7|22.2| 29| 5.4| 0.0| 0.0|       0.0|     0.0|
|  8|  6|  aug|mon|92.3| 88.9|495.6| 8.5|24.1| 27| 3.1| 0.0| 0.0|       0.0|     3.0|
|  8|  6|  aug|mon|91.5|145.4|608.2|10.7| 8.0| 86| 2.2| 0.0| 0.0|       0.0|     3.0|
|  8|  6|  sep|tue|91.0|129.5|692.6| 7.0|13.1| 63| 5.4

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
X,517,4.669245647969052,2.313777828725767,1,9
Y,517,4.299806576402321,1.22990040298981,2,9
month,517,,,apr,sep
day,517,,,fri,wed
FFMC,517,90.6446808510636,5.520110848851271,18.7,96.2
DMC,517,110.87234042553195,64.04648224925424,1.1,291.3
DC,517,547.9400386847191,248.06619170584355,7.9,860.6
ISI,517,9.021663442940042,4.559477175216039,0.0,56.1
temp,517,18.88916827852998,5.806625349573504,2.2,33.3


In [37]:
from pyspark.ml.feature import VectorAssembler
# The input columns are the feature column names, and the output column is what you'd like the new column to be named. 
# Did not include month and day as they were categorical values
vector_assembler = VectorAssembler(inputCols = ['X', 'Y', 'FFMC', 'monthIndex', 'dayIndex','DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain'], outputCol = 'features')

# Now that we've created the assembler variable, let's actually transform the data.
vector_output = vector_assembler.transform(df)

In [21]:
# Using print schema, you see that the features output column has been added. 
vector_output.printSchema()

# You can see that the features column is a DenseVector that combines the various features as expected.
vector_output.head(1)

root
 |-- X: integer (nullable = true)
 |-- Y: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- FFMC: double (nullable = true)
 |-- DMC: double (nullable = true)
 |-- DC: double (nullable = true)
 |-- ISI: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- RH: integer (nullable = true)
 |-- wind: double (nullable = true)
 |-- rain: double (nullable = true)
 |-- area: double (nullable = true)
 |-- features: vector (nullable = true)



[Row(X=7, Y=5, month='mar', day='fri', FFMC=86.2, DMC=26.2, DC=94.3, ISI=5.1, temp=8.2, RH=51, wind=6.7, rain=0.0, area=0.0, features=DenseVector([7.0, 5.0, 86.2, 26.2, 94.3, 5.1, 8.2, 51.0, 6.7, 0.0]))]

In [22]:
# Because the features have been combined into one vector, we no longer need them. Below we select the features and label.
vector_output = vector_output.select(['features', 'area'])

# You can see that the dataframe now only contains two columns. 
print(vector_output.head(1))
vector_output.show(3)

[Row(features=DenseVector([7.0, 5.0, 86.2, 26.2, 94.3, 5.1, 8.2, 51.0, 6.7, 0.0]), area=0.0)]
+--------------------+----+
|            features|area|
+--------------------+----+
|[7.0,5.0,86.2,26....| 0.0|
|[7.0,4.0,90.6,35....| 0.0|
|[7.0,4.0,90.6,43....| 0.0|
+--------------------+----+
only showing top 3 rows



In [23]:
# Let's do a randomised 70/30 split. Remember, you should explain why you chose a particular split. 
train_data,test_data = vector_output.randomSplit([0.7,0.3])

# Let's see our training data.
train_data.describe().show()

# And our testing data.
test_data.describe().show()

+-------+------------------+
|summary|              area|
+-------+------------------+
|  count|               354|
|   mean|15.327259887005646|
| stddev|  76.0050579955863|
|    min|               0.0|
|    max|           1090.84|
+-------+------------------+

+-------+------------------+
|summary|              area|
+-------+------------------+
|  count|               163|
|   mean| 7.461349693251531|
| stddev|16.622102068614215|
|    min|               0.0|
|    max|            103.39|
+-------+------------------+



In [24]:
# Importing the Linear Regression package.
from pyspark.ml.regression import LinearRegression

# Instantiate the instance.
lr = LinearRegression(featuresCol='features', labelCol='area')

# Fit the training data.
lr_model = lr.fit(train_data)

# Print the coefficients for each X value ie. each feature
print("Coefficients: " + str(lr_model.coefficients))

# Print the intercept = mean value of Y when X is 0.
print("Intercept: " + str(lr_model.intercept) + "\n")

# Summarise the model and print out some evaluation metrics.
training_summary = lr_model.summary

# Print RMSE. Measures the difference between predicted and actual value.
print("RMSE: " + str(training_summary.rootMeanSquaredError))

# Print R2.
print("R2: " + str(training_summary.r2))

Coefficients: [2.6196028472327573,0.6254506258618394,-0.061398897949538414,0.12246929120290777,-0.006199849155893396,-0.7991807486428866,1.0324189342330126,-0.30373626639910395,2.5254319182384735,-24.203787267077914]
Intercept: -13.233999147964859

RMSE: 74.78364410494561
R2: 0.029139537874413923


In [40]:
# Let's evaluate the model against the test data.
test_results = lr_model.evaluate(test_data)

# And print the RMSE/R2. As expected, our RMSE and R2 are slightly worse when applying the testing set.
print("RMSE on test data: " + str(test_results.rootMeanSquaredError))
print("R2 on test data: " + str(test_results.r2))

RMSE on test data: 23.855160002793408
R2 on test data: -1.0723607962590713


In [None]:
#my RMSE on test data is better than on training data??