In [1]:
# Section must be included at the beginning of each new notebook. Remember to change the app name.
# The Spark installation is taken from the SPARK_HOME environment variable when it is set;
# otherwise the default path below is used. If you're using VirtualBox and SPARK_HOME is not
# set, change the default to '/home/user/spark-2.1.1-bin-hadoop2.7'.
import os
import findspark
findspark.init(os.environ.get('SPARK_HOME') or '/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
# pyspark is imported only after findspark.init() has located the Spark installation.
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('linear_regression_docs').getOrCreate()

# If you're getting an error with numpy, please type 'sudo pip3 install numpy --user' into the console.
# If you're getting an error with another package, type 'sudo pip3 install PACKAGENAME --user'.
# Replace PACKAGENAME with the relevant package (such as pandas, etc).
from pyspark.ml.regression import LinearRegression

In [10]:
# Read the forest-fires training data into a Spark DataFrame. Adjust the path if your copy
# of the dataset lives somewhere else.
# header=True treats the first line as column names; inferSchema=True lets Spark infer types.
csv_reader = spark.read.format("csv").options(inferSchema=True, header=True)
df = csv_reader.load("Datasets/forestfires.csv")

In [12]:
# printSchema() writes the column names and types to stdout, so it can run first.
df.printSchema()
# head() only shows up in the notebook when it is the last expression of the cell;
# placed before printSchema() its result was computed and then thrown away.
df.head()

root
 |-- X: integer (nullable = true)
 |-- Y: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- FFMC: double (nullable = true)
 |-- DMC: double (nullable = true)
 |-- DC: double (nullable = true)
 |-- ISI: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- RH: integer (nullable = true)
 |-- wind: double (nullable = true)
 |-- rain: double (nullable = true)
 |-- area: double (nullable = true)



In [14]:
import pandas as pd
# Compute Spark's summary statistics, pull them into pandas, and flip the table so that
# each dataset column becomes a row (easier to read than one very wide row per statistic).
summary_stats = df.describe().toPandas()
summary_stats.T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
X,517,4.669245647969052,2.313777828725767,1,9
Y,517,4.299806576402321,1.22990040298981,2,9
month,517,,,apr,sep
day,517,,,fri,wed
FFMC,517,90.6446808510636,5.520110848851271,18.7,96.2
DMC,517,110.87234042553195,64.04648224925424,1.1,291.3
DC,517,547.9400386847191,248.06619170584355,7.9,860.6
ISI,517,9.021663442940042,4.559477175216039,0.0,56.1
temp,517,18.88916827852998,5.806625349573504,2.2,33.3


In [20]:
from pyspark.ml.feature import VectorAssembler
# Columns used as model inputs. month and day are left out because they are categorical values.
feature_columns = ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']

# inputCols names the feature columns to combine; outputCol is the name of the new column.
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# With the assembler defined, apply it to the data to add the 'features' column.
vector_output = vector_assembler.transform(df)

In [21]:
# The printed schema now lists the new 'features' vector column after the original columns.
vector_output.printSchema()

# Look at the first row: 'features' holds the selected input columns combined into one vector.
vector_output.take(1)

root
 |-- X: integer (nullable = true)
 |-- Y: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- FFMC: double (nullable = true)
 |-- DMC: double (nullable = true)
 |-- DC: double (nullable = true)
 |-- ISI: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- RH: integer (nullable = true)
 |-- wind: double (nullable = true)
 |-- rain: double (nullable = true)
 |-- area: double (nullable = true)
 |-- features: vector (nullable = true)



[Row(X=7, Y=5, month='mar', day='fri', FFMC=86.2, DMC=26.2, DC=94.3, ISI=5.1, temp=8.2, RH=51, wind=6.7, rain=0.0, area=0.0, features=DenseVector([7.0, 5.0, 86.2, 26.2, 94.3, 5.1, 8.2, 51.0, 6.7, 0.0]))]

In [22]:
# The individual feature columns are now combined in 'features', so they are no longer
# needed on their own. Keep only the feature vector and the label column ('area').
vector_output = vector_output.select('features', 'area')

# Confirm that the dataframe now contains just those two columns.
first_rows = vector_output.head(1)
print(first_rows)
vector_output.show(n=3)

[Row(features=DenseVector([7.0, 5.0, 86.2, 26.2, 94.3, 5.1, 8.2, 51.0, 6.7, 0.0]), area=0.0)]
+--------------------+----+
|            features|area|
+--------------------+----+
|[7.0,5.0,86.2,26....| 0.0|
|[7.0,4.0,90.6,35....| 0.0|
|[7.0,4.0,90.6,43....| 0.0|
+--------------------+----+
only showing top 3 rows

