## Machine Learning Models
Import libraries and start the Spark session

In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session through the builder API,
# registering it under the application name 'cloudanum'.
spark = (
    SparkSession.builder
    .appName('cloudanum')
    .getOrCreate()
)

## Import Data

In [2]:
import pandas as pd

# Read the CSV with pandas first, then convert the result into a Spark DataFrame.
iris_csv_url = "https://storage.googleapis.com/neurals/data/iris.csv"
iris_pandas = pd.read_csv(iris_csv_url, header='infer')
iris = spark.createDataFrame(iris_pandas)

# Drop the "species" column and preview what is left.
dataframe = iris.drop("species")
dataframe.show()


+-----------------+----------------+-----------------+----------------+
|sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|
+-----------------+----------------+-----------------+----------------+
|              5.1|             3.5|              1.4|             0.2|
|              4.9|             3.0|              1.4|             0.2|
|              4.7|             3.2|              1.3|             0.2|
|              4.6|             3.1|              1.5|             0.2|
|              5.0|             3.6|              1.4|             0.2|
|              5.4|             3.9|              1.7|             0.4|
|              4.6|             3.4|              1.4|             0.3|
|              5.0|             3.4|              1.5|             0.2|
|              4.4|             2.9|              1.4|             0.2|
|              4.9|             3.1|              1.5|             0.1|
|              5.4|             3.7|              1.5|          

## Vector Assembler

In [3]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Columns packed into the "features" vector. The remaining measurement,
# "petal width (cm)", is renamed to "label" further down.
feature_cols = ["sepal length (cm)", "sepal width (cm)", "petal length (cm)"]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Assemble from `iris` rather than from `dataframe`: this cell reassigns
# `dataframe`, so reading it here would make a second run of the cell fail
# because the raw measurement columns would already have been dropped.
output = assembler.transform(iris.drop("species"))
print(" Assembled to vector column 'features'")

# Keep only the label and the assembled vector for the cells that follow.
dataframe = output.drop(*feature_cols)
dataframe = dataframe.withColumnRenamed("petal width (cm)", "label")
dataframe.show()

 Assembled to vector column 'features'
+-----+-------------+
|label|     features|
+-----+-------------+
|  0.2|[5.1,3.5,1.4]|
|  0.2|[4.9,3.0,1.4]|
|  0.2|[4.7,3.2,1.3]|
|  0.2|[4.6,3.1,1.5]|
|  0.2|[5.0,3.6,1.4]|
|  0.4|[5.4,3.9,1.7]|
|  0.3|[4.6,3.4,1.4]|
|  0.2|[5.0,3.4,1.5]|
|  0.2|[4.4,2.9,1.4]|
|  0.1|[4.9,3.1,1.5]|
|  0.2|[5.4,3.7,1.5]|
|  0.2|[4.8,3.4,1.6]|
|  0.1|[4.8,3.0,1.4]|
|  0.1|[4.3,3.0,1.1]|
|  0.2|[5.8,4.0,1.2]|
|  0.4|[5.7,4.4,1.5]|
|  0.4|[5.4,3.9,1.3]|
|  0.3|[5.1,3.5,1.4]|
|  0.3|[5.7,3.8,1.7]|
|  0.3|[5.1,3.8,1.5]|
+-----+-------------+
only showing top 20 rows



## Clustering

In [None]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Cluster the frame prepared by the earlier cells.
dataset = dataframe

# Fit k-means with two clusters and a fixed seed.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(dataset)

# Attach a cluster assignment to every row.
predictions = model.transform(dataset)

# Score the clustering with a default-configured evaluator (Silhouette).
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Print the fitted cluster centers, one per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for centroid in centers:
    print(centroid)

## Decision Tree Regression

In [11]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Use the frame prepared earlier in the notebook ("features" vector + "label").
data = dataframe

# Automatically identify categorical features, and index them.
# maxCategories=4: features with > 4 distinct values are treated as continuous.
# Note: the indexer is fitted on the full frame, before the train/test split.
featureIndexer = VectorIndexer(
    inputCol="features", outputCol="indexedFeatures", maxCategories=4
).fit(data)

# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split, and therefore the reported RMSE, stable
# from one run of the notebook to the next.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1)

# Configure a DecisionTree regressor that reads the indexed feature column.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])

# Train the model. The indexer stage is already fitted above, so only the
# tree is trained here; the indexer is just applied to the training data.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Compare predictions with the true labels and compute the test error.
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# The fitted tree is the second pipeline stage; printing it shows a summary only.
treeModel = model.stages[1]
print(treeModel)

+-------------------+-----+-------------+
|         prediction|label|     features|
+-------------------+-----+-------------+
|0.24400000000000002|  0.1|[4.8,3.0,1.4]|
|0.24400000000000002|  0.1|[4.9,3.1,1.5]|
|0.24400000000000002|  0.2|[5.0,3.4,1.5]|
|0.24400000000000002|  0.1|[4.9,3.1,1.5]|
|0.24400000000000002|  0.2|[4.6,3.6,1.0]|
+-------------------+-----+-------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.235771
DecisionTreeRegressionModel: uid=DecisionTreeRegressor_3fb83bb97ab3, depth=5, numNodes=47, numFeatures=3


## Random Forest Regression

In [12]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Use the frame prepared earlier in the notebook ("features" vector + "label").
data = dataframe

# Automatically identify categorical features, and index them.
# maxCategories=4: features with > 4 distinct values are treated as continuous.
# Note: the indexer is fitted on the full frame, before the train/test split.
featureIndexer = VectorIndexer(
    inputCol="features", outputCol="indexedFeatures", maxCategories=4
).fit(data)

# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split, and therefore the reported RMSE, stable
# from one run of the notebook to the next.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1)

# Configure a RandomForest regressor that reads the indexed feature column.
rf = RandomForestRegressor(featuresCol="indexedFeatures")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])

# Train the model. The indexer stage is already fitted above, so only the
# forest is trained here; the indexer is just applied to the training data.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Compare predictions with the true labels and compute the test error.
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# The fitted forest is the second pipeline stage; printing it shows a summary only.
rfModel = model.stages[1]
print(rfModel)

+-------------------+-----+-------------+
|         prediction|label|     features|
+-------------------+-----+-------------+
|0.23551936213081723|  0.1|[4.3,3.0,1.1]|
|0.26036261609907124|  0.1|[4.9,3.1,1.5]|
|0.21333880657526177|  0.2|[4.6,3.1,1.5]|
|0.22635269546415066|  0.2|[4.7,3.2,1.3]|
|0.24252675751321262|  0.3|[5.1,3.5,1.4]|
+-------------------+-----+-------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.248977
RandomForestRegressionModel: uid=RandomForestRegressor_be59b55fbced, numTrees=20, numFeatures=3


## Gradient-boosted tree regression

In [13]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

# Use the frame prepared earlier in the notebook ("features" vector + "label").
data = dataframe

# Automatically identify categorical features, and index them.
# maxCategories=4: features with > 4 distinct values are treated as continuous.
# Note: the indexer is fitted on the full frame, before the train/test split.
featureIndexer = VectorIndexer(
    inputCol="features", outputCol="indexedFeatures", maxCategories=4
).fit(data)

# Split the data into training and test sets (30% held out for testing).
# A fixed seed keeps the split, and therefore the reported RMSE, stable
# from one run of the notebook to the next.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=1)

# Configure a GBT regressor (10 boosting iterations) on the indexed feature column.
gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)

# Chain indexer and GBT in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, gbt])

# Train the model. The indexer stage is already fitted above, so only the
# GBT is trained here; the indexer is just applied to the training data.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Compare predictions with the true labels and compute the test error.
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

# The fitted GBT model is the second pipeline stage; printing it shows a summary only.
gbtModel = model.stages[1]
print(gbtModel)

+-------------------+-----+-------------+
|         prediction|label|     features|
+-------------------+-----+-------------+
| 0.2039719805861526|  0.2|[4.4,2.9,1.4]|
| 0.1039719805861526|  0.2|[4.6,3.1,1.5]|
| 0.2574249658997069|  0.2|[5.1,3.5,1.4]|
|0.29861474172351005|  0.2|[5.4,3.7,1.5]|
| 0.3498147417235101|  0.4|[5.4,3.9,1.7]|
+-------------------+-----+-------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 0.208903
GBTRegressionModel: uid=GBTRegressor_0dbc40912259, numTrees=10, numFeatures=3


## To Do
Access the dataset at 
https://storage.googleapis.com/neurals/data/weather.csv

    Select MinTemp, MaxTemp, RainFall and Temp3pm as features
    Select Temp9am as Label
    
Prepare the data and train a model that can predict the label.
What is the accuracy of your approach?
How can you make it better?