In [1]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.classification import GBTClassifier, DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
import warnings
warnings.filterwarnings('ignore')

In [3]:
spark = SparkSession.builder.appName('dog_food').getOrCreate()

data  = spark.read.csv('dog_food.csv', inferSchema=True, header=True)
data.show()

                                                                                

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [4]:
assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'], outputCol='features')
fixed_data = assembler.transform(data)

fixed_data.show()

[Stage 5:>                                                          (0 + 1) / 1]

+---+---+----+---+-------+-------------------+
|  A|  B|   C|  D|Spoiled|           features|
+---+---+----+---+-------+-------------------+
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
|  5|  6|12.0|  7|    1.0| [5.0,6.0,12.0,7.0]|
|  6|  2|13.0|  6|    1.0| [6.0,2.0,13.0,6.0]|
|  4|  2|12.0|  1|    1.0| [4.0,2.0,12.0,1.0]|
|  4|  2|12.0|  3|    1.0| [4.0,2.0,12.0,3.0]|
| 10|  3|13.0|  9|    1.0|[10.0,3.0,13.0,9.0]|
|  8|  5|14.0|  5|    1.0| [8.0,5.0,14.0,5.0]|
|  5|  8|12.0|  8|    1.0| [5.0,8.0,12.0,8.0]|
|  6|  5|12.0|  9|    1.0| [6.0,5.0,12.0,9.0]|
|  3|  3|12.0|  1|    1.0| [3.0,3.0,12.0,1.0]|
|  9|  8|11.0|  3|    1.0| [9.0,8.0,11.0,3.0]|
|  1| 10|12.0|  3|    1.0|[1.0,10.0,12.0,3.0]|
|  1|  5|13.0| 10|    1.0|[1.0,5.0,13.0,10.0]|
|  2| 10|12.0|  6|    1.0|[2.0,10.0,12.0,6.0]|
|  1| 10|11.0|  4|    1.0|[1.0,10.0,11.0,4.0]|
|  5|  3|12.0|  2|    1.0| [5.0,3.0,12.0,2.0]|
|  4|  9|11.0|  8|    1.0| [4.0,9.0,11.0,8.0]|
|  5|  1|11.0|  1|    1.0| [5.0,1.0,11.0,1.0]|
|  4|  9|12.0

                                                                                

In [5]:
final_data = fixed_data.select(['features' , 'Spoiled'])

#splitting the data into train and test
train_data , test_data = final_data.randomSplit([0.7, 0.3])
train_data.describe().show()

                                                                                

+-------+-------------------+
|summary|            Spoiled|
+-------+-------------------+
|  count|                357|
|   mean| 0.2773109243697479|
| stddev|0.44829959161368554|
|    min|                0.0|
|    max|                1.0|
+-------+-------------------+



In [11]:
dtc = DecisionTreeClassifier(labelCol='Spoiled', featuresCol='features')
rfc = RandomForestClassifier(numTrees=100, labelCol='Spoiled', featuresCol='features')
gbt = GBTClassifier(labelCol='Spoiled', featuresCol='features')

In [12]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

                                                                                

In [13]:
dtc_preds = dtc_model.transform(train_data)
rfc_preds = rfc_model.transform(train_data)
gbt_preds = gbt_model.transform(train_data)

In [14]:
#Evaluating the results
tree_eval = BinaryClassificationEvaluator(labelCol='Spoiled')

dtc_results = tree_eval.evaluate(dtc_preds)
print(dtc_results)

0.9827343199436223


In [15]:
rfc_results = tree_eval.evaluate(rfc_preds)
print(rfc_results)

0.9996672147834939


In [16]:
gbt_results = tree_eval.evaluate(gbt_preds)
print(gbt_results)

22/02/17 12:38:14 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/02/17 12:38:14 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


0.9999804243990291
