In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Project").getOrCreate()

In [8]:
df = spark.read.csv("dog_food.csv",header=True,inferSchema=True)

In [9]:
df.show()

+---+---+----+---+-------+
|  A|  B|   C|  D|Spoiled|
+---+---+----+---+-------+
|  4|  2|12.0|  3|    1.0|
|  5|  6|12.0|  7|    1.0|
|  6|  2|13.0|  6|    1.0|
|  4|  2|12.0|  1|    1.0|
|  4|  2|12.0|  3|    1.0|
| 10|  3|13.0|  9|    1.0|
|  8|  5|14.0|  5|    1.0|
|  5|  8|12.0|  8|    1.0|
|  6|  5|12.0|  9|    1.0|
|  3|  3|12.0|  1|    1.0|
|  9|  8|11.0|  3|    1.0|
|  1| 10|12.0|  3|    1.0|
|  1|  5|13.0| 10|    1.0|
|  2| 10|12.0|  6|    1.0|
|  1| 10|11.0|  4|    1.0|
|  5|  3|12.0|  2|    1.0|
|  4|  9|11.0|  8|    1.0|
|  5|  1|11.0|  1|    1.0|
|  4|  9|12.0| 10|    1.0|
|  5|  8|10.0|  9|    1.0|
+---+---+----+---+-------+
only showing top 20 rows



In [10]:
df.columns

['A', 'B', 'C', 'D', 'Spoiled']

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'],outputCol="features")

In [15]:
output= assembler.transform(df)

In [18]:
output.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [19]:
final_data= output.select("features","Spoiled")

In [20]:
train_data,test_data =final_data.randomSplit([0.7,0.3])

In [21]:
from pyspark.ml.classification import RandomForestClassifier

In [22]:
rfc = RandomForestClassifier(featuresCol="features",labelCol="Spoiled")

In [23]:
rfc_model = rfc.fit(train_data)

In [24]:
rfc_preds = rfc_model.transform(test_data)

In [26]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [27]:
acc_eval = MulticlassClassificationEvaluator(labelCol= "Spoiled",metricName="accuracy")

In [28]:
acc_eval.evaluate(rfc_preds)

0.9622641509433962

In [30]:
rfc_preds.show()

+-------------------+-------+--------------------+--------------------+----------+
|           features|Spoiled|       rawPrediction|         probability|prediction|
+-------------------+-------+--------------------+--------------------+----------+
| [1.0,1.0,12.0,2.0]|    1.0|[1.84038634321653...|[0.09201931716082...|       1.0|
| [1.0,1.0,12.0,4.0]|    1.0|[1.26751277999814...|[0.06337563899990...|       1.0|
|  [1.0,3.0,8.0,3.0]|    0.0|[18.7251077115917...|[0.93625538557958...|       0.0|
|  [1.0,4.0,9.0,3.0]|    0.0|[18.7251077115917...|[0.93625538557958...|       0.0|
|[1.0,4.0,13.0,10.0]|    1.0|[0.06095238095238...|[0.00304761904761...|       1.0|
|  [1.0,5.0,8.0,5.0]|    0.0|[19.7251077115917...|[0.98625538557958...|       0.0|
|  [1.0,6.0,7.0,8.0]|    0.0|[19.7161791401631...|[0.98580895700815...|       0.0|
|[1.0,6.0,11.0,10.0]|    1.0|[0.06095238095238...|[0.00304761904761...|       1.0|
|  [1.0,7.0,7.0,2.0]|    0.0|[18.7251077115917...|[0.93625538557958...|       0.0|
|[1.

In [31]:
rfc_model.featureImportances

SparseVector(4, {0: 0.0248, 1: 0.0335, 2: 0.9159, 3: 0.0257})

In [66]:
import pandas as pd

In [77]:
pandasDF = pd.DataFrame(output.schema["features"].metadata["ml_attr"]["attrs"]["numeric"]
                        +output.schema["features"].metadata["ml_attr"]["attrs"]["numeric"]).sort_values("idx")

In [68]:
output.schema["features"].metadata

{'ml_attr': {'attrs': {'numeric': [{'idx': 0, 'name': 'A'},
    {'idx': 1, 'name': 'B'},
    {'idx': 2, 'name': 'C'},
    {'idx': 3, 'name': 'D'}]},
  'num_attrs': 4}}

In [85]:
feature_dict = dict(zip(pandasDF["idx"],pandasDF["name"]))

In [86]:
feature_dict

{0: 'A', 1: 'B', 2: 'C', 3: 'D'}

In [87]:
from pyspark.context import SparkContext as sc

In [88]:
feature_dict_broad = sc.broadcast(feature_dict)

TypeError: broadcast() missing 1 required positional argument: 'value'

In [83]:
pandasDF

Unnamed: 0,idx,name
0,0,A
4,0,A
1,1,B
5,1,B
2,2,C
6,2,C
3,3,D
7,3,D
