# Project context:
## In this project, a company wants to find the percentage of effects of features A, B, C, D on Spoiled dog food

# Imports

In [1]:
import findspark

findspark.init('C:/spark')

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Create a session

In [3]:
spark = SparkSession.builder.appName('treeProject').getOrCreate()

# Read data

In [6]:
data = spark.read.csv('../../data/dog_food.csv',
                      inferSchema=True,
                      header=True)

In [8]:
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)



# Create model

In [9]:
assembler = VectorAssembler(inputCols=['A', 'B', 'C', 'D'], outputCol='features')

In [12]:
data = assembler.transform(data)

In [14]:
data.printSchema()

root
 |-- A: integer (nullable = true)
 |-- B: integer (nullable = true)
 |-- C: double (nullable = true)
 |-- D: integer (nullable = true)
 |-- Spoiled: double (nullable = true)
 |-- features: vector (nullable = true)



In [15]:
rfc = RandomForestClassifier(labelCol='Spoiled', featuresCol='features')

In [16]:
rfc_model = rfc.fit(data)

In [17]:
rfc_model.featureImportances

SparseVector(4, {0: 0.017, 1: 0.0211, 2: 0.9401, 3: 0.0219})