In [1]:
import findspark 
# Point findspark at the local Spark installation. This has to run before the
# `import pyspark` line below.
# NOTE(review): the path is an absolute, machine-specific location; it must be
# edited (or replaced by SPARK_HOME) to run this notebook on another machine.
findspark.init("/Users/valentinaporcu/spark/spark-2.4.1-bin-hadoop2.7")
import pyspark 
from pyspark.sql import DataFrameNaFunctions 
from pyspark.sql.functions import lit 
from pyspark.ml.feature import StringIndexer  
from pyspark.ml import Pipeline 
from pyspark.sql import SparkSession
from pyspark.sql import functions
import pandas as pd
import numpy as np

In [2]:
# Create the notebook's Spark session, or reuse one that is already running.
# master "local[*]" runs Spark inside this process using all available cores.
# NOTE(review): with a local master there are no separate executors, so the
# "spark.executor.memory" setting probably has no effect here - confirm whether
# "spark.driver.memory" was intended.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("metodi ensemble")
    .config("spark.executor.memory", "6gb")
    .getOrCreate()
)

In [3]:
# Random Forest

In [4]:
from pyspark.ml.classification import RandomForestClassifier

In [5]:
# Load the Wine CSV into a Spark DataFrame. The first line of the file supplies
# the column names (header=True) and column types are inferred from the data
# (inferSchema=True) instead of being read as strings.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
df = spark.read.csv(
    "/Users/valentinaporcu/R_test/Wine.csv",
    header=True,
    inferSchema=True,
    sep=",",
)

In [6]:
# Peek at the first row to sanity-check that the values were parsed as expected.
df.head()

Row(Alcohol=14.23, Malic_Acid=1.71, Ash=2.43, Ash_Alcanity=15.6, Magnesium=127, Total_Phenols=2.8, Flavanoids=3.06, Nonflavanoid_Phenols=0.28, Proanthocyanins=2.29, Color_Intensity=5.64, Hue=1.04, OD280=3.92, Proline=1065, Customer_Segment=1)

In [7]:
# Print the inferred schema: every column should be numeric (double or integer).
df.printSchema()

root
 |-- Alcohol: double (nullable = true)
 |-- Malic_Acid: double (nullable = true)
 |-- Ash: double (nullable = true)
 |-- Ash_Alcanity: double (nullable = true)
 |-- Magnesium: integer (nullable = true)
 |-- Total_Phenols: double (nullable = true)
 |-- Flavanoids: double (nullable = true)
 |-- Nonflavanoid_Phenols: double (nullable = true)
 |-- Proanthocyanins: double (nullable = true)
 |-- Color_Intensity: double (nullable = true)
 |-- Hue: double (nullable = true)
 |-- OD280: double (nullable = true)
 |-- Proline: integer (nullable = true)
 |-- Customer_Segment: integer (nullable = true)



In [8]:
# List the column names. Customer_Segment (the last one) is the label used
# below; the other 13 columns are the model inputs.
df.columns

['Alcohol',
 'Malic_Acid',
 'Ash',
 'Ash_Alcanity',
 'Magnesium',
 'Total_Phenols',
 'Flavanoids',
 'Nonflavanoid_Phenols',
 'Proanthocyanins',
 'Color_Intensity',
 'Hue',
 'OD280',
 'Proline',
 'Customer_Segment']

In [9]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [10]:
# Spark ML estimators expect all predictors packed into one vector column.
# Every column of `df` except the label is a predictor, so derive the list from
# the DataFrame itself rather than retyping the 13 names: the result is the same
# names in the same order, and it cannot drift out of sync with the schema.
feature_cols = [name for name in df.columns if name != "Customer_Segment"]

# The assembler concatenates the feature columns, in order, into "features".
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

In [11]:
# Apply the assembler: the result keeps all original columns and appends the
# "features" vector column.
output = assembler.transform(df)

In [12]:
# Display the first rows to check that "features" was appended next to the
# original columns.
output.show()

+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+----------------+--------------------+
|Alcohol|Malic_Acid| Ash|Ash_Alcanity|Magnesium|Total_Phenols|Flavanoids|Nonflavanoid_Phenols|Proanthocyanins|Color_Intensity| Hue|OD280|Proline|Customer_Segment|            features|
+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+----------------+--------------------+
|  14.23|      1.71|2.43|        15.6|      127|          2.8|      3.06|                0.28|           2.29|           5.64|1.04| 3.92|   1065|               1|[14.23,1.71,2.43,...|
|   13.2|      1.78|2.14|        11.2|      100|         2.65|      2.76|                0.26|           1.28|           4.38|1.05|  3.4|   1050|               1|[13.2,1.78,2.14,1...|
|  13.16|      2.36|2.67|        18.6|      101|          2.8|      3.24|       

In [13]:
# Keep only what the classifier needs: the assembled feature vector and the label.
transformed_df = output.select('features','Customer_Segment')

In [14]:
# Hold out roughly 30% of the rows for evaluation and train on the rest.
# The seed is fixed so that the same rows land in each split on every run;
# without it the split, and every metric computed from it, changes between
# executions of the notebook.
train, test = transformed_df.randomSplit([0.7, 0.3], seed=42)

In [15]:
# Inspect the first rows of the training split.
train.show()

+--------------------+----------------+
|            features|Customer_Segment|
+--------------------+----------------+
|[11.03,1.51,2.2,2...|               2|
|[11.41,0.74,2.5,2...|               2|
|[11.45,2.4,2.42,2...|               2|
|[11.46,3.74,1.82,...|               2|
|[11.56,2.05,3.23,...|               2|
|[11.61,1.35,2.7,2...|               2|
|[11.62,1.99,2.28,...|               2|
|[11.64,2.06,2.46,...|               2|
|[11.65,1.67,2.62,...|               2|
|[11.66,1.88,1.92,...|               2|
|[11.76,2.68,2.92,...|               2|
|[11.79,2.13,2.78,...|               2|
|[11.81,2.12,2.74,...|               2|
|[11.84,0.89,2.58,...|               2|
|[11.84,2.89,2.23,...|               2|
|[11.87,4.31,2.39,...|               2|
|[11.96,1.09,2.3,2...|               2|
|[12.0,0.92,2.0,19...|               2|
|[12.0,3.43,2.0,19...|               2|
|[12.04,4.3,2.38,2...|               2|
+--------------------+----------------+
only showing top 20 rows



In [16]:
# Random forest with 50 trees, reading the assembled "features" vector and
# predicting the Customer_Segment label.
# The forest is built with random sampling, so an explicit seed is set to make
# the fitted model repeatable; the default seed is not guaranteed to be stable
# across sessions.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="Customer_Segment",
    numTrees=50,
    seed=42,
)

# Fit on the training split only; the test split is reserved for evaluation.
rf_model = rf.fit(train)

In [17]:
# Score the held-out test rows. The result is the test DataFrame with the
# model's output columns appended, including the "prediction" column that the
# evaluator below reads.
rf_predictions = rf_model.transform(test)

In [18]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [19]:
# Evaluator that compares the "prediction" column against the true
# Customer_Segment label and reports plain accuracy.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Customer_Segment",
    predictionCol="prediction",
)

In [20]:
# Compute the accuracy of the random forest on the test-set predictions.
rf_acc = evaluator.evaluate(rf_predictions)

In [21]:
# Display the test-set accuracy.
# NOTE(review): this number comes from a single hold-out split; confirm it
# with cross-validation before relying on it.
rf_acc

1.0

In [None]:
# Shut down the Spark session once the analysis is finished.
spark.stop()