In [1]:
import findspark

In [2]:
findspark.init('/home/guha/spark-2.4.4-bin-hadoop2.7')

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('drg').getOrCreate()

In [5]:
data = spark.read.csv('Ecommerce_Customers.csv',inferSchema=True,header=True)

In [6]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- _c8: string (nullable = true)
 |-- Private: string (nullable = true)



In [7]:
data.show()

+--------------------+--------------------+----------------+------------------+-----------+---------------+--------------------+-------------------+----+-------+
|               Email|             Address|          Avatar|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent| _c8|Private|
+--------------------+--------------------+----------------+------------------+-----------+---------------+--------------------+-------------------+----+-------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|null|     No|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|null|    Yes|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|null|     No|
|riverarebecca@gma...|1414 D

In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent',
 '_c8',
 'Private']

In [10]:
assembler = VectorAssembler(
    inputCols=["Avg Session Length", "Time on App", 
               "Time on Website",'Length of Membership'],
    outputCol="features")

In [11]:
output = assembler.transform(data)

In [12]:
from pyspark.ml.feature import StringIndexer

In [13]:
indexer = StringIndexer(inputCol='Private',outputCol='PrivateIndex')

In [14]:
output_fixed = indexer.fit(output).transform(output)

In [15]:
output_fixed.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- _c8: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [16]:
final_data = output_fixed.select('features','PrivateIndex')

In [17]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [18]:
from pyspark.ml.classification import (DecisionTreeClassifier,GBTClassifier,RandomForestClassifier)

In [19]:
from pyspark.ml import Pipeline

In [20]:
dtc = DecisionTreeClassifier(labelCol='PrivateIndex',featuresCol='features')
rfc= RandomForestClassifier(labelCol='PrivateIndex',featuresCol='features')
gbt = GBTClassifier(labelCol='PrivateIndex',featuresCol='features')

In [21]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [22]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [23]:
dtc_preds.show()

+--------------------+------------+-------------+--------------------+----------+
|            features|PrivateIndex|rawPrediction|         probability|prediction|
+--------------------+------------+-------------+--------------------+----------+
|[30.57436368,11.3...|         1.0|    [0.0,5.0]|           [0.0,1.0]|       1.0|
|[30.73772037,12.6...|         1.0|    [0.0,9.0]|           [0.0,1.0]|       1.0|
|[31.06132516,12.3...|         1.0|    [0.0,5.0]|           [0.0,1.0]|       1.0|
|[31.06621816,11.7...|         1.0|    [0.0,5.0]|           [0.0,1.0]|       1.0|
|[31.26064687,13.2...|         1.0|   [1.0,20.0]|[0.04761904761904...|       1.0|
|[31.38958548,10.9...|         1.0|   [0.0,19.0]|           [0.0,1.0]|       1.0|
|[31.44744649,10.1...|         1.0|    [0.0,3.0]|           [0.0,1.0]|       1.0|
|[31.5171218,10.74...|         1.0|   [1.0,57.0]|[0.01724137931034...|       1.0|
|[31.52575242,11.3...|         1.0|    [0.0,5.0]|           [0.0,1.0]|       1.0|
|[31.53160448,13

In [24]:
rfc_preds.show()

+--------------------+------------+--------------------+--------------------+----------+
|            features|PrivateIndex|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+----------+
|[30.57436368,11.3...|         1.0|[5.25571152652272...|[0.26278557632613...|       1.0|
|[30.73772037,12.6...|         1.0|[3.65978595587864...|[0.18298929779393...|       1.0|
|[31.06132516,12.3...|         1.0|[5.02530216979758...|[0.25126510848987...|       1.0|
|[31.06621816,11.7...|         1.0|[5.18904485985606...|[0.25945224299280...|       1.0|
|[31.26064687,13.2...|         1.0|[1.47692307692307...|[0.07384615384615...|       1.0|
|[31.38958548,10.9...|         1.0|[1.59545817369293...|[0.07977290868464...|       1.0|
|[31.44744649,10.1...|         1.0|[8.01719019462921...|[0.40085950973146...|       1.0|
|[31.5171218,10.74...|         1.0|[0.91500837685255...|[0.04575041884262...|       1.0|
|[31.52575242,11.3...

In [25]:
gbt_preds.show()

+--------------------+------------+--------------------+--------------------+----------+
|            features|PrivateIndex|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+----------+
|[30.57436368,11.3...|         1.0|[-1.3596551768314...|[0.06184346666392...|       1.0|
|[30.73772037,12.6...|         1.0|[-1.5486530851644...|[0.04321850963778...|       1.0|
|[31.06132516,12.3...|         1.0|[-1.3986146881011...|[0.05747407894401...|       1.0|
|[31.06621816,11.7...|         1.0|[-1.3986146881011...|[0.05747407894401...|       1.0|
|[31.26064687,13.2...|         1.0|[-1.4596167804789...|[0.05121092813321...|       1.0|
|[31.38958548,10.9...|         1.0|[-1.3895779861859...|[0.05846099630538...|       1.0|
|[31.44744649,10.1...|         1.0|[-1.7648191335937...|[0.02848059493131...|       1.0|
|[31.5171218,10.74...|         1.0|[-1.4078323486835...|[0.05648353172334...|       1.0|
|[31.52575242,11.3...

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [27]:
my_bin_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [28]:
print('DTC')
print(my_bin_eval.evaluate(dtc_preds))

DTC
0.8287712287712288


In [29]:
print("Test Error = %g" % (1.0 - my_bin_eval.evaluate(dtc_preds) ))

Test Error = 0.171229


In [30]:
print('rfC')
print(my_bin_eval.evaluate(rfc_preds))

rfC
0.9587412587412587


In [31]:
print("Test Error = %g" % (1.0 - my_bin_eval.evaluate(rfc_preds) ))

Test Error = 0.0412587


In [32]:
print('GBT')
print(my_bin_eval.evaluate(gbt_preds))

GBT
0.9343656343656344


In [33]:
print("Test Error = %g" % (1.0 - my_bin_eval.evaluate(gbt_preds) ))

Test Error = 0.0656344


In [34]:
my_bin_eval2 = BinaryClassificationEvaluator(labelCol='PrivateIndex',rawPredictionCol='prediction')

In [35]:
print('DTC')
print(my_bin_eval2.evaluate(dtc_preds))

DTC
0.8362637362637363


In [36]:
print("Test Error = %g" % (1.0 - my_bin_eval2.evaluate(dtc_preds) ))

Test Error = 0.163736


In [37]:
print('RFC')
print(my_bin_eval2.evaluate(rfc_preds))

RFC
0.8942057942057942


In [38]:
print("Test Error = %g" % (1.0 - my_bin_eval2.evaluate(rfc_preds) ))

Test Error = 0.105794


In [39]:
print('GBT')
print(my_bin_eval2.evaluate(gbt_preds))

GBT
0.8297702297702296


In [40]:
print("Test Error = %g" % (1.0 - my_bin_eval2.evaluate(gbt_preds) ))

Test Error = 0.17023


In [41]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [42]:
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',metricName='accuracy')

In [44]:
dtc_acc = acc_eval.evaluate(dtc_preds)
rfc_acc = acc_eval.evaluate(rfc_preds)
gbt_acc = acc_eval.evaluate(gbt_preds)

In [45]:
dtc_acc

0.8380281690140845

In [46]:
rfc_acc

0.8943661971830986

In [47]:
gbt_acc

0.8309859154929577

In [48]:
print("error rate:",1-dtc_acc)

error rate: 0.1619718309859155


In [49]:
print("error rate:",1-rfc_acc)

error rate: 0.10563380281690138


In [63]:
print("error rate:",1-gbt_acc)

error rate: 0.16901408450704225


In [None]:
#logistic regression

In [51]:
from pyspark.ml.classification import LogisticRegression

In [52]:
lr = LogisticRegression(labelCol='PrivateIndex')

In [53]:
fit_model = lr.fit(train_data)

In [54]:
train_sum = fit_model.summary

In [55]:
train_sum.predictions.describe().show()

+-------+-------------------+------------------+
|summary|       PrivateIndex|        prediction|
+-------+-------------------+------------------+
|  count|                358|               358|
|   mean|0.46368715083798884|0.4581005586592179|
| stddev|0.49937757606433764|0.4989386729683723|
|    min|                0.0|               0.0|
|    max|                1.0|               1.0|
+-------+-------------------+------------------+



In [56]:
pl = fit_model.evaluate(test_data)

In [57]:
pl.predictions.show()

+--------------------+------------+--------------------+--------------------+----------+
|            features|PrivateIndex|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+----------+
|[30.57436368,11.3...|         1.0|[-4.0532762178330...|[0.01706897909141...|       1.0|
|[30.73772037,12.6...|         1.0|[-3.6101437152721...|[0.02633563421754...|       1.0|
|[31.06132516,12.3...|         1.0|[-0.6530914180502...|[0.34229323186472...|       1.0|
|[31.06621816,11.7...|         1.0|[-2.6452603866882...|[0.06628173459938...|       1.0|
|[31.26064687,13.2...|         1.0|[-5.3733720142478...|[0.00461704793296...|       1.0|
|[31.38958548,10.9...|         1.0|[-5.7991833864377...|[0.00302087476198...|       1.0|
|[31.44744649,10.1...|         1.0|[-4.5587676841338...|[0.01036637212411...|       1.0|
|[31.5171218,10.74...|         1.0|[-14.213453184397...|[6.71700113232204...|       1.0|
|[31.52575242,11.3...

In [58]:
eva1 = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='PrivateIndex')

In [60]:
eva1.evaluate(pl.predictions)

0.9148851148851148

In [61]:
eva2 = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='PrivateIndex',
                                             metricName='accuracy')

In [62]:
eva2.evaluate(pl.predictions)

0.9154929577464789