In [1]:
import findspark

In [2]:
findspark.init('/home/guha/spark-2.4.4-bin-hadoop2.7')

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('drg').getOrCreate()

In [5]:
data = spark.read.csv('Ecommerce_Customers.csv',inferSchema=True,header=True)

In [6]:
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- _c8: string (nullable = true)
 |-- Private: string (nullable = true)



In [7]:
data.show()

+--------------------+--------------------+----------------+------------------+-----------+---------------+--------------------+-------------------+----+-------+
|               Email|             Address|          Avatar|Avg Session Length|Time on App|Time on Website|Length of Membership|Yearly Amount Spent| _c8|Private|
+--------------------+--------------------+----------------+------------------+-----------+---------------+--------------------+-------------------+----+-------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet|       34.49726773|12.65565115|    39.57766802|         4.082620633|         587.951054|null|     No|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen|       31.92627203|11.10946073|    37.26895887|         2.664034182|        392.2049334|null|    Yes|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|       33.00091476|11.33027806|    37.11059744|         4.104543202|        487.5475049|null|     No|
|riverarebecca@gma...|1414 D

In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
data.columns

['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent',
 '_c8',
 'Private']

In [10]:
assembler = VectorAssembler(
    inputCols=["Avg Session Length", "Time on App", 
               "Time on Website",'Length of Membership'],
    outputCol="features")

In [11]:
output = assembler.transform(data)

In [12]:
from pyspark.ml.feature import StringIndexer

In [13]:
indexer = StringIndexer(inputCol='Private',outputCol='PrivateIndex')

In [14]:
output_fixed = indexer.fit(output).transform(output)

In [15]:
output_fixed.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)
 |-- _c8: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [16]:
final_data = output_fixed.select('features','PrivateIndex')

In [17]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

In [18]:
from pyspark.ml.classification import (DecisionTreeClassifier,GBTClassifier,RandomForestClassifier)

In [19]:
from pyspark.ml import Pipeline

In [20]:
dtc = DecisionTreeClassifier(labelCol='PrivateIndex',featuresCol='features')
rfc= RandomForestClassifier(labelCol='PrivateIndex',featuresCol='features')
gbt = GBTClassifier(labelCol='PrivateIndex',featuresCol='features')

In [21]:
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

In [22]:
dtc_preds = dtc_model.transform(test_data)
rfc_preds = rfc_model.transform(test_data)
gbt_preds = gbt_model.transform(test_data)

In [23]:
dtc_preds.show()

+--------------------+------------+-------------+--------------------+----------+
|            features|PrivateIndex|rawPrediction|         probability|prediction|
+--------------------+------------+-------------+--------------------+----------+
|[30.39318454,11.8...|         1.0|   [7.0,95.0]|[0.06862745098039...|       1.0|
|[30.83643267,13.1...|         1.0|    [0.0,8.0]|           [0.0,1.0]|       1.0|
|[30.97167564,11.7...|         1.0|    [3.0,0.0]|           [1.0,0.0]|       0.0|
|[31.06621816,11.7...|         1.0|    [0.0,9.0]|           [0.0,1.0]|       1.0|
|[31.26810421,12.1...|         1.0|    [0.0,8.0]|           [0.0,1.0]|       1.0|
|[31.28344748,12.7...|         0.0|    [3.0,0.0]|           [1.0,0.0]|       0.0|
|[31.3123496,11.68...|         1.0|    [0.0,9.0]|           [0.0,1.0]|       1.0|
|[31.57020083,13.3...|         0.0|    [1.0,1.0]|           [0.5,0.5]|       0.0|
|[31.57613197,12.5...|         0.0|    [3.0,0.0]|           [1.0,0.0]|       0.0|
|[31.6005122,12.

In [24]:
rfc_preds.show()

+--------------------+------------+--------------------+--------------------+----------+
|            features|PrivateIndex|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+----------+
|[30.39318454,11.8...|         1.0|[0.59990595028591...|[0.02999529751429...|       1.0|
|[30.83643267,13.1...|         1.0|[3.33016533016533...|[0.16650826650826...|       1.0|
|[30.97167564,11.7...|         1.0|[6.84713866690610...|[0.34235693334530...|       1.0|
|[31.06621816,11.7...|         1.0|[3.16099148075892...|[0.15804957403794...|       1.0|
|[31.26810421,12.1...|         1.0|[1.44862204712887...|[0.07243110235644...|       1.0|
|[31.28344748,12.7...|         0.0|[13.9140350877192...|[0.69570175438596...|       0.0|
|[31.3123496,11.68...|         1.0|[1.76987970481380...|[0.08849398524069...|       1.0|
|[31.57020083,13.3...|         0.0|[12.0841623135740...|[0.60420811567870...|       0.0|
|[31.57613197,12.5...

In [25]:
gbt_preds.show()

+--------------------+------------+--------------------+--------------------+----------+
|            features|PrivateIndex|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+----------+
|[30.39318454,11.8...|         1.0|[-1.5375150800900...|[0.04414906778345...|       1.0|
|[30.83643267,13.1...|         1.0|[-1.9259033288093...|[0.02079951663574...|       1.0|
|[30.97167564,11.7...|         1.0|[0.58777718868175...|[0.76414752768915...|       0.0|
|[31.06621816,11.7...|         1.0|[-1.4163648811446...|[0.05558093412695...|       1.0|
|[31.26810421,12.1...|         1.0|[-1.9342719114487...|[0.02046135264292...|       1.0|
|[31.28344748,12.7...|         0.0|[1.54856838391803...|[0.95677448491356...|       0.0|
|[31.3123496,11.68...|         1.0|[-1.3397325499117...|[0.06419600297720...|       1.0|
|[31.57020083,13.3...|         0.0|[0.52650332238968...|[0.74135183941010...|       0.0|
|[31.57613197,12.5...

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [27]:
my_bin_eval = BinaryClassificationEvaluator(labelCol='PrivateIndex')

In [28]:
print('DTC')
print(my_bin_eval.evaluate(dtc_preds))

DTC
0.8635294117647058


In [29]:
print("Test Error = %g" % (1.0 - my_bin_eval.evaluate(dtc_preds) ))

Test Error = 0.136471


In [30]:
print('rfC')
print(my_bin_eval.evaluate(rfc_preds))

rfC
0.9427450980392158


In [31]:
print("Test Error = %g" % (1.0 - my_bin_eval.evaluate(rfc_preds) ))

Test Error = 0.0572549


In [32]:
print('GBT')
print(my_bin_eval.evaluate(gbt_preds))

GBT
0.9304901960784313


In [33]:
print("Test Error = %g" % (1.0 - my_bin_eval.evaluate(gbt_preds) ))

Test Error = 0.0695098


In [34]:
my_bin_eval2 = BinaryClassificationEvaluator(labelCol='PrivateIndex',rawPredictionCol='prediction')

In [35]:
print('DTC')
print(my_bin_eval2.evaluate(dtc_preds))

DTC
0.8411764705882352


In [36]:
print("Test Error = %g" % (1.0 - my_bin_eval2.evaluate(dtc_preds) ))

Test Error = 0.158824


In [37]:
print('RFC')
print(my_bin_eval2.evaluate(rfc_preds))

RFC
0.8696078431372549


In [38]:
print("Test Error = %g" % (1.0 - my_bin_eval2.evaluate(rfc_preds) ))

Test Error = 0.130392


In [39]:
print('GBT')
print(my_bin_eval2.evaluate(gbt_preds))

GBT
0.8637254901960785


In [40]:
print("Test Error = %g" % (1.0 - my_bin_eval2.evaluate(gbt_preds) ))

Test Error = 0.136275


In [41]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [42]:
acc_eval = MulticlassClassificationEvaluator(labelCol='PrivateIndex',metricName='accuracy')

In [43]:
dtc_acc = acc_eval.evaluate(dtc_preds)
rfc_acc = acc_eval.evaluate(rfc_preds)
gbt_acc = acc_eval.evaluate(gbt_preds)

In [44]:
dtc_acc

0.8482758620689655

In [45]:
rfc_acc

0.8758620689655172

In [46]:
gbt_acc

0.8689655172413793

In [47]:
print("error rate:",1-dtc_acc)

error rate: 0.15172413793103445


In [48]:
print("error rate:",1-rfc_acc)

error rate: 0.12413793103448278


In [49]:
print("error rate:",1-gbt_acc)

error rate: 0.13103448275862073


In [50]:
from pyspark.ml.classification import LogisticRegression

In [61]:
lr_churn = LogisticRegression(labelCol='PrivateIndex')

In [62]:
fitted_churn_model = lr_churn.fit(train_data)

In [63]:
training_sum = fitted_churn_model.summary

In [64]:
training_sum.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|       PrivateIndex|         prediction|
+-------+-------------------+-------------------+
|  count|                355|                355|
|   mean|0.48169014084507045|0.48169014084507045|
| stddev| 0.5003698798992924|  0.500369879899292|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



In [74]:
pred_and_labels = fitted_churn_model.evaluate(test_data)

In [75]:
pred_and_labels.predictions.show()

+--------------------+------------+--------------------+--------------------+----------+
|            features|PrivateIndex|       rawPrediction|         probability|prediction|
+--------------------+------------+--------------------+--------------------+----------+
|[30.39318454,11.8...|         1.0|[-11.498428753218...|[1.01459200460736...|       1.0|
|[30.83643267,13.1...|         1.0|[-2.4785060739502...|[0.07737878816185...|       1.0|
|[30.97167564,11.7...|         1.0|[-1.1092673861925...|[0.24800749564712...|       1.0|
|[31.06621816,11.7...|         1.0|[-2.6466956942616...|[0.06619296089362...|       1.0|
|[31.26810421,12.1...|         1.0|[-5.1905252228963...|[0.00553823814075...|       1.0|
|[31.28344748,12.7...|         0.0|[4.16891105545649...|[0.98476655172961...|       0.0|
|[31.3123496,11.68...|         1.0|[-3.1421734088255...|[0.04140077773704...|       1.0|
|[31.57020083,13.3...|         0.0|[3.85977135756776...|[0.97936208196001...|       0.0|
|[31.57613197,12.5...

In [85]:
eva1 = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='PrivateIndex')

In [86]:
eva1.evaluate(pred_and_labels.predictions)

0.8838235294117648

In [93]:
eva2 = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='PrivateIndex',
                                             metricName='accuracy')

In [94]:
eva2.evaluate(pred_and_labels.predictions)

0.8896551724137931