In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.classification  import LogisticRegression, DecisionTreeClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.sql.functions import *

In [2]:
spark=SparkSession.builder.appName('SparkLogistic').getOrCreate()

In [3]:
df = spark.read.csv('sales_data.csv', header = True, inferSchema=True)
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Month: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Customer_Age: integer (nullable = true)
 |-- Age_Group: string (nullable = true)
 |-- Customer_Gender: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Product_Category: string (nullable = true)
 |-- Sub_Category: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Order_Quantity: integer (nullable = true)
 |-- Unit_Cost: integer (nullable = true)
 |-- Unit_Price: integer (nullable = true)
 |-- Profit: integer (nullable = true)
 |-- Cost: integer (nullable = true)
 |-- Revenue: integer (nullable = true)



In [4]:
stringIndex = StringIndexer(inputCol='Customer_Gender', outputCol='Gender_Index')
indexed = stringIndex.fit(df).transform(df)

In [5]:
featureAssembler = VectorAssembler(inputCols=['Unit_Price', 'Profit','Cost','Revenue'], outputCol='Features')
df2 = featureAssembler.transform(indexed)

In [6]:
df3 = df2.select(['Features','Gender_Index'])
df3.show(13)

+--------------------+------------+
|            Features|Gender_Index|
+--------------------+------------+
|[120.0,590.0,360....|         0.0|
|[120.0,590.0,360....|         0.0|
|[120.0,1366.0,103...|         0.0|
|[120.0,1188.0,900...|         0.0|
|[120.0,238.0,180....|         1.0|
|[120.0,297.0,225....|         1.0|
|[120.0,199.0,180....|         1.0|
|[120.0,100.0,90.0...|         1.0|
|[120.0,1096.0,990...|         0.0|
|[120.0,1046.0,945...|         0.0|
|[120.0,398.0,360....|         1.0|
|[120.0,398.0,360....|         1.0|
|[120.0,349.0,315....|         0.0|
+--------------------+------------+
only showing top 13 rows



In [7]:
scaler = StandardScaler(inputCol='Features', outputCol='Scaler')
scaledModel = scaler.fit(df3).transform(df3)
scaledModel.show()

+--------------------+------------+--------------------+
|            Features|Gender_Index|              Scaler|
+--------------------+------------+--------------------+
|[120.0,590.0,360....|         0.0|[0.13014179109459...|
|[120.0,590.0,360....|         0.0|[0.13014179109459...|
|[120.0,1366.0,103...|         0.0|[0.13014179109459...|
|[120.0,1188.0,900...|         0.0|[0.13014179109459...|
|[120.0,238.0,180....|         1.0|[0.13014179109459...|
|[120.0,297.0,225....|         1.0|[0.13014179109459...|
|[120.0,199.0,180....|         1.0|[0.13014179109459...|
|[120.0,100.0,90.0...|         1.0|[0.13014179109459...|
|[120.0,1096.0,990...|         0.0|[0.13014179109459...|
|[120.0,1046.0,945...|         0.0|[0.13014179109459...|
|[120.0,398.0,360....|         1.0|[0.13014179109459...|
|[120.0,398.0,360....|         1.0|[0.13014179109459...|
|[120.0,349.0,315....|         0.0|[0.13014179109459...|
|[120.0,349.0,315....|         0.0|[0.13014179109459...|
|[120.0,369.0,225....|         

In [8]:
train_data, test_data = scaledModel.randomSplit([.8, .2], seed=1233)

In [9]:
train_data.groupBy('Gender_Index').agg(count('Scaler')).show()

+------------+-------------+
|Gender_Index|count(Scaler)|
+------------+-------------+
|         0.0|        46621|
|         1.0|        43728|
+------------+-------------+



### Inisialisasi Logistic Regression

In [10]:
lr = LogisticRegression(labelCol='Gender_Index', featuresCol='Scaler', maxIter=20, regParam=0.3)
lModel = lr.fit(train_data)

In [11]:
pred = lModel.transform(test_data)

In [12]:
pred.printSchema()

root
 |-- Features: vector (nullable = true)
 |-- Gender_Index: double (nullable = false)
 |-- Scaler: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [13]:
selected =pred.select("Gender_Index", "prediction", "probability")
selected.show(20)

+------------+----------+--------------------+
|Gender_Index|prediction|         probability|
+------------+----------+--------------------+
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0|       0.0|[0.51818049614913...|
|         0.0

### Evaluated

In [14]:
pred.groupBy('prediction').agg(count('prediction')).show()

+----------+-----------------+
|prediction|count(prediction)|
+----------+-----------------+
|       0.0|            22531|
|       1.0|              156|
+----------+-----------------+



In [15]:
pred.groupBy('Gender_Index').agg(count('Gender_Index')).show()

+------------+-------------------+
|Gender_Index|count(Gender_Index)|
+------------+-------------------+
|         0.0|              11691|
|         1.0|              10996|
+------------+-------------------+



In [16]:
cm = pred.select("Gender_Index", "prediction")

In [17]:
cm.filter(cm.Gender_Index == cm.prediction).count() / cm.count()

0.5151408295499625

In [18]:
### Use ROC 
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate model
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='Gender_Index')
print(evaluator.evaluate(pred))
print(evaluator.getMetricName())

0.5027836538968657
areaUnderROC


### Decision Tree Classifier

In [19]:
dt = DecisionTreeClassifier(featuresCol = 'Features', labelCol = 'Gender_Index', maxDepth = 3)
dtModel = dt.fit(train_data)

In [20]:
predictions = dtModel.transform(test_data)
predictions.printSchema()

root
 |-- Features: vector (nullable = true)
 |-- Gender_Index: double (nullable = false)
 |-- Scaler: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [21]:
selected1 =predictions.select("Gender_Index", "prediction", "probability")
selected1.show(20)

+------------+----------+--------------------+
|Gender_Index|prediction|         probability|
+------------+----------+--------------------+
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0|       0.0|[0.52062311453240...|
|         0.0

In [22]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='Gender_Index')
print("Test Area Under ROC: " + str(evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})))

Test Area Under ROC: 0.49389818239828365


In [23]:
dtc = predictions.select("Gender_Index", "prediction")

In [24]:
dtc.filter(dtc.Gender_Index == dtc.prediction).count() / dtc.count()

0.5159342354652444

### Perbaikan

In [25]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer

In [27]:
labelIndexer = StringIndexer(inputCol="Gender_Index", outputCol="indexedLabel").fit(df3)

In [29]:
featureIndexer =VectorIndexer(inputCol="Features", outputCol="indexedFeatures", maxCategories=4).fit(df3)

In [30]:
# Split the data into training and test sets (30% held out for testing)
trainingData, testData = df3.randomSplit([0.7, 0.3])

In [31]:
# Train a GBT model.
gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

In [32]:
 # Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])

In [33]:
# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

In [34]:
 # Make predictions.
predictions = model.transform(testData)

In [35]:
# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

+----------+------------+-----------------+
|prediction|indexedLabel|         features|
+----------+------------+-----------------+
|       1.0|         0.0|[5.0,2.0,2.0,4.0]|
|       1.0|         0.0|[5.0,2.0,2.0,4.0]|
|       1.0|         0.0|[5.0,2.0,2.0,4.0]|
|       1.0|         0.0|[5.0,2.0,2.0,4.0]|
|       1.0|         0.0|[5.0,2.0,2.0,4.0]|
+----------+------------+-----------------+
only showing top 5 rows



In [37]:
# Select (prediction, true label) and compute test error
evaluator = BinaryClassificationEvaluator(labelCol="indexedLabel", rawPredictionCol='rawPrediction')
accuracy = evaluator.evaluate(predictions)
print('Accuracy:', accuracy)

Accuracy: 0.5407045045816932
