In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.types import DoubleType,IntegerType
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import SparseVector
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
spark = SparkSession.builder.appName("Classification").getOrCreate()

In [10]:
adult_data_df = spark.read.load("adult.csv", format="csv",delimiter=",", header=True)
adult_data_df.show()

+---+----------------+------+------------+---------------+------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
|age|       workclass|fnlwgt|   education|educational-num|    marital-status|       occupation| relationship|              race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+----------------+------+------------+---------------+------------------+-----------------+-------------+------------------+------+------------+------------+--------------+--------------+------+
| 25|         Private|226802|        11th|              7|     Never-married|Machine-op-inspct|    Own-child|             Black|  Male|           0|           0|            40| United-States| <=50K|
| 38|         Private| 89814|     HS-grad|              9|Married-civ-spouse|  Farming-fishing|      Husband|             White|  Male|           0|           0|            50| United-States| <=50K|
| 28|

In [11]:
adult_data_df.printSchema()

root
 |-- age: string (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: string (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: string (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: string (nullable = true)
 |-- capital-loss: string (nullable = true)
 |-- hours-per-week: string (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [14]:
adult_data_df = adult_data_df.withColumn("age", adult_data_df["age"].cast(IntegerType()))
adult_data_df = adult_data_df.withColumn("fnlwgt", adult_data_df["fnlwgt"].cast(IntegerType()))
adult_data_df = adult_data_df.withColumn("educational-num", adult_data_df["educational-num"].cast(IntegerType()))
adult_data_df = adult_data_df.withColumn("capital-gain", adult_data_df["capital-gain"].cast(IntegerType()))
adult_data_df = adult_data_df.withColumn("capital-loss", adult_data_df["capital-loss"].cast(IntegerType()))
adult_data_df = adult_data_df.withColumn("hours-per-week", adult_data_df["hours-per-week"].cast(IntegerType()))

In [15]:
adult_data_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [19]:
adult_data_df.select(['hours-per-week']).show()

+--------------+
|hours-per-week|
+--------------+
|            40|
|            50|
|            40|
|            40|
|            30|
|            30|
|            40|
|            32|
|            40|
|            10|
|            40|
|            40|
|            39|
|            35|
|            48|
|            50|
|            25|
|            30|
|            20|
|            45|
+--------------+
only showing top 20 rows



In [20]:
adult_data_df = adult_data_df.withColumn("label", adult_data_df['hours-per-week'] - 0)
adult_data_df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- label: integer (nullable = true)



In [21]:
adult_data_df.select(['label']).show()

+-----+
|label|
+-----+
|   40|
|   50|
|   40|
|   40|
|   30|
|   30|
|   40|
|   32|
|   40|
|   10|
|   40|
|   40|
|   39|
|   35|
|   48|
|   50|
|   25|
|   30|
|   20|
|   45|
+-----+
only showing top 20 rows



In [22]:
assem = VectorAssembler(inputCols=adult_data_df.columns[10:13], outputCol='features')
x = assem.transform(adult_data_df)
x.show(5)

+---+---------+------+------------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+-----+-----------------+
|age|workclass|fnlwgt|   education|educational-num|    marital-status|       occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|label|         features|
+---+---------+------+------------+---------------+------------------+-----------------+------------+-----+------+------------+------------+--------------+--------------+------+-----+-----------------+
| 25|  Private|226802|        11th|              7|     Never-married|Machine-op-inspct|   Own-child|Black|  Male|           0|           0|            40| United-States| <=50K|   40|   [0.0,0.0,40.0]|
| 38|  Private| 89814|     HS-grad|              9|Married-civ-spouse|  Farming-fishing|     Husband|White|  Male|           0|           0|            50| United-States| <=50K|   50|   [0.0,0

In [24]:
train,test = x.randomSplit([0.6, 0.4], 1234)
nb1 = NaiveBayes(smoothing=1.0, modelType="multinomial")
model1 = nb1.fit(train)
predictions = model1.transform(test)
predictions.show(3)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+---+---------+------+---------+---------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+-----+--------------+--------------------+--------------------+----------+
|age|workclass|fnlwgt|education|educational-num|marital-status|occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|label|      features|       rawPrediction|         probability|prediction|
+---+---------+------+---------+---------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+-----+--------------+--------------------+--------------------+----------+
| 17|        ?| 27251|     11th|              7|       Widowed|         ?|   Own-child|White|  Male|           0|           0|            40| United-States| <=50K|   40|[0.0,0.0,40.0]|[-216.22124837812...|[5.33104903932566...|       8.0|
| 17|        ?| 34088|     12th|              8|

In [25]:
nb2 = NaiveBayes(smoothing=10.0, modelType="multinomial")

# train the model
model2 = nb2.fit(train)

# select example rows to display.
predictions = model2.transform(test)
predictions.show(3)
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+---+---------+------+---------+---------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+-----+--------------+--------------------+--------------------+----------+
|age|workclass|fnlwgt|education|educational-num|marital-status|occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|label|      features|       rawPrediction|         probability|prediction|
+---+---------+------+---------+---------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+-----+--------------+--------------------+--------------------+----------+
| 17|        ?| 27251|     11th|              7|       Widowed|         ?|   Own-child|White|  Male|           0|           0|            40| United-States| <=50K|   40|[0.0,0.0,40.0]|[-199.94766668123...|[3.45913667958139...|      79.0|
| 17|        ?| 34088|     12th|              8|

In [26]:
from pyspark.ml.classification import DecisionTreeClassifier
nb3 = DecisionTreeClassifier(labelCol="label", featuresCol="features")
# train the model
model3 = nb3.fit(train)
# select example rows to display.
predictions = model3.transform(test)
predictions.show(3)
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+---+---------+------+---------+---------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+-----+--------------+--------------------+--------------------+----------+
|age|workclass|fnlwgt|education|educational-num|marital-status|occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|label|      features|       rawPrediction|         probability|prediction|
+---+---------+------+---------+---------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+-----+--------------+--------------------+--------------------+----------+
| 17|        ?| 27251|     11th|              7|       Widowed|         ?|   Own-child|White|  Male|           0|           0|            40| United-States| <=50K|   40|[0.0,0.0,40.0]|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|      40.0|
| 17|        ?| 34088|     12th|              8|

In [27]:
from pyspark.ml.classification import RandomForestClassifier
# create the trainer and set its parameters
nb3 = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
# train the model
model3 = nb3.fit(train)
# select example rows to display.
predictions = model3.transform(test)
predictions.show(3)
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+---+---------+------+---------+---------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+-----+--------------+--------------------+--------------------+----------+
|age|workclass|fnlwgt|education|educational-num|marital-status|occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|label|      features|       rawPrediction|         probability|prediction|
+---+---------+------+---------+---------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+-----+--------------+--------------------+--------------------+----------+
| 17|        ?| 27251|     11th|              7|       Widowed|         ?|   Own-child|White|  Male|           0|           0|            40| United-States| <=50K|   40|[0.0,0.0,40.0]|[0.0,5.3699924820...|[0.0,5.3699924820...|      40.0|
| 17|        ?| 34088|     12th|              8|

In [28]:
from pyspark.ml.classification import RandomForestClassifier
# create the trainer and set its parameters
nb3 = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100)
# train the model
model3 = nb3.fit(train)
# select example rows to display.
predictions = model3.transform(test)
predictions.show(3)
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+---+---------+------+---------+---------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+-----+--------------+--------------------+--------------------+----------+
|age|workclass|fnlwgt|education|educational-num|marital-status|occupation|relationship| race|gender|capital-gain|capital-loss|hours-per-week|native-country|income|label|      features|       rawPrediction|         probability|prediction|
+---+---------+------+---------+---------------+--------------+----------+------------+-----+------+------------+------------+--------------+--------------+------+-----+--------------+--------------------+--------------------+----------+
| 17|        ?| 27251|     11th|              7|       Widowed|         ?|   Own-child|White|  Male|           0|           0|            40| United-States| <=50K|   40|[0.0,0.0,40.0]|[0.0,0.0015785429...|[0.0,1.5785429419...|      40.0|
| 17|        ?| 34088|     12th|              8|