In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier

In [3]:
# 1. Load Data
# The dataset used in this project was obtained from the Big Data Processing course.
spark = SparkSession.builder.getOrCreate()

df_train = spark.read.option("inferschema", "true").csv("Classification_Train.csv", header = True)
df_test = spark.read.option("inferschema", "true").csv("Classification_Test.csv", header = True)

df_train.show()
df_test.show()

+-------------------+------+------+---------------+---------+-------+-------------+---------+
|               Name|Gender|Height|Education Level|Eye Color|Married|Salary Income|Depressed|
+-------------------+------+------+---------------+---------+-------+-------------+---------+
|      Sax Tesseyman|Female|   174|   Intermediate|     Blue|    Yes|     85000000|       No|
|        Niels Greet|  Male|   165|   Intermediate|    Black|     No|     14000000|       No|
|     Minetta Santry|Female|   160|            Low|    Black|     No|    148000000|      Yes|
|     Sherm Gossipin|Female|   144|           High|    Black|     No|     50000000|      Yes|
|   Cathie Blackmuir|  Male|   168|   Intermediate|    Black|    Yes|    101000000|       No|
|     Early Cardenas|  Male|   151|            Low|    Black|    Yes|    145000000|      Yes|
|   Willard Pendrick|Female|   141|   Intermediate|    Brown|     No|     55000000|      Yes|
|   Penelopa Spensly|Female|   144|   Intermediate|     Blue

In [4]:
# 2. Select Features
df_train = df_train.select("Education Level", "Married", "Salary Income", "Depressed")
df_test = df_test.select("Education Level", "Married", "Salary Income", "Depressed")

df_train.show()
df_test.show()

+---------------+-------+-------------+---------+
|Education Level|Married|Salary Income|Depressed|
+---------------+-------+-------------+---------+
|   Intermediate|    Yes|     85000000|       No|
|   Intermediate|     No|     14000000|       No|
|            Low|     No|    148000000|      Yes|
|           High|     No|     50000000|      Yes|
|   Intermediate|    Yes|    101000000|       No|
|            Low|    Yes|    145000000|      Yes|
|   Intermediate|     No|     55000000|      Yes|
|   Intermediate|    Yes|     51000000|       No|
|           High|     No|     97000000|      Yes|
|            Low|     No|     41000000|      Yes|
|           High|    Yes|     27000000|       No|
|           High|    Yes|      3000000|       No|
|           High|     No|      9000000|      Yes|
|   Intermediate|     No|     12000000|      Yes|
|   Intermediate|    Yes|     81000000|       No|
|           High|     No|     53000000|      Yes|
|            Low|     No|     10000000|      Yes|


In [5]:
# 3. Data Preprocessing
df_train = df_train.na.drop()
df_test = df_test.na.drop()

df_train.show()
df_test.show()

+---------------+-------+-------------+---------+
|Education Level|Married|Salary Income|Depressed|
+---------------+-------+-------------+---------+
|   Intermediate|    Yes|     85000000|       No|
|   Intermediate|     No|     14000000|       No|
|            Low|     No|    148000000|      Yes|
|           High|     No|     50000000|      Yes|
|   Intermediate|    Yes|    101000000|       No|
|            Low|    Yes|    145000000|      Yes|
|   Intermediate|     No|     55000000|      Yes|
|   Intermediate|    Yes|     51000000|       No|
|           High|     No|     97000000|      Yes|
|            Low|     No|     41000000|      Yes|
|           High|    Yes|     27000000|       No|
|           High|    Yes|      3000000|       No|
|           High|     No|      9000000|      Yes|
|   Intermediate|     No|     12000000|      Yes|
|   Intermediate|    Yes|     81000000|       No|
|           High|     No|     53000000|      Yes|
|            Low|     No|     10000000|      Yes|


In [6]:
# 4. Transform Data
df_train = df_train.withColumn("Education Level", when(df_train["Education Level"] == "Low", 0).
                                                  when(df_train["Education Level"] == "High", 1).
                                                  when(df_train["Education Level"] == "Intermediate", 2))
df_train = df_train.withColumn("Married", when(df_train["Married"] == "No", 0).
                                          when(df_train["Married"] == "Yes", 1))
df_train = df_train.withColumn("Depressed", when(df_train["Depressed"] == "No", 0).
                                            when(df_train["Depressed"] == "Yes", 1))

df_test = df_test.withColumn("Education Level", when(df_test["Education Level"] == "Low", 0).
                                                  when(df_test["Education Level"] == "High", 1).
                                                  when(df_test["Education Level"] == "Intermediate", 2))
df_test = df_test.withColumn("Married", when(df_test["Married"] == "No", 0).
                                          when(df_test["Married"] == "Yes", 1))
df_test = df_test.withColumn("Depressed", when(df_test["Depressed"] == "No", 0).
                                            when(df_test["Depressed"] == "Yes", 1))

df_train.show()
df_test.show()

+---------------+-------+-------------+---------+
|Education Level|Married|Salary Income|Depressed|
+---------------+-------+-------------+---------+
|              2|      1|     85000000|        0|
|              2|      0|     14000000|        0|
|              0|      0|    148000000|        1|
|              1|      0|     50000000|        1|
|              2|      1|    101000000|        0|
|              0|      1|    145000000|        1|
|              2|      0|     55000000|        1|
|              2|      1|     51000000|        0|
|              1|      0|     97000000|        1|
|              0|      0|     41000000|        1|
|              1|      1|     27000000|        0|
|              1|      1|      3000000|        0|
|              1|      0|      9000000|        1|
|              2|      0|     12000000|        1|
|              2|      1|     81000000|        0|
|              1|      0|     53000000|        1|
|              0|      0|     10000000|        1|


In [7]:
# 5. Normalization

# training
cols = df_train.columns
cols.remove("Depressed")
df_train = VectorAssembler(inputCols = cols, outputCol = "Features").transform(df_train)

scaler = StandardScaler(inputCol = "Features", outputCol = "Scaled_Features")
df_train = scaler.fit(df_train).transform(df_train)

# testing
cols = df_test.columns
cols.remove("Depressed")
df_test = VectorAssembler(inputCols = cols, outputCol = "Features").transform(df_test)

scaler = StandardScaler(inputCol = "Features", outputCol = "Scaled_Features")
df_test = scaler.fit(df_test).transform(df_test)

df_train.select("Depressed", "Scaled_Features").show(10, False)
df_test.select("Depressed", "Scaled_Features").show(10, False)

+---------+----------------------------------------------------------+
|Depressed|Scaled_Features                                           |
+---------+----------------------------------------------------------+
|0        |[2.3989417624346427,2.0004291810120303,2.0520111267842664]|
|0        |[2.3989417624346427,0.0,0.3379783032350556]               |
|1        |[0.0,0.0,3.5729134913420166]                              |
|1        |[1.1994708812173214,0.0,1.2070653686966273]               |
|0        |[2.3989417624346427,2.0004291810120303,2.438272044767187] |
|1        |[0.0,2.0004291810120303,3.500489569220219]                |
|1        |[2.3989417624346427,0.0,1.32777190556629]                 |
|0        |[2.3989417624346427,2.0004291810120303,1.2312066760705598]|
|1        |[1.1994708812173214,0.0,2.341706815271457]                |
|1        |[0.0,0.0,0.9897936023312343]                              |
+---------+----------------------------------------------------------+
only s

In [8]:
# 6. Generate Model - Random Forest
model = RandomForestClassifier(featuresCol="Scaled_Features", labelCol="Depressed", numTrees=100).fit(df_train)

In [10]:
# 7. Model Testing and Evaluation - Random Forest
prediction = model.transform(df_test)

evaluator = BinaryClassificationEvaluator(labelCol="Depressed")
print("Random Forest Accuracy: {}%".format(evaluator.evaluate(prediction) * 100))

prediction.select("Scaled_Features", "Depressed", "prediction").show(20, False)

Random Forest Accuracy: 86.96379392960858%
+--------------------------------------------------------+---------+----------+
|Scaled_Features                                         |Depressed|prediction|
+--------------------------------------------------------+---------+----------+
|[0.0,0.0,1.8039685480507293]                            |1        |1.0       |
|[1.203601356181468,1.999503936496242,0.7069606472090696]|0        |0.0       |
|[2.407202712362936,1.999503936496242,3.2666457491729424]|0        |0.0       |
|[2.407202712362936,0.0,0.31691339357647946]             |1        |1.0       |
|[0.0,1.999503936496242,0.17064567346425819]             |1        |0.0       |
|[2.407202712362936,0.0,2.5840630553159096]              |1        |1.0       |
|[1.203601356181468,0.0,0.8532283673212908]              |1        |1.0       |
|[1.203601356181468,1.999503936496242,3.022866215652573] |0        |0.0       |
|[0.0,0.0,0.17064567346425819]                           |1        |1.0      