# PART 1 : Install Dependencies & Run Spark Session

In [130]:
#install pyspark
! pip install pyspark

import pyspark
from pyspark.ml.classification import LogisticRegression

from pyspark.sql.functions import when, col



In [131]:
#create a sparksession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("spark").getOrCreate()

# PART 2: Clone & Explore dataset

In [132]:
#clone the diabetes dataset from the github repository
# if you get error saying "Project-4 directory already exists, move to next cell"
! git clone https://github.com/elluis1001/Project-4/

fatal: destination path 'Project-4' already exists and is not an empty directory.


In [133]:
#check if the dataset exists
! ls /content/Project-4/Luis/Dataset/

diabetes_prediction_dataset.csv


In [134]:
#create spark dataframe
df_diabetes_data = spark.read.csv("/content/Project-4/Luis/Dataset/diabetes_prediction_dataset.csv", header=True, inferSchema=True)

In [135]:
#display the dataframe
df_diabetes_data.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|Female|80.0|           0|            1|          never|25.19|        6.6|                140|       0|
|Female|54.0|           0|            0|        No Info|27.32|        6.6|                 80|       0|
|  Male|28.0|           0|            0|          never|27.32|        5.7|                158|       0|
|Female|36.0|           0|            0|        current|23.45|        5.0|                155|       0|
|  Male|76.0|           1|            1|        current|20.14|        4.8|                155|       0|
|Female|20.0|           0|            0|          never|27.32|        6.6|                 85|       0|
|Female|44.0|           0|            0|          never|19.31|  

In [136]:
#show amount of rows
df_diabetes_data.count()

100000

In [137]:
#print the schema
df_diabetes_data.printSchema()

root
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- smoking_history: string (nullable = true)
 |-- bmi: double (nullable = true)
 |-- HbA1c_level: double (nullable = true)
 |-- blood_glucose_level: integer (nullable = true)
 |-- diabetes: integer (nullable = true)



In [138]:
#count the total no. of diabetic and non-diabetic class (values of 1 indicating the presence of diabetes and 0 indicating the absence of diabetes)
print((df_diabetes_data.count(), len(df_diabetes_data.columns)))
df_diabetes_data.groupBy('diabetes').count().show()

(100000, 9)
+--------+-----+
|diabetes|count|
+--------+-----+
|       1| 8500|
|       0|91500|
+--------+-----+



In [139]:
#count the total no. of gender types
print((df_diabetes_data.count(), len(df_diabetes_data.columns)))
df_diabetes_data.groupBy('gender').count().show()

(100000, 9)
+------+-----+
|gender|count|
+------+-----+
|Female|58552|
| Other|   18|
|  Male|41430|
+------+-----+



In [140]:
#check to see if there are any empty values in the 'gender' column
df_diabetes_data[df_diabetes_data['gender'] == '']

DataFrame[gender: string, age: double, hypertension: int, heart_disease: int, smoking_history: string, bmi: double, HbA1c_level: double, blood_glucose_level: int, diabetes: int]

In [141]:
#get the summary statistics
df_diabetes_data.describe().show()

+-------+------+-----------------+------------------+------------------+---------------+-----------------+------------------+-------------------+-------------------+
|summary|gender|              age|      hypertension|     heart_disease|smoking_history|              bmi|       HbA1c_level|blood_glucose_level|           diabetes|
+-------+------+-----------------+------------------+------------------+---------------+-----------------+------------------+-------------------+-------------------+
|  count|100000|           100000|            100000|            100000|         100000|           100000|            100000|             100000|             100000|
|   mean|  NULL|41.88585600000013|           0.07485|           0.03942|           NULL|27.32076709999422|5.5275069999983275|          138.05806|              0.085|
| stddev|  NULL|22.51683987161704|0.2631504702289171|0.1945930169980986|           NULL|6.636783416648357|1.0706720918835468|  40.70813604870383|0.27888308976661896|
|   

# PART 3: Data Cleaning & Preparation

In [142]:
#check for null values
for col in df_diabetes_data.columns:
  print(col + ":", df_diabetes_data[df_diabetes_data[col].isNull()].count())

gender: 0
age: 0
hypertension: 0
heart_disease: 0
smoking_history: 0
bmi: 0
HbA1c_level: 0
blood_glucose_level: 0
diabetes: 0


In [143]:
#look for the unnecessary values present
def count_zeros():
  columns_list = ["age", "bmi", "HbA1c_level", "blood_glucose_level"]
  for i in columns_list:
    print(i+":",df_diabetes_data[df_diabetes_data[i]==0].count())

In [144]:
count_zeros()

age: 0
bmi: 0
HbA1c_level: 0
blood_glucose_level: 0


In [145]:
#display the dataframe
df_diabetes_data.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|Female|80.0|           0|            1|          never|25.19|        6.6|                140|       0|
|Female|54.0|           0|            0|        No Info|27.32|        6.6|                 80|       0|
|  Male|28.0|           0|            0|          never|27.32|        5.7|                158|       0|
|Female|36.0|           0|            0|        current|23.45|        5.0|                155|       0|
|  Male|76.0|           1|            1|        current|20.14|        4.8|                155|       0|
|Female|20.0|           0|            0|          never|27.32|        6.6|                 85|       0|
|Female|44.0|           0|            0|          never|19.31|  

In [146]:
#drop the 'other' rows in the gender columns
string_to_remove = "Other"
df_diabetes_data = df_diabetes_data[df_diabetes_data['Gender'] != string_to_remove]

In [147]:
#count the total no. of gender types
print((df_diabetes_data.count(), len(df_diabetes_data.columns)))
df_diabetes_data.groupBy('gender').count().show()

(99982, 9)
+------+-----+
|gender|count|
+------+-----+
|Female|58552|
|  Male|41430|
+------+-----+



In [148]:
#count the total no. of smoker/non-smoker types
print((df_diabetes_data.count(), len(df_diabetes_data.columns)))
df_diabetes_data.groupBy('smoking_history').count().show()

(99982, 9)
+---------------+-----+
|smoking_history|count|
+---------------+-----+
|    not current| 6439|
|         former| 9352|
|        No Info|35810|
|        current| 9286|
|          never|35092|
|           ever| 4003|
+---------------+-----+



In [149]:
#drop the 'other' rows in the gender columns
string_to_remove_1= "No Info"
df_diabetes_data = df_diabetes_data[df_diabetes_data['smoking_history'] != string_to_remove_1]

In [150]:
#count the total no. of smoker/non-smoker types
print((df_diabetes_data.count(), len(df_diabetes_data.columns)))
df_diabetes_data.groupBy('smoking_history').count().show()

(64172, 9)
+---------------+-----+
|smoking_history|count|
+---------------+-----+
|    not current| 6439|
|         former| 9352|
|        current| 9286|
|          never|35092|
|           ever| 4003|
+---------------+-----+



In [151]:
#count the total no. of gender types
print((df_diabetes_data.count(), len(df_diabetes_data.columns)))
df_diabetes_data.groupBy('gender').count().show()

(64172, 9)
+------+-----+
|gender|count|
+------+-----+
|Female|38852|
|  Male|25320|
+------+-----+



In [152]:
#assign in the 'gender'column 'Female' = 0, and 'Male' = 1
from pyspark.sql.functions import when, col
df_diabetes_data = df_diabetes_data.withColumn("gender",
    when(col("gender") == "Female", 0).
    when(col("gender") == "Male", 1).
    otherwise(col("gender"))
)
df_diabetes_data.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|     0|80.0|           0|            1|          never|25.19|        6.6|                140|       0|
|     1|28.0|           0|            0|          never|27.32|        5.7|                158|       0|
|     0|36.0|           0|            0|        current|23.45|        5.0|                155|       0|
|     1|76.0|           1|            1|        current|20.14|        4.8|                155|       0|
|     0|20.0|           0|            0|          never|27.32|        6.6|                 85|       0|
|     0|44.0|           0|            0|          never|19.31|        6.5|                200|       1|
|     1|42.0|           0|            0|          never|33.64|  

In [153]:
#assign in the 'smoking_history': "never" = 0, "ever" = 1, "not current" = 2, "current" = 3, "former" = 4
df_diabetes_data = df_diabetes_data.withColumn("smoking_history",
    when(col("smoking_history") == "never", 0).
    when(col("smoking_history") == "ever", 1).
    when(col("smoking_history") == "not current", 2).
    when(col("smoking_history") == "current", 3).
    when(col("smoking_history") == "former", 4).
    otherwise(col("smoking_history"))
)
df_diabetes_data.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|     0|80.0|           0|            1|              0|25.19|        6.6|                140|       0|
|     1|28.0|           0|            0|              0|27.32|        5.7|                158|       0|
|     0|36.0|           0|            0|              3|23.45|        5.0|                155|       0|
|     1|76.0|           1|            1|              3|20.14|        4.8|                155|       0|
|     0|20.0|           0|            0|              0|27.32|        6.6|                 85|       0|
|     0|44.0|           0|            0|              0|19.31|        6.5|                200|       1|
|     1|42.0|           0|            0|              0|33.64|  

# PART 4: Correlation Analysis & Feature Selection

In [154]:
# gender and smoking_history needs to be converted to float data type for model to work
df_diabetes_data = df_diabetes_data.withColumn("gender", col("gender").cast('float'))
df_diabetes_data = df_diabetes_data.withColumn("smoking_history", col("smoking_history").cast('float'))
df_diabetes_data.show()


+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|   0.0|80.0|           0|            1|            0.0|25.19|        6.6|                140|       0|
|   1.0|28.0|           0|            0|            0.0|27.32|        5.7|                158|       0|
|   0.0|36.0|           0|            0|            3.0|23.45|        5.0|                155|       0|
|   1.0|76.0|           1|            1|            3.0|20.14|        4.8|                155|       0|
|   0.0|20.0|           0|            0|            0.0|27.32|        6.6|                 85|       0|
|   0.0|44.0|           0|            0|            0.0|19.31|        6.5|                200|       1|
|   1.0|42.0|           0|            0|            0.0|33.64|  

In [155]:
#find the correlation among the set of input & output variables
for i in df_diabetes_data.columns:
  print("Correlation to outcome for {} is {}".format(i, df_diabetes_data.stat.corr("diabetes",i)))

Correlation to outcome for gender is 0.05699689368565596
Correlation to outcome for age is 0.26084962459224337
Correlation to outcome for hypertension is 0.19222574901207254
Correlation to outcome for heart_disease is 0.16961397731730365
Correlation to outcome for smoking_history is 0.06472564826560573
Correlation to outcome for bmi is 0.20442115545137657
Correlation to outcome for HbA1c_level is 0.43889709468177335
Correlation to outcome for blood_glucose_level is 0.449697968864106
Correlation to outcome for diabetes is 1.0


In [156]:
#feature selection
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols = ['gender', 'age', 'hypertension', 'heart_disease',
                                         'smoking_history', 'bmi', 'HbA1c_level', 'blood_glucose_level'], outputCol='features')
output_data = assembler.transform(df_diabetes_data)

In [157]:
#print the schema
output_data.printSchema()

root
 |-- gender: float (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- smoking_history: float (nullable = true)
 |-- bmi: double (nullable = true)
 |-- HbA1c_level: double (nullable = true)
 |-- blood_glucose_level: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- features: vector (nullable = true)



In [158]:
#display dataframe
output_data.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+--------------------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|            features|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+--------------------+
|   0.0|80.0|           0|            1|            0.0|25.19|        6.6|                140|       0|[0.0,80.0,0.0,1.0...|
|   1.0|28.0|           0|            0|            0.0|27.32|        5.7|                158|       0|[1.0,28.0,0.0,0.0...|
|   0.0|36.0|           0|            0|            3.0|23.45|        5.0|                155|       0|[0.0,36.0,0.0,0.0...|
|   1.0|76.0|           1|            1|            3.0|20.14|        4.8|                155|       0|[1.0,76.0,1.0,1.0...|
|   0.0|20.0|           0|            0|            0.0|27.32|        6.6|                 85|       0|(8,[1,5,6,7],[20....|


# PART 5: Split Dataset & Build the Model

In [159]:
#create final data
from pyspark.ml.classification import LogisticRegression

final_data = output_data.select('features','diabetes')

In [160]:
#print schema of final data
final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- diabetes: integer (nullable = true)



In [161]:
#split the dataset ; build the model
train, test = final_data.randomSplit([0.7, 0.3])
models = LogisticRegression(labelCol= 'diabetes')
model = models.fit(train)

In [162]:
#summary of the model
summary = model.summary
summary.predictions.describe().show()

+-------+-------------------+-------------------+
|summary|           diabetes|         prediction|
+-------+-------------------+-------------------+
|  count|              44999|              44999|
|   mean|0.11075801684481877|0.08275739460876909|
| stddev| 0.3138357330331359| 0.2755182302122191|
|    min|                0.0|                0.0|
|    max|                1.0|                1.0|
+-------+-------------------+-------------------+



# PART 6: Evaluate and Save the Model

In [163]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.evaluate(test)

In [164]:
predictions.predictions.show(100)

+--------------------+--------+--------------------+--------------------+----------+
|            features|diabetes|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|(8,[1,5,6,7],[0.1...|       0|[6.66659070466195...|[0.99872888730703...|       0.0|
|(8,[1,5,6,7],[0.4...|       0|[6.41965140451192...|[0.99837342597758...|       0.0|
|(8,[1,5,6,7],[0.5...|       0|[5.89963629721055...|[0.99726704815053...|       0.0|
|(8,[1,5,6,7],[0.6...|       0|[6.70004466767949...|[0.99877065622437...|       0.0|
|(8,[1,5,6,7],[0.7...|       0|[6.75391814211413...|[0.99883505774933...|       0.0|
|(8,[1,5,6,7],[0.8...|       0|[7.26518320358277...|[0.99930101580871...|       0.0|
|(8,[1,5,6,7],[1.0...|       0|[7.77005573208879...|[0.99957798843231...|       0.0|
|(8,[1,5,6,7],[1.0...|       0|[7.55593907734568...|[0.99947727860187...|       0.0|
|(8,[1,5,6,7],[1.0...|       0|[11.9651117817629...|[0.9999936376

In [182]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol= 'rawPrediction', labelCol='diabetes')
evaluator.evaluate(model.transform(test))

0.9537557051036812

In [181]:
# save model
model.save("model")

Py4JJavaError: An error occurred while calling o2009.save.
: java.io.IOException: Path model already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


In [183]:
# load saved model back to the environment
from pyspark.ml.classification import LogisticRegressionModel

model = LogisticRegressionModel.load('model')

# PART 7: Prediction on New Data with the saved model

In [184]:
#create a new spark dataframe
test_df = spark.read.csv('/content/Project-4/Luis/Dataset/diabetes_prediction_dataset.csv', header=True, inferSchema=True)

In [185]:
#assign in the 'gender'column 'Female' = 0, and 'Male' = 1
from pyspark.sql.functions import when, col
test_df = test_df.withColumn("gender",
    when(col("gender") == "Female", 0).
    when(col("gender") == "Male", 1).
    otherwise(col("gender"))
)
test_df.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|     0|80.0|           0|            1|          never|25.19|        6.6|                140|       0|
|     0|54.0|           0|            0|        No Info|27.32|        6.6|                 80|       0|
|     1|28.0|           0|            0|          never|27.32|        5.7|                158|       0|
|     0|36.0|           0|            0|        current|23.45|        5.0|                155|       0|
|     1|76.0|           1|            1|        current|20.14|        4.8|                155|       0|
|     0|20.0|           0|            0|          never|27.32|        6.6|                 85|       0|
|     0|44.0|           0|            0|          never|19.31|  

In [186]:
#drop the 'other' rows in the gender columns
string_to_remove = "Other"
test_df = test_df[test_df['Gender'] != string_to_remove]

In [187]:
#count the total no. of gender types
print((test_df.count(), len(test_df.columns)))
test_df.groupBy('gender').count().show()

(99982, 9)
+------+-----+
|gender|count|
+------+-----+
|     0|58552|
|     1|41430|
+------+-----+



In [188]:
#assign in the 'smoking_history': "never" = 0, "ever" = 1, "not current" = 2, "current" = 3, "former" = 4
test_df = test_df.withColumn("smoking_history",
    when(col("smoking_history") == "never", 0).
    when(col("smoking_history") == "ever", 1).
    when(col("smoking_history") == "not current", 2).
    when(col("smoking_history") == "current", 3).
    when(col("smoking_history") == "former", 4).
    otherwise(col("smoking_history"))
)
test_df.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|     0|80.0|           0|            1|              0|25.19|        6.6|                140|       0|
|     0|54.0|           0|            0|        No Info|27.32|        6.6|                 80|       0|
|     1|28.0|           0|            0|              0|27.32|        5.7|                158|       0|
|     0|36.0|           0|            0|              3|23.45|        5.0|                155|       0|
|     1|76.0|           1|            1|              3|20.14|        4.8|                155|       0|
|     0|20.0|           0|            0|              0|27.32|        6.6|                 85|       0|
|     0|44.0|           0|            0|              0|19.31|  

In [189]:
#drop the 'other' rows in the gender columns
string_to_remove_1= "No Info"
test_df = test_df[test_df['smoking_history'] != string_to_remove_1]

In [190]:
#count the total no. of smoker/non-smoker types
print((test_df.count(), len(test_df.columns)))
test_df.groupBy('smoking_history').count().show()

(64172, 9)
+---------------+-----+
|smoking_history|count|
+---------------+-----+
|              3| 9286|
|              0|35092|
|              1| 4003|
|              4| 9352|
|              2| 6439|
+---------------+-----+



In [191]:
# gender and smoking_history needs to be converted to float data type for model to work
test_df = test_df.withColumn("gender", col("gender").cast('float'))
test_df = test_df.withColumn("smoking_history", col("smoking_history").cast('float'))
test_df.show()

+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|gender| age|hypertension|heart_disease|smoking_history|  bmi|HbA1c_level|blood_glucose_level|diabetes|
+------+----+------------+-------------+---------------+-----+-----------+-------------------+--------+
|   0.0|80.0|           0|            1|            0.0|25.19|        6.6|                140|       0|
|   1.0|28.0|           0|            0|            0.0|27.32|        5.7|                158|       0|
|   0.0|36.0|           0|            0|            3.0|23.45|        5.0|                155|       0|
|   1.0|76.0|           1|            1|            3.0|20.14|        4.8|                155|       0|
|   0.0|20.0|           0|            0|            0.0|27.32|        6.6|                 85|       0|
|   0.0|44.0|           0|            0|            0.0|19.31|        6.5|                200|       1|
|   1.0|42.0|           0|            0|            0.0|33.64|  

In [192]:
#print the schema
test_df.printSchema()

root
 |-- gender: float (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- smoking_history: float (nullable = true)
 |-- bmi: double (nullable = true)
 |-- HbA1c_level: double (nullable = true)
 |-- blood_glucose_level: integer (nullable = true)
 |-- diabetes: integer (nullable = true)



In [193]:
#create an additional feature merged column
test_data = assembler.transform(test_df)

In [194]:
#print the schema
test_data.printSchema()

root
 |-- gender: float (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- smoking_history: float (nullable = true)
 |-- bmi: double (nullable = true)
 |-- HbA1c_level: double (nullable = true)
 |-- blood_glucose_level: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- features: vector (nullable = true)



In [195]:
#use model to make predictions
results = model.transform(test_data)
results.printSchema()

root
 |-- gender: float (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- smoking_history: float (nullable = true)
 |-- bmi: double (nullable = true)
 |-- HbA1c_level: double (nullable = true)
 |-- blood_glucose_level: integer (nullable = true)
 |-- diabetes: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [196]:
#display the predictions
results.select('features','prediction').show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|[0.0,80.0,0.0,1.0...|       0.0|
|[1.0,28.0,0.0,0.0...|       0.0|
|[0.0,36.0,0.0,0.0...|       0.0|
|[1.0,76.0,1.0,1.0...|       0.0|
|(8,[1,5,6,7],[20....|       0.0|
|(8,[1,5,6,7],[44....|       0.0|
|[1.0,42.0,0.0,0.0...|       0.0|
|(8,[1,5,6,7],[32....|       0.0|
|(8,[1,5,6,7],[53....|       0.0|
|[0.0,54.0,0.0,0.0...|       0.0|
|[0.0,78.0,0.0,0.0...|       0.0|
|(8,[1,5,6,7],[67....|       0.0|
|[1.0,15.0,0.0,0.0...|       0.0|
|(8,[1,5,6,7],[42....|       0.0|
|[1.0,37.0,0.0,0.0...|       0.0|
|[1.0,40.0,0.0,0.0...|       0.0|
|(8,[1,5,6,7],[69....|       0.0|
|[0.0,72.0,0.0,1.0...|       0.0|
|[1.0,30.0,0.0,0.0...|       0.0|
|[1.0,67.0,0.0,1.0...|       1.0|
+--------------------+----------+
only showing top 20 rows

