<a href="https://colab.research.google.com/github/garthajon/DataScienceColabRepo/blob/main/diabetes_binary_SVM_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook looks at the machine learning model called the Support Vector Machine (SVM). It is based on the following blog post: https://analytics4all.org/2020/05/06/python-support-vector-machine-svm/



In [20]:
# Install pinned versions of PySpark and Spark NLP (-q keeps pip's output quiet).
!pip install -q pyspark==3.3.0 spark-nlp==4.3.2

In [2]:
# Upgrade scikit-learn to the newest release pip can find (-U); no version is pinned.
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m47.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.4.2


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
#from sklearn.metrics import metrics
#from sklearn.metrics import confusion_matrix

In [22]:
# Importing Spark NLP is not enough on its own: a session has to be started too.
import sparknlp

# sparknlp.start() starts the Apache Spark session that Spark NLP runs on.
# Possible parameter: gpu=False
spark = sparknlp.start()

# Report the versions of Spark NLP and Apache Spark that are in use.
spark_nlp_version = sparknlp.version()
apache_spark_version = spark.version
print("Spark NLP version", spark_nlp_version)
print("Apache Spark version:", apache_spark_version)

# Leave the session as the cell's last expression so the notebook displays it.
spark

Spark NLP version 4.3.2
Apache Spark version: 3.3.0


Next, let’s look at the data set. This is the Pima Indians Diabetes data set. It is a publicly available data set consisting of 768 records. Columns are as follows:

Number of times pregnant.
Plasma glucose concentration at 2 hours in an oral glucose tolerance test.
Diastolic blood pressure (mm Hg).
Triceps skinfold thickness (mm).
2-Hour serum insulin (mu U/ml).
Body mass index (weight in kg/(height in m)^2).
Diabetes pedigree function.
Age (years).
Class variable (0 or 1).

In [23]:
from pyspark import SparkFiles

# The data has to be fetched through its "raw" URL. To find it, open the file
# in the online repository, e.g.
# https://raw.githubusercontent.com/garthajon/DataScienceColabRepo/main/pima_indians.csv
# and click the 'raw' button in the dataset preview.
url = "https://raw.githubusercontent.com/garthajon/DataScienceColabRepo/main/pima_indians.csv"

# Register the remote file with Spark, then look up where the local copy lives.
spark.sparkContext.addFile(url)
path = SparkFiles.get('pima_indians.csv')
print(path)

# Read the local copy as a comma-separated file with a header row and let
# Spark infer the column types.
local_uri = 'file:///' + path
df = spark.read.csv(local_uri, sep=',', header=True, inferSchema=True)

# Preview the first five records.
df.show(n=5)

/tmp/spark-ae6f0365-f523-422d-9709-bafc8ad78d82/userFiles-5a8d92c5-40df-40a0-ab96-d7b90f6d520a/pima_indians.csv
+-------------+---+-------+------------+---+----+-----+---+-----+
|Time_Pregnant|PGC|Dias_BP|Tri_SkinFold| SI| BMI|  DPF|Age|Class|
+-------------+---+-------+------------+---+----+-----+---+-----+
|            6|148|     72|          35|  0|33.6|0.627| 50|    1|
|            1| 85|     66|          29|  0|26.6|0.351| 31|    0|
|            8|183|     64|           0|  0|23.3|0.672| 32|    1|
|            1| 89|     66|          23| 94|28.1|0.167| 21|    0|
|            0|137|     40|          35|168|43.1|2.288| 33|    1|
+-------------+---+-------+------------+---+----+-----+---+-----+
only showing top 5 rows



In [24]:
import numpy as np

# The eight input columns and the label column. Names match the CSV header
# exactly (note 'Tri_SkinFold'), so the lookup does not depend on Spark's
# case-insensitive column resolution.
feature_columns = [
    'Time_Pregnant', 'PGC', 'Dias_BP', 'Tri_SkinFold', 'SI', 'BMI', 'DPF', 'Age',
]
label_column = 'Class'

# Pull features and label to the driver in a single collect(): one Spark job
# instead of two, and every label stays on the same row as its features by
# construction. The label is the last field of each collected row.
rows = df.select(*feature_columns, label_column).collect()
X = np.array([row[:-1] for row in rows])
y = np.array([row[-1] for row in rows])

# Hold out 33% of the records for testing.
# NOTE: no random_state is passed, so the split (and therefore the accuracy
# computed later) differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [25]:
# Create a support vector classifier; no arguments are given, so the
# estimator's own defaults apply.
model = SVC()

# Train it on the training split.
model.fit(X=X_train, y=y_train)

In [28]:
# Predict a class label for every row of the held-out feature matrix.
y_pred = model.predict(X=X_test)
# Uncomment the next line to display the predicted labels in the notebook.
#y_pred

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0])

In [36]:
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql.functions import monotonically_increasing_id, col,count,expr, sum as sql_sum

# Pair every prediction with its known label and count how many agree.
#
# y_test holds the known classes of the held-out rows and y_pred holds the
# model's predictions for those same rows, in the same order.

# Number the rows explicitly with enumerate(). monotonically_increasing_id()
# only guarantees unique, increasing ids - not consecutive ones - and the
# values depend on how each DataFrame is partitioned, so two separately built
# DataFrames are not guaranteed to receive matching ids.
df_predicted = spark.createDataFrame(
    [(int(label), position) for position, label in enumerate(y_pred)],
    ["Predicted", "index"],
)
df_actuals = spark.createDataFrame(
    [(int(label), position) for position, label in enumerate(y_test)],
    ["actuals", "index"],
)

# An inner join on the shared row number puts each actual next to its prediction.
modeloutput = df_actuals.join(df_predicted, on="index", how="inner")

# Totals of the two binary columns plus the number of test rows.
# Column names are spelled exactly as created above ("Predicted"), so the
# queries also work when Spark is configured to be case-sensitive.
result = modeloutput.select(
    sql_sum(col("Predicted")).alias("sum_predicted"),
    sql_sum(col("actuals")).alias("sum_actual"),
    count(col("actuals")).alias("count_all"),
)

# Flag each row with 1 when the prediction equals the actual class, else 0.
accuracy_df = modeloutput.withColumn(
    "accuracy", expr("CASE WHEN actuals = Predicted THEN 1 ELSE 0 END")
)

# Number of correct predictions next to the total number of test rows.
accuracy_summary = accuracy_df.select(
    sql_sum(col("accuracy")).alias("sum_accuracy"),
    count(col("actuals")).alias("count_all"),
)

# Show the result
accuracy_summary.show()

+------------+---------+
|sum_accuracy|count_all|
+------------+---------+
|         191|      254|
+------------+---------+

