In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
import pyspark.sql.functions as F

In [2]:
 spark = SparkSession.builder.getOrCreate()

In [3]:
df = spark.read.csv("/content/pima.csv", inferSchema=True,
                   header=True)

In [4]:
df.show()

+----+----+----+----+----+----+-----+---+-----+
|preg|plas|pres|skin|test|mass| pedi|age|class|
+----+----+----+----+----+----+-----+---+-----+
|   6| 148|  72|  35|   0|33.6|0.627| 50|    1|
|   1|  85|  66|  29|   0|26.6|0.351| 31|    0|
|   8| 183|  64|   0|   0|23.3|0.672| 32|    1|
|   1|  89|  66|  23|  94|28.1|0.167| 21|    0|
|   0| 137|  40|  35| 168|43.1|2.288| 33|    1|
|   5| 116|  74|   0|   0|25.6|0.201| 30|    0|
|   3|  78|  50|  32|  88|31.0|0.248| 26|    1|
|  10| 115|   0|   0|   0|35.3|0.134| 29|    0|
|   2| 197|  70|  45| 543|30.5|0.158| 53|    1|
|   8| 125|  96|   0|   0| 0.0|0.232| 54|    1|
|   4| 110|  92|   0|   0|37.6|0.191| 30|    0|
|  10| 168|  74|   0|   0|38.0|0.537| 34|    1|
|  10| 139|  80|   0|   0|27.1|1.441| 57|    0|
|   1| 189|  60|  23| 846|30.1|0.398| 59|    1|
|   5| 166|  72|  19| 175|25.8|0.587| 51|    1|
|   7| 100|   0|   0|   0|30.0|0.484| 32|    1|
|   0| 118|  84|  47| 230|45.8|0.551| 31|    1|
|   7| 107|  74|   0|   0|29.6|0.254| 31

In [5]:
df.select("class").show()

+-----+
|class|
+-----+
|    1|
|    0|
|    1|
|    0|
|    1|
|    0|
|    1|
|    0|
|    1|
|    1|
|    0|
|    1|
|    0|
|    1|
|    1|
|    1|
|    1|
|    1|
|    0|
|    1|
+-----+
only showing top 20 rows



In [6]:
df.count()

768

In [7]:
len(df.columns)

9

In [8]:
df.printSchema()

root
 |-- preg: integer (nullable = true)
 |-- plas: integer (nullable = true)
 |-- pres: integer (nullable = true)
 |-- skin: integer (nullable = true)
 |-- test: integer (nullable = true)
 |-- mass: double (nullable = true)
 |-- pedi: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- class: integer (nullable = true)



In [9]:
df.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|              preg|             plas|              pres|              skin|              test|              mass|              pedi|               age|             class|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+
|  count|               768|              768|               768|               768|               768|               768|               768|               768|               768|
|   mean|3.8450520833333335|     120.89453125|       69.10546875|20.536458333333332| 79.79947916666667|31.992578124999977|0.4718763020833327|33.240885416666664|0.3489583333333333|
| stddev|  3.36957806269887|31.97261819513622|19.355807170644777|15.952217567727642|115.244002351338

In [10]:
df.head(5)

[Row(preg=6, plas=148, pres=72, skin=35, test=0, mass=33.6, pedi=0.627, age=50, class=1),
 Row(preg=1, plas=85, pres=66, skin=29, test=0, mass=26.6, pedi=0.351, age=31, class=0),
 Row(preg=8, plas=183, pres=64, skin=0, test=0, mass=23.3, pedi=0.672, age=32, class=1),
 Row(preg=1, plas=89, pres=66, skin=23, test=94, mass=28.1, pedi=0.167, age=21, class=0),
 Row(preg=0, plas=137, pres=40, skin=35, test=168, mass=43.1, pedi=2.288, age=33, class=1)]

In [11]:
df.groupBy('class').count().show()

+-----+-----+
|class|count|
+-----+-----+
|    1|  268|
|    0|  500|
+-----+-----+



In [12]:
df.groupBy('age').count().show()

+---+-----+
|age|count|
+---+-----+
| 31|   24|
| 65|    3|
| 53|    5|
| 34|   14|
| 81|    1|
| 28|   35|
| 26|   33|
| 27|   32|
| 44|    8|
| 22|   72|
| 47|    6|
| 52|    8|
| 40|   13|
| 57|    5|
| 54|    6|
| 48|    5|
| 64|    1|
| 41|   22|
| 43|   13|
| 37|   19|
+---+-----+
only showing top 20 rows



In [13]:
df.groupBy('mass').count().show()

+----+-----+
|mass|count|
+----+-----+
|26.7|    1|
|37.1|    2|
|25.1|    3|
|45.3|    3|
|24.7|    5|
|32.3|    3|
| 0.0|   11|
|43.3|    5|
|26.4|    3|
|23.8|    2|
|24.9|    1|
|37.4|    3|
|36.2|    1|
|35.6|    2|
|47.9|    2|
|49.7|    1|
|31.1|    1|
|38.5|    6|
|26.6|    4|
|23.9|    2|
+----+-----+
only showing top 20 rows



In [14]:
from pyspark.ml.feature import VectorAssembler

In [15]:
df.columns

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [16]:
assembler= VectorAssembler(inputCols=['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age'],
                           outputCol='features')
assembler

VectorAssembler_46d7b44f7fdd

In [17]:
output= assembler.transform(df)
output

DataFrame[preg: int, plas: int, pres: int, skin: int, test: int, mass: double, pedi: double, age: int, class: int, features: vector]

In [18]:
output.select('features','class').show(5)

+--------------------+-----+
|            features|class|
+--------------------+-----+
|[6.0,148.0,72.0,3...|    1|
|[1.0,85.0,66.0,29...|    0|
|[8.0,183.0,64.0,0...|    1|
|[1.0,89.0,66.0,23...|    0|
|[0.0,137.0,40.0,3...|    1|
+--------------------+-----+
only showing top 5 rows



In [19]:
df.columns

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

In [20]:
model_df=output.select(['features','class'])

In [21]:
training_df,test_df=model_df.randomSplit([0.70,0.30])
print(training_df.count())

517


In [22]:
print(test_df.count())

251


In [23]:
from pyspark.ml.classification import RandomForestClassifier

In [24]:
rf_classifier=RandomForestClassifier(labelCol='class',numTrees=50).fit(training_df)

In [25]:
# training results
rf_predictions=rf_classifier.transform(test_df)
rf_predictions.show()

+--------------------+-----+--------------------+--------------------+----------+
|            features|class|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|(8,[0,1,6,7],[2.0...|    0|[48.6128640999314...|[0.97225728199862...|       0.0|
|(8,[0,1,6,7],[2.0...|    0|[48.5861321722206...|[0.97172264344441...|       0.0|
|(8,[0,1,6,7],[6.0...|    0|[41.7913135399993...|[0.83582627079998...|       0.0|
|(8,[1,5,6,7],[119...|    1|[36.1167331940541...|[0.72233466388108...|       0.0|
|(8,[1,5,6,7],[138...|    1|[20.1979125951689...|[0.40395825190337...|       1.0|
|(8,[1,5,6,7],[141...|    1|[22.6457042317468...|[0.45291408463493...|       1.0|
|(8,[1,5,6,7],[167...|    1|[13.0758487520297...|[0.26151697504059...|       1.0|
|[0.0,78.0,88.0,29...|    0|[44.4752438213714...|[0.88950487642742...|       0.0|
|[0.0,84.0,82.0,31...|    0|[44.2581430416616...|[0.88516286083323...|       0.0|
|[0.0,95.0,85.0,

In [26]:
rf_predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  181|
|       1.0|   70|
+----------+-----+



In [27]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [28]:
rf_accuracy=MulticlassClassificationEvaluator(labelCol='class',
                                              metricName='accuracy').evaluate(rf_predictions)
print(rf_accuracy)

0.7131474103585658


In [29]:
#precision
rf_precision=MulticlassClassificationEvaluator(labelCol='class',metricName='weightedPrecision').evaluate(rf_predictions)
print(rf_precision)

0.7071546489653069


In [30]:
# AUC
rf_auc=BinaryClassificationEvaluator(labelCol='class').evaluate(rf_predictions)
print(rf_auc)

0.8041666666666668


In [31]:
# feature importance
rf_classifier.featureImportances

SparseVector(8, {0: 0.0747, 1: 0.3596, 2: 0.0623, 3: 0.0447, 4: 0.0428, 5: 0.1735, 6: 0.0841, 7: 0.1584})