In [30]:
import findspark 
findspark.init("/Users/valentinaporcu/spark/spark-2.4.1-bin-hadoop2.7")
import pyspark 
from pyspark.sql import DataFrameNaFunctions 
from pyspark.sql.functions import lit 
from pyspark.ml.feature import StringIndexer  
from pyspark.ml import Pipeline 
from pyspark.sql import SparkSession
from pyspark.sql import functions
import pandas as pd
import numpy as np

In [31]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("regressione logistica") \
    .config("spark.executor.memory", "6gb") \
    .getOrCreate()

In [32]:
df = spark.read.csv('pid.csv', inferSchema = True, header = True)

In [33]:
df.show()

+---+--------+-------+--------+-------+-------+----+--------+---+--------+
|_c0|pregnant|glucose|pressure|triceps|insulin|mass|pedigree|age|diabetes|
+---+--------+-------+--------+-------+-------+----+--------+---+--------+
|  1|       6|    148|      72|     35|      0|33.6|   0.627| 50|     pos|
|  2|       1|     85|      66|     29|      0|26.6|   0.351| 31|     neg|
|  3|       8|    183|      64|      0|      0|23.3|   0.672| 32|     pos|
|  4|       1|     89|      66|     23|     94|28.1|   0.167| 21|     neg|
|  5|       0|    137|      40|     35|    168|43.1|   2.288| 33|     pos|
|  6|       5|    116|      74|      0|      0|25.6|   0.201| 30|     neg|
|  7|       3|     78|      50|     32|     88|31.0|   0.248| 26|     pos|
|  8|      10|    115|       0|      0|      0|35.3|   0.134| 29|     neg|
|  9|       2|    197|      70|     45|    543|30.5|   0.158| 53|     pos|
| 10|       8|    125|      96|      0|      0| 0.0|   0.232| 54|     pos|
| 11|       4|    110|   

In [34]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [35]:
indexer = StringIndexer(inputCol="diabetes", outputCol="diabetesIndex")
indexed = indexer.fit(df).transform(df)

In [36]:
indexed.show()

+---+--------+-------+--------+-------+-------+----+--------+---+--------+-------------+
|_c0|pregnant|glucose|pressure|triceps|insulin|mass|pedigree|age|diabetes|diabetesIndex|
+---+--------+-------+--------+-------+-------+----+--------+---+--------+-------------+
|  1|       6|    148|      72|     35|      0|33.6|   0.627| 50|     pos|          1.0|
|  2|       1|     85|      66|     29|      0|26.6|   0.351| 31|     neg|          0.0|
|  3|       8|    183|      64|      0|      0|23.3|   0.672| 32|     pos|          1.0|
|  4|       1|     89|      66|     23|     94|28.1|   0.167| 21|     neg|          0.0|
|  5|       0|    137|      40|     35|    168|43.1|   2.288| 33|     pos|          1.0|
|  6|       5|    116|      74|      0|      0|25.6|   0.201| 30|     neg|          0.0|
|  7|       3|     78|      50|     32|     88|31.0|   0.248| 26|     pos|          1.0|
|  8|      10|    115|       0|      0|      0|35.3|   0.134| 29|     neg|          0.0|
|  9|       2|    197

In [37]:
assembler = VectorAssembler(inputCols=['pregnant',
                                       'glucose', 'pressure',
                                       'triceps','insulin',
                                       'mass',
                                       'pedigree','age'],outputCol='features')

In [38]:
output = assembler.transform(indexed)

In [39]:
output.show()

+---+--------+-------+--------+-------+-------+----+--------+---+--------+-------------+--------------------+
|_c0|pregnant|glucose|pressure|triceps|insulin|mass|pedigree|age|diabetes|diabetesIndex|            features|
+---+--------+-------+--------+-------+-------+----+--------+---+--------+-------------+--------------------+
|  1|       6|    148|      72|     35|      0|33.6|   0.627| 50|     pos|          1.0|[6.0,148.0,72.0,3...|
|  2|       1|     85|      66|     29|      0|26.6|   0.351| 31|     neg|          0.0|[1.0,85.0,66.0,29...|
|  3|       8|    183|      64|      0|      0|23.3|   0.672| 32|     pos|          1.0|[8.0,183.0,64.0,0...|
|  4|       1|     89|      66|     23|     94|28.1|   0.167| 21|     neg|          0.0|[1.0,89.0,66.0,23...|
|  5|       0|    137|      40|     35|    168|43.1|   2.288| 33|     pos|          1.0|[0.0,137.0,40.0,3...|
|  6|       5|    116|      74|      0|      0|25.6|   0.201| 30|     neg|          0.0|[5.0,116.0,74.0,0...|
|  7|     

In [40]:
final_data = output.select('features', 'diabetesIndex')

In [41]:
train, test = final_data.randomSplit([0.7,0.3], seed = 12345)

In [43]:
train.show()

+--------------------+-------------+
|            features|diabetesIndex|
+--------------------+-------------+
|(8,[0,1,6,7],[2.0...|          0.0|
|(8,[0,1,6,7],[2.0...|          0.0|
|(8,[0,1,6,7],[3.0...|          0.0|
|(8,[0,1,6,7],[6.0...|          0.0|
|(8,[1,5,6,7],[73....|          0.0|
|(8,[1,5,6,7],[119...|          1.0|
|(8,[1,5,6,7],[131...|          1.0|
|(8,[1,5,6,7],[138...|          1.0|
|(8,[1,5,6,7],[141...|          1.0|
|(8,[1,5,6,7],[145...|          1.0|
|(8,[1,5,6,7],[167...|          1.0|
|[0.0,74.0,52.0,10...|          0.0|
|[0.0,78.0,88.0,29...|          0.0|
|[0.0,84.0,82.0,31...|          0.0|
|[0.0,86.0,68.0,32...|          0.0|
|[0.0,91.0,80.0,0....|          0.0|
|[0.0,93.0,60.0,0....|          0.0|
|[0.0,93.0,60.0,25...|          0.0|
|[0.0,93.0,100.0,3...|          0.0|
|[0.0,94.0,70.0,27...|          0.0|
+--------------------+-------------+
only showing top 20 rows



In [44]:
from pyspark.ml.classification import LogisticRegression

In [45]:
lr_model = LogisticRegression(labelCol='diabetesIndex')

In [46]:
fitted_model = lr_model.fit(train)

In [47]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [48]:
pred_and_labels = fitted_model.evaluate(test)

In [49]:
pred_and_labels.predictions.show()

+--------------------+-------------+--------------------+--------------------+----------+
|            features|diabetesIndex|       rawPrediction|         probability|prediction|
+--------------------+-------------+--------------------+--------------------+----------+
|(8,[0,1,6,7],[7.0...|          0.0|[2.87683389669159...|[0.94668930038016...|       0.0|
|(8,[0,1,6,7],[10....|          1.0|[2.12595185037399...|[0.89340009077817...|       0.0|
|(8,[1,5,6,7],[99....|          0.0|[1.68105803114878...|[0.84304458105399...|       0.0|
|(8,[1,5,6,7],[117...|          0.0|[-0.6658045790425...|[0.33943690131845...|       1.0|
|(8,[1,6,7],[94.0,...|          0.0|[4.09197832736382...|[0.98356835886160...|       0.0|
|[0.0,57.0,60.0,0....|          0.0|[3.29403668991684...|[0.96422366658309...|       0.0|
|[0.0,67.0,76.0,0....|          0.0|[1.98932326476334...|[0.87967152382576...|       0.0|
|[0.0,84.0,64.0,22...|          0.0|[2.16811174505128...|[0.89734916297012...|       0.0|
|[0.0,91.0

In [50]:
predictionAndLabels = pred_and_labels.predictions.select('diabetesIndex','prediction')

In [51]:
predictionAndLabels.show()

+-------------+----------+
|diabetesIndex|prediction|
+-------------+----------+
|          0.0|       0.0|
|          1.0|       0.0|
|          0.0|       0.0|
|          0.0|       1.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
|          1.0|       0.0|
|          0.0|       0.0|
|          0.0|       0.0|
+-------------+----------+
only showing top 20 rows



In [52]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', 
                                          labelCol='diabetesIndex')

In [53]:
acc = evaluator.evaluate(predictionAndLabels)

In [54]:
acc

0.7488644688644689

In [25]:
spark.stop()