In [1]:
import findspark 
findspark.init("/Users/valentinaporcu/spark/spark-2.4.1-bin-hadoop2.7")
import pyspark 
from pyspark.sql import DataFrameNaFunctions 
from pyspark.sql.functions import lit 
from pyspark.ml.feature import StringIndexer  
from pyspark.ml import Pipeline 
from pyspark.sql import SparkSession
from pyspark.sql import functions
import pandas as pd
import numpy as np

In [2]:
spark = SparkSession.builder.appName("lsvc").getOrCreate()

In [3]:
df = spark.read.csv("/Users/valentinaporcu/Dropbox/topic/12.\ Guida\ ai\ Big\ Data\ con\ Python/codice\ -\ guida\ ai\ big\ data\ con\ Python/Sezione\ 3/pid.csv", 
                     sep = ',', inferSchema=True, header=True)

In [4]:
df.head()

Row(_c0=1, pregnant=6, glucose=148, pressure=72, triceps=35, insulin=0, mass=33.6, pedigree=0.627, age=50, diabetes='pos')

In [5]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[_c0: int, pregnant: int, glucose: int, pressure: int, triceps: int, insulin: int, mass: double, pedigree: double, age: int, diabetes: string]>

In [6]:
df.columns

['_c0',
 'pregnant',
 'glucose',
 'pressure',
 'triceps',
 'insulin',
 'mass',
 'pedigree',
 'age',
 'diabetes']

In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [8]:
assembler = VectorAssembler(inputCols = ['pregnant',
 'glucose',
 'pressure',
 'mass'], outputCol = "features")

In [9]:
output = assembler.transform(df)

In [10]:
output.show()

+---+--------+-------+--------+-------+-------+----+--------+---+--------+--------------------+
|_c0|pregnant|glucose|pressure|triceps|insulin|mass|pedigree|age|diabetes|            features|
+---+--------+-------+--------+-------+-------+----+--------+---+--------+--------------------+
|  1|       6|    148|      72|     35|      0|33.6|   0.627| 50|     pos|[6.0,148.0,72.0,3...|
|  2|       1|     85|      66|     29|      0|26.6|   0.351| 31|     neg|[1.0,85.0,66.0,26.6]|
|  3|       8|    183|      64|      0|      0|23.3|   0.672| 32|     pos|[8.0,183.0,64.0,2...|
|  4|       1|     89|      66|     23|     94|28.1|   0.167| 21|     neg|[1.0,89.0,66.0,28.1]|
|  5|       0|    137|      40|     35|    168|43.1|   2.288| 33|     pos|[0.0,137.0,40.0,4...|
|  6|       5|    116|      74|      0|      0|25.6|   0.201| 30|     neg|[5.0,116.0,74.0,2...|
|  7|       3|     78|      50|     32|     88|31.0|   0.248| 26|     pos|[3.0,78.0,50.0,31.0]|
|  8|      10|    115|       0|      0| 

In [11]:
indexer = StringIndexer(inputCol="diabetes", outputCol="diabetesIndex")
indexed = indexer.fit(output).transform(output)

In [12]:
indexed.show()

+---+--------+-------+--------+-------+-------+----+--------+---+--------+--------------------+-------------+
|_c0|pregnant|glucose|pressure|triceps|insulin|mass|pedigree|age|diabetes|            features|diabetesIndex|
+---+--------+-------+--------+-------+-------+----+--------+---+--------+--------------------+-------------+
|  1|       6|    148|      72|     35|      0|33.6|   0.627| 50|     pos|[6.0,148.0,72.0,3...|          1.0|
|  2|       1|     85|      66|     29|      0|26.6|   0.351| 31|     neg|[1.0,85.0,66.0,26.6]|          0.0|
|  3|       8|    183|      64|      0|      0|23.3|   0.672| 32|     pos|[8.0,183.0,64.0,2...|          1.0|
|  4|       1|     89|      66|     23|     94|28.1|   0.167| 21|     neg|[1.0,89.0,66.0,28.1]|          0.0|
|  5|       0|    137|      40|     35|    168|43.1|   2.288| 33|     pos|[0.0,137.0,40.0,4...|          1.0|
|  6|       5|    116|      74|      0|      0|25.6|   0.201| 30|     neg|[5.0,116.0,74.0,2...|          0.0|
|  7|     

In [13]:
transformed_df = indexed.select('features','diabetesIndex')

In [14]:
train, test = transformed_df.randomSplit([0.7,0.3])

In [15]:
train.show()

+--------------------+-------------+
|            features|diabetesIndex|
+--------------------+-------------+
|      (4,[1],[94.0])|          0.0|
|[0.0,57.0,60.0,21.7]|          0.0|
|[0.0,67.0,76.0,45.3]|          0.0|
| [0.0,73.0,0.0,21.1]|          0.0|
|[0.0,78.0,88.0,36.9]|          0.0|
|[0.0,84.0,64.0,35.8]|          0.0|
|[0.0,84.0,82.0,38.2]|          0.0|
|[0.0,86.0,68.0,35.8]|          0.0|
|[0.0,91.0,68.0,39.9]|          0.0|
|[0.0,91.0,80.0,32.4]|          0.0|
|[0.0,93.0,60.0,28.7]|          0.0|
|[0.0,93.0,60.0,35.3]|          0.0|
|[0.0,93.0,100.0,4...|          0.0|
|[0.0,94.0,70.0,43.5]|          0.0|
|[0.0,95.0,64.0,44.6]|          0.0|
|[0.0,97.0,64.0,36.8]|          0.0|
|[0.0,98.0,82.0,25.2]|          0.0|
| [0.0,99.0,0.0,25.0]|          0.0|
|[0.0,100.0,70.0,3...|          0.0|
|[0.0,100.0,88.0,4...|          0.0|
+--------------------+-------------+
only showing top 20 rows



In [16]:
from pyspark.ml.classification import LinearSVC

In [17]:
lsvc = LinearSVC(labelCol='diabetesIndex', maxIter=10, regParam=0.1)

In [18]:
lsvc_model = lsvc.fit(train)

In [19]:
lsvc_predictions = lsvc_model.transform(test)

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [21]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction', 
                                          labelCol='diabetesIndex')

In [22]:
lsvc_acc = evaluator.evaluate(lsvc_predictions)

In [23]:
lsvc_predictions.show()

+--------------------+-------------+--------------------+----------+
|            features|diabetesIndex|       rawPrediction|prediction|
+--------------------+-------------+--------------------+----------+
|[0.0,74.0,52.0,27.8]|          0.0|[2.52095542077943...|       0.0|
|[0.0,95.0,80.0,36.5]|          0.0|[1.60473301712541...|       0.0|
|[0.0,95.0,85.0,37.4]|          1.0|[1.59643358003898...|       0.0|
|[0.0,102.0,75.0,0.0]|          0.0|[3.83210706801022...|       0.0|
|[0.0,104.0,64.0,2...|          0.0|[1.76201542740329...|       0.0|
|[0.0,104.0,76.0,1...|          0.0|[2.52981872510546...|       0.0|
|[0.0,105.0,64.0,4...|          0.0|[0.79892925023075...|       0.0|
|[0.0,105.0,84.0,2...|          1.0|[1.93777453678167...|       0.0|
|[0.0,106.0,70.0,3...|          0.0|[0.97612133163909...|       0.0|
|[0.0,107.0,62.0,3...|          1.0|[1.05253332970850...|       0.0|
|[0.0,107.0,76.0,4...|          0.0|[0.60817711774667...|       0.0|
|[0.0,111.0,65.0,2...|          0.

In [24]:
lsvc_acc

0.7495124113475178