In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.sql.functions import col
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [2]:
# Créer une SparkSession
spark = SparkSession.builder \
    .appName("DiabetesClassification") \
    .getOrCreate()

In [3]:
data = spark.read.csv("file:/content/diabetes.csv", header=True, inferSchema=True)
data.printSchema()
data.show(5)

root
 |-- PatientID: integer (nullable = true)
 |-- Pregnancies: integer (nullable = true)
 |-- PlasmaGlucose: integer (nullable = true)
 |-- DiastolicBloodPressure: integer (nullable = true)
 |-- TricepsThickness: integer (nullable = true)
 |-- SerumInsulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigree: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Diabetic: integer (nullable = true)

+---------+-----------+-------------+----------------------+----------------+------------+-----------+----------------+---+--------+
|PatientID|Pregnancies|PlasmaGlucose|DiastolicBloodPressure|TricepsThickness|SerumInsulin|        BMI|DiabetesPedigree|Age|Diabetic|
+---------+-----------+-------------+----------------------+----------------+------------+-----------+----------------+---+--------+
|  1354778|          0|          171|                    80|              34|          23|43.50972593|     1.213191354| 21|       0|
|  1147438|          

In [4]:
# Assembler les colonnes de caractéristiques
feature_columns = [col for col in data.columns if col != 'Diabetic']
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
data_prepared = assembler.transform(data)

In [5]:
# Diviser les données en ensembles d'entraînement et de test
train_data, test_data = data_prepared.randomSplit([0.8, 0.2])

In [6]:
# Entraîner un modèle de régression logistique
lr = LogisticRegression(featuresCol="features", labelCol="Diabetic")
lr_model = lr.fit(train_data)

In [7]:
# Faire des prédictions sur les données de test
predictions = lr_model.transform(test_data)
predictions.select("features", "Diabetic", "prediction").show(5)

+--------------------+--------+----------+
|            features|Diabetic|prediction|
+--------------------+--------+----------+
|[1000326.0,3.0,11...|       1|       1.0|
|[1000471.0,1.0,53...|       0|       0.0|
|[1000652.0,2.0,13...|       1|       0.0|
|[1001229.0,2.0,82...|       1|       0.0|
|[1001511.0,4.0,97...|       1|       1.0|
+--------------------+--------+----------+
only showing top 5 rows



In [8]:
# Evaluation a l'aide de la courve roc
evaluator = BinaryClassificationEvaluator(labelCol="Diabetic", rawPredictionCol="prediction", metricName="areaUnderROC")
areaUnderROC = evaluator.evaluate(predictions)
print(f"ROC: {areaUnderROC}")


ROC: 0.7511205248139322
