In [22]:
import sys

# Show which Python interpreter this kernel is running on.
interpreter_path = sys.executable
print(interpreter_path)

/usr/bin/python3


### Import des bibliothèques

In [23]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [24]:
# Initialise Spark with the Cassandra connector.
# Connection settings are read from environment variables so that host and
# credentials do not have to be hard-coded in the notebook. The fallback
# values are the ones previously written inline, so behaviour is unchanged
# when no variable is set.
import os

spark = (
    SparkSession.builder
    .appName("VehicleClusterClassification")
    .config("spark.cassandra.connection.host", os.environ.get("CASSANDRA_HOST", "172.18.0.2"))
    .config("spark.cassandra.connection.port", os.environ.get("CASSANDRA_PORT", "9042"))
    .config("spark.cassandra.auth.username", os.environ.get("CASSANDRA_USERNAME", "cassandra"))
    .config("spark.cassandra.auth.password", os.environ.get("CASSANDRA_PASSWORD", "cassandra"))
    # Connector artifact resolved by Spark when the session is created.
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.2.0")
    .getOrCreate()
)


### Chargement des données 

In [25]:
# Load the source tables from Cassandra.
def _read_cassandra_table(table_name):
    """Read one table of the `concessionnaire` keyspace through the Cassandra connector."""
    reader = spark.read.format("org.apache.spark.sql.cassandra")
    return reader.options(table=table_name, keyspace="concessionnaire").load()


immatriculations = _read_cassandra_table("immatriculations_clusters")
clients = _read_cassandra_table("clients")


25/01/19 13:05:56 WARN PlainTextAuthProviderBase: [] /172.18.0.2:9042 did not send an authentication challenge; This is suspicious because the driver expects authentication
25/01/19 13:05:56 WARN PlainTextAuthProviderBase: [] /172.18.0.2:9042 did not send an authentication challenge; This is suspicious because the driver expects authentication


### Prétraitement des données

In [26]:
# Merge clients with their vehicle registrations on the registration number.
clients_immatriculations = clients.join(
    other=immatriculations,
    on="immatriculation",
    how="left",
)

In [27]:
# Keep only new vehicles, i.e. rows whose `occasion` flag is false.
is_new_vehicle = clients_immatriculations["occasion"] == False
clients_immatriculations = clients_immatriculations.filter(is_new_vehicle)


In [28]:
# Preview the first rows of the merged, filtered data.
clients_immatriculations.show(n=3)

25/01/19 13:05:57 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/19 13:05:57 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance

+---------------+------------+---+----------------+----+------------------+----+-------+--------+------+--------+--------+-------------+--------+----------+--------------------+---------+
|immatriculation|2eme voiture|age|nbenfantsacharge|sexe|situationfamiliale|taux|couleur|longueur|marque|nbplaces|nbportes|          nom|occasion|prediction|                prix|puissance|
+---------------+------------+---+----------------+----+------------------+----+-------+--------+------+--------+--------+-------------+--------+----------+--------------------+---------+
|       38 BE 37|       false| 71|               0|   M|         En Couple| 855|  blanc|  longue|  Saab|       5|       5|     9.3 1.8T|   false|         0|38600.00000000000...|      150|
|     1609 WD 75|        true| 38|               0|   M|         En Couple| 806|  blanc|  courte|  Audi|       5|       5|       A2 1.4|   false|         0|18310.00000000000...|       75|
|     2462 AR 34|       false| 50|               1|   M|    

                                                                                

### Classification

#### Random forests

In [29]:
# Encode categorical variables as numeric indices.
# `StringIndexer` is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.
categorical_cols = ["situationfamiliale"]
index_cols = [f"{col_name}_index" for col_name in categorical_cols]

# This cell reassigns `clients_immatriculations`, so on a second run the index
# columns would already be present and StringIndexer would reject them as
# existing output columns. Dropping them first keeps the cell re-runnable;
# `drop` is a no-op for columns that are not in the schema.
clients_immatriculations = clients_immatriculations.drop(*index_cols)

indexers = [
    StringIndexer(inputCol=col_name, outputCol=index_col).fit(clients_immatriculations)
    for col_name, index_col in zip(categorical_cols, index_cols)
]
for indexer in indexers:
    clients_immatriculations = indexer.transform(clients_immatriculations)

25/01/19 13:06:01 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/19 13:06:01 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
                                                                                

In [30]:
# Step 1: assemble the feature columns into a single vector column.
features = [
    "age",
    "taux",
    "situationfamiliale_index",
    "nbenfantsacharge",
    "2eme voiture",
]
assembler = VectorAssembler(outputCol="features", inputCols=features)
data_with_features = assembler.transform(dataset=clients_immatriculations)


In [31]:

# Step 2: rename the `prediction` column to `label` so it can serve as the
# classifier's target column.
data_with_features = data_with_features.withColumnRenamed(
    existing="prediction",
    new="label",
)

In [32]:
# Preview the data with the assembled feature vector and the label column.
data_with_features.show(n=3)

25/01/19 13:06:04 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/19 13:06:04 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
                                                                                

+---------------+------------+---+----------------+----+------------------+----+-------+--------+------+--------+--------+-------------+--------+-----+--------------------+---------+------------------------+--------------------+
|immatriculation|2eme voiture|age|nbenfantsacharge|sexe|situationfamiliale|taux|couleur|longueur|marque|nbplaces|nbportes|          nom|occasion|label|                prix|puissance|situationfamiliale_index|            features|
+---------------+------------+---+----------------+----+------------------+----+-------+--------+------+--------+--------+-------------+--------+-----+--------------------+---------+------------------------+--------------------+
|     6882 WL 51|       false| 18|               2|   F|         En Couple|1129|  blanc|  longue|Jaguar|       5|       5|X-Type 2.5 V6|   false|    0|37100.00000000000...|      197|                     0.0|[18.0,1129.0,0.0,...|
|     9806 ZK 39|       false| 52|               0|   F|       Célibataire|1150|  bl

In [33]:
# Step 3: keep only the columns the model needs.
model_columns = ["features", "label"]
final_data = data_with_features.select(model_columns)

In [34]:
# Step 4: split the data into training (80%) and test (20%) sets.
splits = final_data.randomSplit(weights=[0.8, 0.2], seed=42)
train_data, test_data = splits

In [35]:
# Preview the first rows of the training set.
train_data.show(n=3)

25/01/19 13:06:06 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/19 13:06:06 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
                                                                                

+--------------------+-----+
|            features|label|
+--------------------+-----+
|(5,[0,1],[18.0,54...|    0|
|(5,[0,1],[18.0,54...|    0|
|(5,[0,1],[18.0,54...|    0|
+--------------------+-----+
only showing top 3 rows



In [36]:
# Step 5: configure the Random Forest model.
# The seed is set explicitly (same value as the train/test split) so that
# training is reproducible from one kernel session to the next instead of
# depending on the estimator's implicit default seed.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    predictionCol="rf_prediction",
    seed=42,
)

In [37]:
# Step 6: train the model on the training set.
rf_model = rf.fit(dataset=train_data)

25/01/19 13:06:08 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/19 13:06:08 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/19 13:06:10 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/19 13:06:10 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/19 13:06:12 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/19 13:06:12 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/19 1

In [38]:
# Step 7: predict on the held-out test set.
predictions = rf_model.transform(dataset=test_data)

In [39]:
# Step 8: evaluate the model by its accuracy on the test predictions.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="rf_prediction",
    labelCol="label",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy of the Random Forest model: {accuracy:.2f}")


25/01/19 13:06:20 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/19 13:06:20 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
                                                                                

Accuracy of the Random Forest model: 0.86


In [41]:
import os

# Directory where the model will be saved.
model_dir = "models/random_forest_model"

# Make sure the parent directory exists; `exist_ok=True` avoids the separate
# existence check and does not fail if the directory is already there.
os.makedirs(os.path.dirname(model_dir), exist_ok=True)

# Save the model. A plain `save()` raises when the target path already exists,
# which made this cell fail on every run after the first; going through the
# writer with `overwrite()` replaces the previously saved model instead.
rf_model.write().overwrite().save(model_dir)

print(f"Le modèle a été sauvegardé dans le répertoire : {model_dir}")


                                                                                

Le modèle a été sauvegardé dans le répertoire : models/random_forest_model
