In [1]:
!pwd  

/opt/spark/work-dir


In [2]:
!ls

data  random_forest.ipynb


In [3]:
import pyspark
print(pyspark.__version__)
# import py4j
# !pip install cassandra-driver

3.5.4


In [4]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from cassandra.cluster import Cluster
import pandas as pd
from cassandra.auth import PlainTextAuthProvider

### 1. Chargement des données brutes

In [19]:
from pyspark.sql import SparkSession

# Initialiser Spark
spark = SparkSession.builder \
    .appName("Data Cleaning with Spark") \
    .getOrCreate()

# Charger les données brutes
clients = spark.read.csv("data/raw/clients.csv", header=True, inferSchema=True)
catalogue = spark.read.csv("data/raw/catalogue.csv", header=True, inferSchema=True)
immatriculation = spark.read.csv("data/raw/immatriculations.csv", header=True, inferSchema=True)

# Afficher les schémas
clients.printSchema()
catalogue.printSchema()
immatriculation.printSchema()

root
 |-- age: string (nullable = true)
 |-- sexe: string (nullable = true)
 |-- taux: string (nullable = true)
 |-- situationFamiliale: string (nullable = true)
 |-- nbEnfantsAcharge: string (nullable = true)
 |-- 2eme voiture: string (nullable = true)
 |-- immatriculation: string (nullable = true)

root
 |-- marque: string (nullable = true)
 |-- nom: string (nullable = true)
 |-- puissance: integer (nullable = true)
 |-- longueur: string (nullable = true)
 |-- nbPlaces: integer (nullable = true)
 |-- nbPortes: integer (nullable = true)
 |-- couleur: string (nullable = true)
 |-- occasion: boolean (nullable = true)
 |-- prix: integer (nullable = true)

root
 |-- immatriculation: string (nullable = true)
 |-- marque: string (nullable = true)
 |-- nom: string (nullable = true)
 |-- puissance: integer (nullable = true)
 |-- longueur: string (nullable = true)
 |-- nbPlaces: integer (nullable = true)
 |-- nbPortes: integer (nullable = true)
 |-- couleur: string (nullable = true)
 |-- occas

In [20]:
clients.head()

Row(age='25', sexe='F', taux='159', situationFamiliale='En Couple', nbEnfantsAcharge='2', 2eme voiture='false', immatriculation='3467 SB 72')

### 2. Exploration des données (EDA) et nettoyage

In [5]:
# Se connecter à Cassandra
auth_provider = PlainTextAuthProvider(username='cassandra', password='cassandra')
cluster = Cluster(['172.18.0.2'], port=9042, auth_provider=auth_provider)
session = cluster.connect('concessionnaire')

In [14]:
# Charger les données depuis les fichiers CSV
clients = pd.read_csv("/opt/spark/work-dir/data/raw/clients.csv")
catalogue = pd.read_csv("/opt/spark/work-dir/data/raw/catalogue.csv")
immatriculation = pd.read_csv("/opt/spark/work-dir/data/raw/immatriculations.csv")

# Insérer les données dans Cassandra
def insert_data(session, table, data):
    for index, row in data.iterrows():
        # Encadrer les noms de colonnes avec des espaces
        columns = ', '.join([f'"{col}"' if ' ' in col else col for col in row.index])
        values = ', '.join(['%s'] * len(row))
        query = f"INSERT INTO {table} ({columns}) VALUES ({values})"
        session.execute(query, tuple(row))

# Insérer les données
insert_data(session, "clients", clients)
# insert_data(session, "catalogue", catalogue)
# insert_data(session, "immatriculation", immatriculation)

print("Données insérées avec succès dans Cassandra.")

InvalidRequest: Error from server: code=2200 [Invalid query] message="Invalid STRING constant (25) for "age" of type int"

In [20]:
# Initialize Spark session with Cassandra connector
spark = SparkSession.builder \
    .appName("RandomForestWithCassandra") \
    .config("spark.cassandra.connection.host", "172.18.0.2") \
    .config("spark.cassandra.connection.port", "9042") \
    .config("spark.cassandra.auth.username", "cassandra") \
    .config("spark.cassandra.auth.password", "cassandra") \
    .config("spark.jars.packages", "com.datastax.spark:spark-cassandra-connector_2.12:3.2.0") \
    .getOrCreate()

### Script to load data into Cassandra

In [24]:
# Se connecter à Cassandra
# Configuration de la connexion Cassandra
auth_provider = PlainTextAuthProvider(username='cassandra', password='cassandra')  
cluster = Cluster(['172.18.0.2'], port=9042, auth_provider=auth_provider)  
session = cluster.connect('concessionnaire')  



In [32]:
# Load Clients Data
clients = pd.read_csv("./data/raw/clients.csv")
for _, row in clients.iterrows():
    session.execute(
        """
        INSERT INTO clients (immatriculation, age, sexe, taux, situationFamiliale, nbEnfantsAcharge, "2eme voiture")
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        """,
        (row['immatriculation'], row['age'], row['sexe'], row['taux'], row['situationFamiliale'], row['nbEnfantsAcharge'], row['2eme voiture'])
    )

InvalidRequest: Error from server: code=2200 [Invalid query] message="Invalid STRING constant (25) for "age" of type int"

In [None]:


# Load Catalogue Data
catalogue = pd.read_csv("../data/processed/catalogue_processed.csv")
for _, row in catalogue.iterrows():
    session.execute(
        """
        INSERT INTO catalogue (marque, nom, puissance, longueur, nbPlaces, nbPortes, prix, occasion)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
        """,
        (row['marque'], row['nom'], row['puissance'], row['longueur'], row['nbPlaces'], row['nbPortes'], row['prix'], row['occasion'])
    )

# Load Immatriculations Data
immatriculations = pd.read_csv("../data/processed/immatriculations_processed.csv")
for _, row in immatriculations.iterrows():
    session.execute(
        """
        INSERT INTO immatriculations (immatriculation, marque, nom, puissance, longueur, nbPlaces, nbPortes, prix, occasion)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        """,
        (row['immatriculation'], row['marque'], row['nom'], row['puissance'], row['longueur'], row['nbPlaces'], row['nbPortes'], row['prix'], row['occasion'])
    )

print("Data loaded into Cassandra successfully!")

In [14]:
# Load data from Cassandra (exemple avec la table `clients`)
clients = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(table="clients", keyspace="concessionnaire") \
    .load()

# Show data schema and preview
clients.printSchema()
clients.show()

root
 |-- immatriculation: string (nullable = false)
 |-- 2eme voiture: boolean (nullable = true)
 |-- age: integer (nullable = true)
 |-- nbenfantsacharge: integer (nullable = true)
 |-- sexe: string (nullable = true)
 |-- situationfamiliale: string (nullable = true)
 |-- taux: integer (nullable = true)



25/01/17 20:55:34 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance


+---------------+------------+---+----------------+----+------------------+----+
|immatriculation|2eme voiture|age|nbenfantsacharge|sexe|situationfamiliale|taux|
+---------------+------------+---+----------------+----+------------------+----+
+---------------+------------+---+----------------+----+------------------+----+



In [15]:
# Load data from Cassandra (exemple avec la table `catalogue`)
catalogue = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(table="catalogue", keyspace="concessionnaire") \
    .load()

# Show data schema and preview
catalogue.printSchema()
catalogue.show()

root
 |-- marque: string (nullable = false)
 |-- nom: string (nullable = true)
 |-- couleur: string (nullable = true)
 |-- longueur: string (nullable = true)
 |-- nbplaces: integer (nullable = true)
 |-- nbportes: integer (nullable = true)
 |-- occasion: boolean (nullable = true)
 |-- prix: decimal(38,18) (nullable = true)
 |-- puissance: integer (nullable = true)



25/01/17 20:55:54 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance


+------+---+-------+--------+--------+--------+--------+----+---------+
|marque|nom|couleur|longueur|nbplaces|nbportes|occasion|prix|puissance|
+------+---+-------+--------+--------+--------+--------+----+---------+
+------+---+-------+--------+--------+--------+--------+----+---------+



In [18]:
# Load data from Cassandra (exemple avec la table `immatriculation`)
immatriculation = spark.read \
    .format("org.apache.spark.sql.cassandra") \
    .options(table="immatriculation", keyspace="concessionnaire") \
    .load()

# Show data schema and preview
immatriculation.printSchema()
immatriculation.show()

root
 |-- immatriculation: string (nullable = false)
 |-- couleur: string (nullable = true)
 |-- longueur: string (nullable = true)
 |-- marque: string (nullable = true)
 |-- nbplaces: integer (nullable = true)
 |-- nbportes: integer (nullable = true)
 |-- nom: string (nullable = true)
 |-- occasion: boolean (nullable = true)
 |-- prix: decimal(38,18) (nullable = true)
 |-- puissance: integer (nullable = true)



25/01/17 20:56:55 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance


+---------------+-------+--------+------+--------+--------+---+--------+----+---------+
|immatriculation|couleur|longueur|marque|nbplaces|nbportes|nom|occasion|prix|puissance|
+---------------+-------+--------+------+--------+--------+---+--------+----+---------+
+---------------+-------+--------+------+--------+--------+---+--------+----+---------+



In [5]:

# Prepare data
assembler = VectorAssembler(inputCols=["feature1", "feature2", "feature3"], outputCol="features")
data = assembler.transform(df).select("features", "label")

# Split data into training and testing sets
train_data, test_data = data.randomSplit([0.8, 0.2])


In [6]:

# Train Random Forest model
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)
model = rf.fit(train_data)


25/01/14 21:47:49 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/14 21:47:50 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/14 21:47:51 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/14 21:47:52 WARN DecisionTreeMetadata: DecisionTree reducing maxBins from 32 to 2 (= number of training instances)
25/01/14 21:47:54 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance
25/01/14 21:47:54 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance


In [7]:

# Evaluate the model
predictions = model.transform(test_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)

print(f"Model Accuracy: {accuracy}")


25/01/14 21:47:58 WARN V2ScanPartitioningAndOrdering: Spark ignores the partitioning CassandraPartitioning. Please use KeyGroupedPartitioning for better performance


Model Accuracy: nan


In [16]:

# Stop Spark session
# spark.stop()
