### Attribute Information: (classes: edible=e, poisonous=p)

cap-shape: bell=b,conical=c,convex=x,flat=f,knobbed=k,sunken=s

cap-surface: fibrous=f,grooves=g,scaly=y,smooth=s

cap-color: brown=n,buff=b,cinnamon=c,gray=g,green=r,pink=p,purple=u,red=e,white=w,yellow=y

bruises: bruises=t,no=f

odor: almond=a,anise=l,creosote=c,fishy=y,foul=f,musty=m,none=n,pungent=p,spicy=s

gill-attachment: attached=a,descending=d,free=f,notched=n

gill-spacing: close=c,crowded=w,distant=d

gill-size: broad=b,narrow=n

gill-color: black=k,brown=n,buff=b,chocolate=h,gray=g,green=r,orange=o,pink=p,purple=u,red=e,white=w,yellow=y

stalk-shape: enlarging=e,tapering=t

stalk-root: bulbous=b,club=c,cup=u,equal=e,rhizomorphs=z,rooted=r,missing=?

stalk-surface-above-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-surface-below-ring: fibrous=f,scaly=y,silky=k,smooth=s

stalk-color-above-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

stalk-color-below-ring: brown=n,buff=b,cinnamon=c,gray=g,orange=o,pink=p,red=e,white=w,yellow=y

veil-type: partial=p,universal=u

veil-color: brown=n,orange=o,white=w,yellow=y

ring-number: none=n,one=o,two=t

ring-type: cobwebby=c,evanescent=e,flaring=f,large=l,none=n,pendant=p,sheathing=s,zone=z

spore-print-color: black=k,brown=n,buff=b,chocolate=h,green=r,orange=o,purple=u,white=w,yellow=y

population: abundant=a,clustered=c,numerous=n,scattered=s,several=v,solitary=y

habitat: grasses=g,leaves=l,meadows=m,paths=p,urban=u,waste=w,woods=d

In [1]:
import pandas as pd
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import (
    DecisionTreeClassificationModel,
    DecisionTreeClassifier,
    GBTClassificationModel,
    GBTClassifier,
    LinearSVC,
    LinearSVCModel,
    LogisticRegression,
    LogisticRegressionModel,
    RandomForestClassificationModel,
    RandomForestClassifier,
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession


In [3]:
# Create (or reuse) the Spark session used by every cell below, running on a
# "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Mushrooms_Classification")
    .getOrCreate()
)

In [3]:
# Bare expression: evaluates to the session object created above so the
# notebook can show it as the cell result.
spark

In [4]:
# Read the dataset with pandas purely to inspect its column labels.
df_pandas = pd.read_csv(filepath_or_buffer="mushrooms.csv")
col = df_pandas.columns
# Last expression of the cell: displays the column index.
col

Index(['class', 'cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat'],
      dtype='object')

## Loading data

In [5]:
# Load the same CSV as a Spark DataFrame: the first row supplies the column
# names and column types are inferred from the data.
df_pyspark = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("mushrooms.csv")
)

In [6]:
# Preview the first five rows of the raw data.
df_pyspark.show(n=5)

+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|class|cap-shape|cap-surface|cap-color|bruises|odor|gill-attachment|gill-spacing|gill-size|gill-color|stalk-shape|stalk-root|stalk-surface-above-ring|stalk-surface-below-ring|stalk-color-above-ring|stalk-color-below-ring|veil-type|veil-color|ring-number|ring-type|spore-print-color|population|habitat|
+-----+---------+-----------+---------+-------+----+---------------+------------+---------+----------+-----------+----------+------------------------+------------------------+----------------------+----------------------+---------+----------+-----------+---------+-----------------+----------+-------+
|    p|        x|          s|        n|      t|   p|              f|           c|        n|   

In [7]:
# Print the column names, types and nullability of the loaded DataFrame.
df_pyspark.printSchema()

root
 |-- class: string (nullable = true)
 |-- cap-shape: string (nullable = true)
 |-- cap-surface: string (nullable = true)
 |-- cap-color: string (nullable = true)
 |-- bruises: string (nullable = true)
 |-- odor: string (nullable = true)
 |-- gill-attachment: string (nullable = true)
 |-- gill-spacing: string (nullable = true)
 |-- gill-size: string (nullable = true)
 |-- gill-color: string (nullable = true)
 |-- stalk-shape: string (nullable = true)
 |-- stalk-root: string (nullable = true)
 |-- stalk-surface-above-ring: string (nullable = true)
 |-- stalk-surface-below-ring: string (nullable = true)
 |-- stalk-color-above-ring: string (nullable = true)
 |-- stalk-color-below-ring: string (nullable = true)
 |-- veil-type: string (nullable = true)
 |-- veil-color: string (nullable = true)
 |-- ring-number: string (nullable = true)
 |-- ring-type: string (nullable = true)
 |-- spore-print-color: string (nullable = true)
 |-- population: string (nullable = true)
 |-- habitat: string 

## Preprocessing Data

In [8]:
# Original (English, hyphenated) column name -> French snake_case name.
# "population" and "habitat" map to themselves, i.e. they are left unchanged.
FRENCH_COLUMN_NAMES = {
    "class": "classe",
    "cap-shape": "forme_du_chapeau",
    "cap-surface": "surface_du_chapeau",
    "cap-color": "couleur_du_chapeau",
    "bruises": "meurtrissures",
    "odor": "odeur",
    "gill-attachment": "attache_des_lamelles",
    "gill-spacing": "espacement_des_lamelles",
    "gill-size": "taille_des_lamelles",
    "gill-color": "couleur_des_lamelles",
    "stalk-shape": "forme_du_pied",
    "stalk-root": "racine_du_pied",
    "stalk-surface-above-ring": "surface_du_pied_au_dessus_de_lanneau",
    "stalk-surface-below-ring": "surface_du_pied_en_dessous_de_lanneau",
    "stalk-color-above-ring": "couleur_du_pied_au_dessus_de_lanneau",
    "stalk-color-below-ring": "couleur_du_pied_en_dessous_de_lanneau",
    "veil-type": "type_de_voile",
    "veil-color": "couleur_du_voile",
    "ring-number": "nombre_danneaux",
    "ring-type": "type_danneau",
    "spore-print-color": "couleur_de_lempreinte_des_spores",
    "population": "population",
    "habitat": "habitat",
}

# Apply the renames one by one, in the order listed above, starting from the
# raw DataFrame.
df = df_pyspark
for original_name, french_name in FRENCH_COLUMN_NAMES.items():
    df = df.withColumnRenamed(original_name, french_name)

24/10/30 11:43:18 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [None]:
# Preview the first five rows after the columns were renamed.
df.show(n=5)

## Training Model

In [9]:
# Liste des colonnes à indexer (toutes les colonnes du DataFrame)
columns = df.columns

# Indexer la colonne "classe" pour la transformer en étiquette numérique
label_indexer = StringIndexer(inputCol="classe", outputCol="classe_indexed")

# Créer les indexeurs pour chaque colonne sauf la colonne "classe" qui est la colonne de label
indexers = [StringIndexer(inputCol=col, outputCol=col + "_indexed", handleInvalid="keep") for col in columns[1:]]

# Assembler toutes les colonnes indexées en une seule colonne de caractéristiques
assembler = VectorAssembler(inputCols=[col + "_indexed" for col in columns[1:]], outputCol="features")

# Définir le modèle de régression logistique
lr = LogisticRegression(featuresCol="features", labelCol="classe_indexed", maxIter=10)

# Créer le pipeline avec les étapes de transformation et le modèle
pipeline = Pipeline(stages=[label_indexer] + indexers + [assembler, lr])

# Diviser les données en ensembles d'entraînement et de test
train, test = df.randomSplit([0.8, 0.2], seed=17)

# Entraîner le pipeline
pipeline_model = pipeline.fit(train)

# Sauvegarder le pipeline complet
pipeline_model.save("models/logisticRegressionPipeline")

24/10/30 11:43:24 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/10/30 11:43:24 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


In [10]:
# Faire des prédictions sur l'ensemble de test
predictions = pipeline_model.transform(test)

# Afficher les prédictions
predictions.select("classe_indexed", "prediction").show(5)

+--------------+----------+
|classe_indexed|prediction|
+--------------+----------+
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
+--------------+----------+
only showing top 5 rows



24/10/30 11:43:32 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


## Evaluate model : Logistic Regression

In [11]:
# A single evaluator is reused for both metrics; the metric name is supplied
# per call through the params override of evaluate().
evaluator_LG = BinaryClassificationEvaluator(
    labelCol="classe_indexed",
    rawPredictionCol="rawPrediction",
)
roc_LG = evaluator_LG.evaluate(
    predictions, {evaluator_LG.metricName: "areaUnderROC"}
)
# NOTE: despite its name, accuracy_LG holds the area under the PR curve.
accuracy_LG = evaluator_LG.evaluate(
    predictions, {evaluator_LG.metricName: "areaUnderPR"}
)

In [12]:
# Report both logistic-regression metrics. accuracy_LG is computed with
# metricName="areaUnderPR", so it is labelled as a PR score, not an accuracy.
print(f"ROC : {roc_LG}")
print(f"PR : {accuracy_LG}")

ROC : 0.9938532755185764
Accuracy : 0.9880963818824631


## Training and Evaluating the DecisionTree Model:

In [19]:
dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'classe')
decisiontree_train = dt.fit(train)
decisiontree_train.save("decisiontree")

In [20]:
load_decisiontree = DecisionTreeClassificationModel.load("decisiontree")
predictions = load_decisiontree.transform(test)

#### Evaluation

In [None]:
roc_DT = BinaryClassificationEvaluator(labelCol="classe", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_DT = roc_DT.evaluate(predictions)
accuracy_DT = BinaryClassificationEvaluator(labelCol="classe", rawPredictionCol="rawPrediction", metricName="areaUnderPR")
accuracy_DT = accuracy_DT.evaluate(predictions)

## Training and Evaluate RandomForest Model:

In [22]:
rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'classe')
randomForest_train = rf.fit(train)
randomForest_train.save("randomForest")

In [23]:
load_randomForest = RandomForestClassificationModel.load("randomForest")
predictions = load_randomForest.transform(test)

#### Eval

In [24]:
roc_RF = BinaryClassificationEvaluator(labelCol="classe", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_RF = roc_RF.evaluate(predictions)
accuracy_RF = BinaryClassificationEvaluator(labelCol="classe", rawPredictionCol="rawPrediction", metricName="areaUnderPR")
accuracy_RF = accuracy_RF.evaluate(predictions)

## Training and Evaluate Gradient-BoostTree Model

In [25]:
gb = GBTClassifier(featuresCol = 'features', labelCol = 'classe')
gbtModel_train = gb.fit(train)
gbtModel_train.save("gbtModel")

In [26]:
load_gbtModel = GBTClassificationModel.load("gbtModel")
predictions = load_gbtModel.transform(test)

In [27]:
roc_GB = BinaryClassificationEvaluator(labelCol="classe", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_GB = roc_GB.evaluate(predictions)
accuracy_GB = BinaryClassificationEvaluator(labelCol="classe", rawPredictionCol="rawPrediction", metricName="areaUnderPR")
accuracy_GB = accuracy_GB.evaluate(predictions)

## Training and Evaluate SVM Model:

In [29]:
sv = LinearSVC(featuresCol = 'features', labelCol = 'classe')
sv_train = sv.fit(train)
# sv_train.save("linearSVCModel")

In [30]:
loadmodel = LinearSVCModel.load("linearSVCModel")
predictions = loadmodel.transform(test)

In [31]:
roc_SV = BinaryClassificationEvaluator(labelCol="classe", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_SV = roc_SV.evaluate(predictions)
accuracy_SV = BinaryClassificationEvaluator(labelCol="classe", rawPredictionCol="rawPrediction", metricName="areaUnderPR")
accuracy_SV = accuracy_SV.evaluate(predictions)

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# One row per model: (name, area under ROC, area under PR). Keeping each
# model's scores on the same row prevents the two score columns from getting
# out of step with the model names (the RF and GBT PR scores used to be
# swapped). The accuracy_* variables hold areaUnderPR values.
scores_by_model = [
    ('Logistic Regression', roc_LG, accuracy_LG),
    ('Decision Tree', roc_DT, accuracy_DT),
    ('Random Forest', roc_RF, accuracy_RF),
    ('Gradient-Boosted Tree', roc_GB, accuracy_GB),
    ('Linear SVC', roc_SV, accuracy_SV),
]
model_scores = pd.DataFrame(
    scores_by_model,
    columns=['Model', 'ROC Score', 'PR Score'],
)
print(model_scores)

# Side-by-side bar charts, one per metric.
plt.figure(figsize=(12, 6))

# Plot ROC Scores
plt.subplot(1, 2, 1)
sns.barplot(x='Model', y='ROC Score', data=model_scores)
plt.title('ROC Scores')
plt.ylabel('ROC Score')
plt.xlabel('Model')
plt.ylim(0.97, 1.0)  # Narrow y-range to make small differences visible

# Plot PR Scores
plt.subplot(1, 2, 2)
sns.barplot(x='Model', y='PR Score', data=model_scores)
plt.title('PR Scores')
plt.ylabel('PR Score')
plt.xlabel('Model')
plt.ylim(0.97, 1.0)  # Narrow y-range to make small differences visible

plt.tight_layout()
plt.show()

### ROC Score (Receiver Operating Characteristic) :

Le score ROC, ou AUC (Area Under the Curve), mesure la capacité d'un modèle à distinguer entre les classes positives et négatives.

Un score de 1.0 signifie une séparation parfaite des classes (aucune erreur de classification), tandis qu'un score de 0.5 signifie que le modèle ne fait pas mieux qu'un tirage aléatoire.

Ici, tous les modèles ont un ROC élevé (proches de 1.0), indiquant qu’ils séparent très bien les classes.

### PR Score (Precision-Recall) :

Le score PR, ou aire sous la courbe de précision-rappel, est souvent plus informatif que le ROC dans les cas de déséquilibre de classes, car il se concentre sur la précision (proportion de vrais positifs parmi les positifs prédits) et le rappel (proportion des positifs réels effectivement détectés).

Un PR de 1.0 indique également une précision et un rappel parfaits.

# Conclusion:
Random Forest et Gradient-Boosted Tree offrent les meilleures performances en termes de ROC et PR scores.

Si la simplicité et la rapidité sont des priorités, Logistic Regression ou Linear SVC peuvent être préférables malgré des scores légèrement inférieurs.

Random Forest et Gradient-Boosted Tree sont les modèles de choix si les performances maximales sont l'objectif, bien que Gradient-Boosted Tree puisse offrir une meilleure généralisation.