In [None]:
!pip install pyspark



In [None]:
import pandas as pd

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, StandardScaler, Imputer, VectorAssembler
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession

# Téléchargement des données à partir du repo git
!wget "https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv"

# Notebook configuration: input file, label column and feature groups.
DATA_PATH = "titanic.csv"
TARGET = "Survived"

# Columns that go through string indexing + one-hot encoding.
CATEGORICAL_FEATURES = ["Sex", "Cabin", "Embarked", "Pclass"]

# Columns that go through mean imputation + standard scaling.
NUMERICAL_FEATURES = ["Fare", "Age"]

--2024-11-27 21:29:29--  https://raw.githubusercontent.com/datasciencedojo/datasets/refs/heads/master/titanic.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 60302 (59K) [text/plain]
Saving to: ‘titanic.csv.3’


2024-11-27 21:29:29 (5.33 MB/s) - ‘titanic.csv.3’ saved [60302/60302]



In [None]:
# Obtain a Spark session through the builder's getOrCreate().
spark = (
    SparkSession.builder
    .getOrCreate()
)

# 1 Load Data

In [None]:
# Read the comma-separated file: first line is the header, column types are inferred.
df = spark.read.csv(
    path=DATA_PATH,
    sep=",",
    header=True,
    inferSchema=True,
)

In [None]:
# Show a five-row preview, converted to pandas for rich display.
(
    df
    .limit(5)
    .toPandas()
)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# 2 Split Data

In [None]:
# Split the data 80/20 into train and test sets, with a fixed seed.
split_weights = [0.8, 0.2]
train_df, test_df = df.randomSplit(split_weights, seed=42)

# 3 Build Pipeline

In [None]:
# Step 1: categorical features -> string indexing, then one-hot encoding.
indexed_cols = [f"{name}_indexed" for name in CATEGORICAL_FEATURES]
ohe_cols = [f"{name}_ohe" for name in CATEGORICAL_FEATURES]

# Each categorical column X is indexed into X_indexed.
string_indexer = StringIndexer(
    inputCols=CATEGORICAL_FEATURES,
    outputCols=indexed_cols,
    handleInvalid="keep",
)

# Each X_indexed column is encoded into X_ohe.
ohe_encoder = OneHotEncoder(
    inputCols=indexed_cols,
    outputCols=ohe_cols,
    handleInvalid="error",
    dropLast=True,
)

In [None]:
# Step 2: numerical features -> mean imputation, vector assembly, standard scaling.
imputed_cols = [f"{name}_imputed" for name in NUMERICAL_FEATURES]

# Each numerical column X is imputed into X_imputed using the "mean" strategy.
imputer = Imputer(
    strategy="mean",
    inputCols=NUMERICAL_FEATURES,
    outputCols=imputed_cols,
)

# The scaler takes a single input column, so the imputed columns are first
# packed into one vector column.
imputed_num_features_vector = VectorAssembler(
    inputCols=imputed_cols,
    outputCol="imputed_num_features",
)

# Scale the packed vector with both centering and unit-std scaling enabled.
scaler = StandardScaler(
    inputCol="imputed_num_features",
    outputCol="scaled_num_features",
    withMean=True,
    withStd=True,
)

In [None]:
# Step 3: combine the one-hot columns and the scaled numerical vector
# (appended last) into the single "features" column read by the model.
assembler_inputs = [f"{name}_ohe" for name in CATEGORICAL_FEATURES]
assembler_inputs.append("scaled_num_features")

assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

In [None]:
# Step 4: model definition.
# The forest reads the assembled "features" vector, learns the TARGET label and
# writes its class prediction to "prediction" and class probabilities to "proba".
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol=TARGET,
    predictionCol="prediction",
    probabilityCol="proba",
    # Fixed seed, matching the one used for the train/test split and the
    # cross-validation, so the trained forest is reproducible across runs.
    seed=42,
)


In [None]:
# Step 5: chain every stage into a single Pipeline, in execution order.
pipeline_stages = [
    # categorical branch
    string_indexer,
    ohe_encoder,
    # numerical branch
    imputer,
    imputed_num_features_vector,
    scaler,
    # final feature vector, then the classifier
    assembler,
    rf,
]
pipeline = Pipeline(stages=pipeline_stages)

# 4 Cross-Validation and Hyperparameter Tuning

In [None]:
# Step 6: hyperparameter grid -> two forest sizes combined with a single depth.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(rf.numTrees, [10, 20])
grid_builder.addGrid(rf.maxDepth, [5])
paramGrid = grid_builder.build()

# Step 7: cross-validation over the whole pipeline, scored on accuracy.
accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol=TARGET,
    metricName="accuracy",
)
cross_validator = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=accuracy_evaluator,
    numFolds=2,
    seed=42,
)

# 5 Train Model

In [None]:
# Step 9: train and search for the best model.
# Fit on the whole training split: the later "Train set" metrics are computed on
# all of train_df, so fitting on an arbitrary 100-row subset (limit() has no
# ordering) would make those metrics mostly out-of-sample and the fit
# non-deterministic.
cv_model = cross_validator.fit(train_df)

In [None]:
# Step 10: retrieve the best model found by the cross-validation.
# Despite its name this is the whole fitted pipeline: the estimator given to the
# cross-validator is `pipeline`, and later cells read `best_rf_model.stages[-1]`
# to reach the random forest itself.
best_rf_model = cv_model.bestModel

# 6 Evaluation

In [None]:
# Predictions: run the fitted pipeline over both splits.
train_pred, test_pred = (
    best_rf_model.transform(split_df) for split_df in (train_df, test_df)
)

In [None]:
# Evaluation: report every metric on the train and the test predictions.
eval_metrics = ["areaUnderROC", "f1", "accuracy", "precisionByLabel", "recallByLabel"]

for metric in eval_metrics:
    # areaUnderROC is scored from the "proba" column by the binary evaluator;
    # every other metric is scored from the "prediction" column.
    # NOTE(review): metricLabel is not set, so precisionByLabel / recallByLabel
    # are computed for the evaluator's default label (presumably class 0) -
    # confirm this is the class meant to be reported.
    evaluator = (
        BinaryClassificationEvaluator(
            rawPredictionCol='proba',
            labelCol=TARGET,
            metricName='areaUnderROC',
        )
        if metric == "areaUnderROC"
        else MulticlassClassificationEvaluator(
            predictionCol="prediction",
            labelCol=TARGET,
            metricName=metric,
        )
    )

    perf_train, perf_test = (
        evaluator.evaluate(pred_df) for pred_df in (train_pred, test_pred)
    )

    print(f"Train set {metric} = {perf_train:.2f}")
    print(f"Test set {metric} = {perf_test:.2f}")
    print("")

Train set areaUnderROC = 0.80
Test set areaUnderROC = 0.81

Train set f1 = 0.81
Test set f1 = 0.80

Train set accuracy = 0.82
Test set accuracy = 0.81

Train set precisionByLabel = 0.80
Test set precisionByLabel = 0.76

Train set recallByLabel = 0.94
Test set recallByLabel = 0.95



# 8 Feature Importances

In [None]:
# Names of the one-hot slots, read from the ML attribute metadata attached to
# the assembled "features" column.
features_attrs = train_pred.schema["features"].metadata["ml_attr"]["attrs"]
binary_pipeline_features = [attr["name"] for attr in features_attrs["binary"]]

# NOTE(review): assumes the numerical slots come after all binary slots, in
# NUMERICAL_FEATURES order (they are appended last in the assembler) - confirm
# against the "idx" fields of the metadata if the assembler inputs change.
pipeline_features_name = binary_pipeline_features + NUMERICAL_FEATURES

# The last pipeline stage is the random forest; expand its importances to a list.
feature_importances = list(best_rf_model.stages[-1].featureImportances)

df_importances = pd.DataFrame(
    feature_importances,
    index=pipeline_features_name,
    columns=["importance"],
)
df_importances = df_importances.sort_values("importance")

# Display the most important features first.
df_importances.sort_values(by="importance", ascending=False)

Unnamed: 0,importance
Sex_ohe_female,0.306219
Sex_ohe_male,0.1468
Fare,0.082301
Age,0.074343
Pclass_ohe_2,0.061389
Cabin_ohe___unknown,0.057154
Embarked_ohe_Q,0.055111
Cabin_ohe_D10 D12,0.052662
Cabin_ohe_D56,0.050494
Pclass_ohe_3,0.027735


# 7 Save Model

In [None]:
# Persist the fitted pipeline. overwrite() replaces a copy left by a previous
# run, so the cell can be re-executed without failing on an existing path.
best_rf_model.write().overwrite().save("rf_model")

# 8 Load Model

In [None]:
# Reload the persisted pipeline from disk (PipelineModel is imported with the
# other dependencies in the notebook's import cell).
loaded_model = PipelineModel.load("rf_model")