# Réaffectation des individus

Cette section vise à explorer une nouvelle approche de réaffectation des individus au sein des groupes en s'appuyant sur des variables illustratives. Notre démarche se déroulera en deux temps : nous débuterons par l'affectation de la segmentation variable, représentée en vert, en nous basant sur les variables identifiées en orange. Par la suite, nous approfondirons l'analyse en utilisant un ensemble spécifique de variables qui sera présenté en détail dans une section distincte.

## Setup

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

from pandas.core.frame import DataFrame
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, train_test_split 
from sklearn.metrics import accuracy_score, classification_report

## Classification en utilisant les variables oranges

In [2]:
green: DataFrame = pd.read_csv(filepath_or_buffer="../data/vert.csv")
green

Unnamed: 0,cle,Respondent_ID,weight,A11,A12,A13,A14,A4,A5,A5bis,...,C1_1_slice,C1_2_slice,C1_3_slice,C1_4_slice,C1_5_slice,C1_6_slice,C1_7_slice,C1_8_slice,C1_9_slice,cluster
0,1,MET20_999999996,2.501255,1,0,0,0,1,2.0,0.0,...,2,2,2,2,2,2,2,2,2,3
1,2,MET20_98888888,0.722914,1,0,0,0,1,5.0,0.0,...,4,4,4,4,4,4,4,4,4,1
2,3,MET20_1978307,1.039611,1,0,0,0,1,2.0,0.0,...,4,4,4,4,4,4,4,4,4,1
3,4,MET20_1302078,0.976590,1,1,1,0,1,1.0,0.0,...,4,4,5,5,5,5,5,4,4,4
4,5,MET20_1869308,0.812315,0,1,0,0,2,0.0,1.0,...,4,4,4,4,4,4,4,4,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,MET20_779605,0.905997,1,0,1,0,3,0.0,1.0,...,4,4,5,5,5,5,5,4,5,2
4996,4997,MET20_80000348,1.257884,0,1,0,0,2,0.0,1.0,...,4,5,5,5,5,5,5,5,5,2
4997,4998,MET20_288029,1.094695,0,1,0,0,2,0.0,2.0,...,5,5,5,5,5,5,5,5,5,2
4998,4999,MET20_1235808,2.022023,0,0,1,0,3,0.0,1.0,...,5,4,5,5,5,5,5,5,5,2


In [3]:
orange: DataFrame = pd.read_csv(filepath_or_buffer="../data/orange.csv")
orange

Unnamed: 0,cle,Respondent_ID,weight,A9_1_slice,A9_2_slice,A9_3_slice,A9_4_slice,A9_5_slice,A9_6_slice,A9_7_slice,...,A11_5_slice,A11_6_slice,A11_7_slice,A11_8_slice,A11_9_slice,A11_10_slice,A11_11_slice,A11_12_slice,A11_13_slice,cluster
0,1,MET20_999999996,2.501255,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,0
1,2,MET20_98888888,0.722914,1,1,1,1,1,1,1,...,2,2,2,2,2,2,2,2,2,3
2,3,MET20_1978307,1.039611,3,2,2,2,3,2,2,...,2,2,2,2,2,2,2,2,2,0
3,4,MET20_1302078,0.976590,1,2,2,2,3,2,3,...,1,2,3,3,3,2,2,1,2,0
4,5,MET20_1869308,0.812315,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,MET20_779605,0.905997,3,2,3,4,3,2,2,...,1,2,1,1,1,1,1,1,1,3
4996,4997,MET20_80000348,1.257884,2,3,4,3,3,2,4,...,2,3,2,2,2,2,3,2,4,1
4997,4998,MET20_288029,1.094695,3,4,4,3,2,2,4,...,2,3,3,3,2,3,3,2,3,1
4998,4999,MET20_1235808,2.022023,3,3,4,3,3,2,3,...,3,2,3,2,3,2,2,1,3,1


In [4]:
data: DataFrame = pd.merge(left=orange.iloc[:, :-1], right=green[["cluster", "Respondent_ID"]], how="right", left_on="Respondent_ID", right_on="Respondent_ID") # Ici on ne prend pas la variable cluster de orange car on veut la réaffecter
data = data.drop(columns=["Respondent_ID"])
data

Unnamed: 0,cle,weight,A9_1_slice,A9_2_slice,A9_3_slice,A9_4_slice,A9_5_slice,A9_6_slice,A9_7_slice,A9_8_slice,...,A11_5_slice,A11_6_slice,A11_7_slice,A11_8_slice,A11_9_slice,A11_10_slice,A11_11_slice,A11_12_slice,A11_13_slice,cluster
0,1,2.501255,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,3
1,2,0.722914,1,1,1,1,1,1,1,1,...,2,2,2,2,2,2,2,2,2,1
2,3,1.039611,3,2,2,2,3,2,2,2,...,2,2,2,2,2,2,2,2,2,1
3,4,0.976590,1,2,2,2,3,2,3,2,...,1,2,3,3,3,2,2,1,2,4
4,5,0.812315,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4996,0.905997,3,2,3,4,3,2,2,2,...,1,2,1,1,1,1,1,1,1,2
4996,4997,1.257884,2,3,4,3,3,2,4,2,...,2,3,2,2,2,2,3,2,4,2
4997,4998,1.094695,3,4,4,3,2,2,4,4,...,2,3,3,3,2,3,3,2,3,2
4998,4999,2.022023,3,3,4,3,3,2,3,3,...,3,2,3,2,3,2,2,1,3,2


In [5]:
data: DataFrame = data.fillna(value=-1)

In [6]:
pca = PCA(n_components=3)
features_pca = pca.fit_transform(data.values)
pca_df = pd.DataFrame(
    features_pca,
    columns=['PC1', 'PC2', 'PC3']
)
pca_df['Cluster'] = data['cluster'].values

colors = ['#00CED1', '#FFA500', '#32CD32', '#FF4500', '#9370DB']

fig = go.Figure()

for cluster_num in range(5):
    cluster_data = pca_df[pca_df['Cluster'] == cluster_num]
    
    fig.add_trace(go.Scatter3d(
        x=cluster_data['PC1'],
        y=cluster_data['PC2'],
        z=cluster_data['PC3'],
        mode='markers',
        name=f'Cluster {cluster_num}',
        marker=dict(
            size=4,
            color=colors[cluster_num],
            opacity=0.7,
            line=dict(width=0.5, color='white')
        ),
        hovertemplate=(
            'PC1: %{x:.2f}<br>' +
            'PC2: %{y:.2f}<br>' +
            'PC3: %{z:.2f}<br>' +
            '<extra></extra>'
        )
    ))

fig.update_layout(
    title=dict(
        text='Visualisation 3D interactive des clusters par PCA',
        font=dict(size=24),
        x=0.5,
        y=0.95
    ),
    scene=dict(
        xaxis_title='Première composante principale',
        yaxis_title='Deuxième composante principale',
        zaxis_title='Troisième composante principale',
        xaxis=dict(gridcolor='rgb(255, 255, 255)', gridwidth=1, zeroline=False),
        yaxis=dict(gridcolor='rgb(255, 255, 255)', gridwidth=1, zeroline=False),
        zaxis=dict(gridcolor='rgb(255, 255, 255)', gridwidth=1, zeroline=False),
        bgcolor='rgb(240, 240, 240)'
    ),
    paper_bgcolor='white',
    plot_bgcolor='white',
    showlegend=True,
    legend=dict(
        title=dict(text='Groupes'),
        itemsizing='constant',
        bgcolor='rgba(255, 255, 255, 0.9)',
        bordercolor='rgba(0, 0, 0, 0.2)',
        borderwidth=1
    ),
    margin=dict(l=0, r=0, t=50, b=0)
)

fig.add_annotation(
    text="Cliquez et faites glisser pour faire pivoter",
    xref="paper", yref="paper",
    x=0, y=1.1,
    showarrow=False,
    font=dict(size=12, color="gray")
)

fig.show()

## Initialisation du modèle

Compte tenu des excellents résultats obtenus par l'algorithme Random Forest lors de la Partie 2, nous avons choisi de l'employer en premier lieu pour aborder cette tâche de classification. Notre objectif est d'exploiter ses capacités pour atteindre un score de classification optimal.

In [7]:
X: DataFrame = data.drop(columns=['cluster'])
y = data['cluster']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23)

3500 1500


In [8]:
n_estimators = 1000
rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=23)
rf_model.fit(X_train, y_train)

In [9]:
y_pred = rf_model.predict(X=X_test)

In [10]:
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.5006666666666667
              precision    recall  f1-score   support

           0       1.00      0.05      0.09       122
           1       0.44      0.60      0.51       367
           2       0.68      0.26      0.38       381
           3       0.44      0.27      0.33        89
           4       0.50      0.74      0.60       541

    accuracy                           0.50      1500
   macro avg       0.61      0.38      0.38      1500
weighted avg       0.57      0.50      0.46      1500



On se rend compte que le RF associé à l'hyperparamètre utilisé lors de la Q2 n'est pas très efficace. En effet, le modèle n'obtient qu'un score de 50% pour l'accuracy. On va donc tenter d'optimiser l'hyperparamètre en utilisant un GridSearchCV.

In [11]:
param_grid = {
    'n_estimators': list(range(1100, 2100, 100)),
    'max_depth': list(range(10, 60, 10)),
}

In [12]:
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=2,
    verbose=3,
    n_jobs=1,
)
grid_search.fit(X_train, y_train)
print("Best hyperparamètres :", grid_search.best_params_)


Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV 1/2] END ...max_depth=10, n_estimators=1100;, score=0.523 total time=   2.2s
[CV 2/2] END ...max_depth=10, n_estimators=1100;, score=0.535 total time=   2.1s
[CV 1/2] END ...max_depth=10, n_estimators=1200;, score=0.521 total time=   2.4s
[CV 2/2] END ...max_depth=10, n_estimators=1200;, score=0.533 total time=   2.5s
[CV 1/2] END ...max_depth=10, n_estimators=1300;, score=0.521 total time=   2.6s
[CV 2/2] END ...max_depth=10, n_estimators=1300;, score=0.533 total time=   2.6s
[CV 1/2] END ...max_depth=10, n_estimators=1400;, score=0.522 total time=   2.8s
[CV 2/2] END ...max_depth=10, n_estimators=1400;, score=0.534 total time=   2.7s
[CV 1/2] END ...max_depth=10, n_estimators=1500;, score=0.521 total time=   3.1s
[CV 2/2] END ...max_depth=10, n_estimators=1500;, score=0.534 total time=   3.0s
[CV 1/2] END ...max_depth=10, n_estimators=1600;, score=0.521 total time=   3.3s
[CV 2/2] END ...max_depth=10, n_estimators=1600

In [13]:
best_rf_model: RandomForestClassifier = grid_search.best_estimator_
test_score = best_rf_model.score(X_test, y_test)
print(f"Score du modèle optimisé sur les données de test : {test_score:.2%}")

Score du modèle optimisé sur les données de test : 49.80%


Nous remarquons que malgré les optimisations, le modèle ne produit pas de meilleurs résultats et reste aux alentours de 50% d'accuracy.

## Classification en utilisant les variables spécifiques

Dans cette partie nous affecterons les individus dans les groupes à l'aide du jeu de variables suivant:
rs3 rs5 rs6 RS1 RS191 RS192 RS193 
RS102RECAP rs11recap2 RS11recap RS193bis RS2Recap RS56Recap RS2 RS11 
RS102

In [14]:
variables: DataFrame = pd.read_csv(filepath_or_buffer="../data/fic_epita_kantar_codes.csv", sep=";")
variables = variables[[
    "Respondent_ID", 
    "rs3", 
    "rs5", 
    "rs6", 
    "RS1", 
    "RS191", 
    "RS192", 
    "RS193", 
    "RS102RECAP", 
    "rs11recap2", 
    "RS11recap", 
    "RS193bis", 
    "RS2Recap", 
    "RS56Recap", 
    "RS2", 
    "RS11", 
    "RS102"
]]
variables

Unnamed: 0,Respondent_ID,rs3,rs5,rs6,RS1,RS191,RS192,RS193,RS102RECAP,rs11recap2,RS11recap,RS193bis,RS2Recap,RS56Recap,RS2,RS11,RS102
0,MET20_999999996,1,1,5.0,1,1.0,2,2,4,1,2,,1,1,24,0,4
1,MET20_98888888,1,1,2.0,1,1.0,2,2,1,1,2,,4,1,50,0,1
2,MET20_1978307,1,1,9.0,2,1.0,1,2,3,2,1,,3,2,37,1,3
3,MET20_1302078,2,3,,2,1.0,2,2,2,1,2,,5,3,63,0,2
4,MET20_1869308,1,1,6.0,1,2.0,2,2,3,2,1,,3,1,44,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,MET20_779605,1,3,,2,1.0,2,2,2,1,2,,6,3,69,0,2
4996,MET20_80000348,1,7,,2,2.0,1,2,1,1,2,,4,3,53,0,1
4997,MET20_288029,1,3,,1,2.0,2,1,2,1,2,2.0,6,3,75,0,2
4998,MET20_1235808,1,5,,2,1.0,1,2,3,2,1,,2,3,33,1,3


In [15]:
variables: DataFrame = variables.fillna(value=-1)

In [16]:
data: DataFrame = pd.merge(left=variables, right=green[["cluster", "Respondent_ID"]], left_on="Respondent_ID", right_on="Respondent_ID", how="right")
data = data.drop(columns=["Respondent_ID"])
data

Unnamed: 0,rs3,rs5,rs6,RS1,RS191,RS192,RS193,RS102RECAP,rs11recap2,RS11recap,RS193bis,RS2Recap,RS56Recap,RS2,RS11,RS102,cluster
0,1,1,5.0,1,1.0,2,2,4,1,2,-1.0,1,1,24,0,4,3
1,1,1,2.0,1,1.0,2,2,1,1,2,-1.0,4,1,50,0,1,1
2,1,1,9.0,2,1.0,1,2,3,2,1,-1.0,3,2,37,1,3,1
3,2,3,-1.0,2,1.0,2,2,2,1,2,-1.0,5,3,63,0,2,4
4,1,1,6.0,1,2.0,2,2,3,2,1,-1.0,3,1,44,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1,3,-1.0,2,1.0,2,2,2,1,2,-1.0,6,3,69,0,2,2
4996,1,7,-1.0,2,2.0,1,2,1,1,2,-1.0,4,3,53,0,1,2
4997,1,3,-1.0,1,2.0,2,1,2,1,2,2.0,6,3,75,0,2,2
4998,1,5,-1.0,2,1.0,1,2,3,2,1,-1.0,2,3,33,1,3,2


In [17]:
# Ici on sait que la colonne RS2 (l'âge de la personne qui remplit le sondage) représente une valeur non catégorique, on peut standardiser cette colonne.
rs2_mean = np.mean(data["RS2"])
rs2_std = np.std(data["RS2"])
data["RS2"] = (data["RS2"] - rs2_mean) / rs2_std #type: ignore
data

Unnamed: 0,rs3,rs5,rs6,RS1,RS191,RS192,RS193,RS102RECAP,rs11recap2,RS11recap,RS193bis,RS2Recap,RS56Recap,RS2,RS11,RS102,cluster
0,1,1,5.0,1,1.0,2,2,4,1,2,-1.0,1,1,-1.950843,0,4,3
1,1,1,2.0,1,1.0,2,2,1,1,2,-1.0,4,1,-0.225608,0,1,1
2,1,1,9.0,2,1.0,1,2,3,2,1,-1.0,3,2,-1.088225,1,3,1
3,2,3,-1.0,2,1.0,2,2,2,1,2,-1.0,5,3,0.637010,0,2,4
4,1,1,6.0,1,2.0,2,2,3,2,1,-1.0,3,1,-0.623739,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1,3,-1.0,2,1.0,2,2,2,1,2,-1.0,6,3,1.035141,0,2,2
4996,1,7,-1.0,2,2.0,1,2,1,1,2,-1.0,4,3,-0.026542,0,1,2
4997,1,3,-1.0,1,2.0,2,1,2,1,2,2.0,6,3,1.433272,0,2,2
4998,1,5,-1.0,2,1.0,1,2,3,2,1,-1.0,2,3,-1.353646,1,3,2


In [18]:
pca = PCA(n_components=3)
features_pca = pca.fit_transform(data.values)
pca_df = pd.DataFrame(
    features_pca,
    columns=['PC1', 'PC2', 'PC3']
)
pca_df['Cluster'] = data['cluster'].values
colors = ['#00CED1', '#FFA500', '#32CD32', '#FF4500', '#9370DB']
fig = go.Figure()
for cluster_num in range(5):
    cluster_data = pca_df[pca_df['Cluster'] == cluster_num]
    
    fig.add_trace(go.Scatter3d(
        x=cluster_data['PC1'],
        y=cluster_data['PC2'],
        z=cluster_data['PC3'],
        mode='markers',
        name=f'Cluster {cluster_num}',
        marker=dict(
            size=4,
            color=colors[cluster_num],
            opacity=0.7,
            line=dict(width=0.5, color='white')
        ),
        hovertemplate=(
            'PC1: %{x:.2f}<br>' +
            'PC2: %{y:.2f}<br>' +
            'PC3: %{z:.2f}<br>' +
            '<extra></extra>'
        )
    ))

fig.update_layout(
    title=dict(
        text='Visualisation 3D interactive des clusters par PCA',
        font=dict(size=24),
        x=0.5,
        y=0.95
    ),
    scene=dict(
        xaxis_title='Première composante principale',
        yaxis_title='Deuxième composante principale',
        zaxis_title='Troisième composante principale',
        xaxis=dict(gridcolor='rgb(255, 255, 255)', gridwidth=1, zeroline=False),
        yaxis=dict(gridcolor='rgb(255, 255, 255)', gridwidth=1, zeroline=False),
        zaxis=dict(gridcolor='rgb(255, 255, 255)', gridwidth=1, zeroline=False),
        bgcolor='rgb(240, 240, 240)'
    ),
    paper_bgcolor='white',
    plot_bgcolor='white',
    showlegend=True,
    legend=dict(
        title=dict(text='Groupes'),
        itemsizing='constant',
        bgcolor='rgba(255, 255, 255, 0.9)',
        bordercolor='rgba(0, 0, 0, 0.2)',
        borderwidth=1
    ),
    margin=dict(l=0, r=0, t=50, b=0)
)

fig.add_annotation(
    text="Cliquez et faites glisser pour faire pivoter",
    xref="paper", yref="paper",
    x=0, y=1.1,
    showarrow=False,
    font=dict(size=12, color="gray")
)

fig.show()

In [None]:
X: DataFrame = data.drop(columns=['cluster'])
y = data['cluster']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=23)

In [20]:
n_estimators = 1000
rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=23)
rf_model.fit(X_train, y_train)

In [21]:
y_pred = rf_model.predict(X=X_test)

In [22]:
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.44333333333333336
              precision    recall  f1-score   support

           0       0.23      0.12      0.16       122
           1       0.28      0.24      0.26       367
           2       0.65      0.65      0.65       381
           3       0.18      0.11      0.14        89
           4       0.44      0.56      0.50       541

    accuracy                           0.44      1500
   macro avg       0.36      0.34      0.34      1500
weighted avg       0.42      0.44      0.43      1500



On remarque que l'accuracy est encore faible. Nous allons donc réutiliser un GridSearchCV pour optimiser les hyperparamètres du modèle.

In [23]:
param_grid = {
    'n_estimators': list(range(1100, 2100, 100)),
    'max_depth': list(range(10, 60, 10)),
}

In [24]:
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=2,
    verbose=3,
    n_jobs=1,
)
grid_search.fit(X=X_train, y=y_train)

print("Best hyperparameters :", grid_search.best_params_)

Fitting 2 folds for each of 50 candidates, totalling 100 fits
[CV 1/2] END ...max_depth=10, n_estimators=1100;, score=0.502 total time=   1.7s
[CV 2/2] END ...max_depth=10, n_estimators=1100;, score=0.489 total time=   1.6s
[CV 1/2] END ...max_depth=10, n_estimators=1200;, score=0.502 total time=   1.7s
[CV 2/2] END ...max_depth=10, n_estimators=1200;, score=0.489 total time=   1.7s
[CV 1/2] END ...max_depth=10, n_estimators=1300;, score=0.502 total time=   1.8s
[CV 2/2] END ...max_depth=10, n_estimators=1300;, score=0.490 total time=   1.8s
[CV 1/2] END ...max_depth=10, n_estimators=1400;, score=0.502 total time=   2.0s
[CV 2/2] END ...max_depth=10, n_estimators=1400;, score=0.490 total time=   2.0s
[CV 1/2] END ...max_depth=10, n_estimators=1500;, score=0.501 total time=   2.2s
[CV 2/2] END ...max_depth=10, n_estimators=1500;, score=0.489 total time=   2.2s
[CV 1/2] END ...max_depth=10, n_estimators=1600;, score=0.500 total time=   2.4s
[CV 2/2] END ...max_depth=10, n_estimators=1600

In [25]:
best_rf_model: RandomForestClassifier = grid_search.best_estimator_

test_score = best_rf_model.score(X_test, y_test)
print(f"Score du modèle optimisé sur les données de test : {test_score:.2%}")


Score du modèle optimisé sur les données de test : 50.87%


On remarque une légère amélioration  de l'accuracy de 6% environ pour une valeur finale autour des 50%. Ce score n'étant pas satisfaisant, nous allons essayer d'autres algorithmes de classification tel que le KNN.

In [26]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

In [27]:
y_pred = rf_model.predict(X=X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))

Accuracy: 0.44333333333333336
              precision    recall  f1-score   support

           0       0.23      0.12      0.16       122
           1       0.28      0.24      0.26       367
           2       0.65      0.65      0.65       381
           3       0.18      0.11      0.14        89
           4       0.44      0.56      0.50       541

    accuracy                           0.44      1500
   macro avg       0.36      0.34      0.34      1500
weighted avg       0.42      0.44      0.43      1500



In [28]:
param_grid = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}

In [29]:
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='accuracy',
    cv=2,
    verbose=3,
    n_jobs=1,
)
grid_search.fit(X=X_train, y=y_train)

Fitting 2 folds for each of 48 candidates, totalling 96 fits
[CV 1/2] END metric=euclidean, n_neighbors=1, weights=uniform;, score=0.372 total time=   0.1s
[CV 2/2] END metric=euclidean, n_neighbors=1, weights=uniform;, score=0.368 total time=   0.0s
[CV 1/2] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.372 total time=   0.0s
[CV 2/2] END metric=euclidean, n_neighbors=1, weights=distance;, score=0.368 total time=   0.0s
[CV 1/2] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.384 total time=   0.0s
[CV 2/2] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.371 total time=   0.0s
[CV 1/2] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.379 total time=   0.0s
[CV 2/2] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.374 total time=   0.0s
[CV 1/2] END metric=euclidean, n_neighbors=5, weights=uniform;, score=0.412 total time=   0.0s
[CV 2/2] END metric=euclidean, n_neighbors=5, weights=uniform;, score=0.423 tota

In [30]:
best_model: KNeighborsClassifier = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Meilleurs paramètres:", grid_search.best_params_)
print("Meilleur score:", grid_search.best_score_)
print("Accuracy sur l'ensemble de test:", accuracy)


Meilleurs paramètres: {'metric': 'euclidean', 'n_neighbors': 15, 'weights': 'uniform'}
Meilleur score: 0.4491428571428572
Accuracy sur l'ensemble de test: 0.452


Nous remarquons que le KNN n'est pas très efficace pour notre problème de classification. Nous pouvons donc garder le Random Forest comme modèle de classification.