In [96]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import svm, tree
import pandas as pd
import numpy as np
import xgboost as xgb


## Experiementation 1

In [97]:
df = pd.read_csv("result_water.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1102 entries, 0 to 1101
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      1102 non-null   int64  
 1   E.C             1102 non-null   int64  
 2   TDS             1102 non-null   float64
 3   Cl              1102 non-null   int64  
 4   NO3             1102 non-null   float64
 5   Na              1102 non-null   float64
 6   Ca              1102 non-null   float64
 7   Mg              1102 non-null   float64
 8   T.H             1102 non-null   float64
 9   SAR             1102 non-null   float64
 10  Classification  1102 non-null   int64  
 11  RSC  meq  / L   1102 non-null   float64
dtypes: float64(8), int64(4)
memory usage: 103.4 KB


In [98]:
def matrice_et_stats(y_test, y_prediction):
    evaluation = {}
    evaluation['accuracy'] = accuracy_score(y_test, y_prediction)
    evaluation['precision'] = precision_score(y_test, y_prediction, average='weighted')
    evaluation['rappel'] = recall_score(y_test, y_prediction, average='weighted')
    evaluation['f1_score'] = f1_score(y_test, y_prediction, average='weighted')
    return evaluation

In [99]:
def moyenne_dictionnaires(liste_dictionnaires):
    accuracy=0
    precision=0
    rappel=0
    f1_score=0

    for dict in liste_dictionnaires:
        accuracy+=dict['accuracy']
        precision+=dict['precision']
        rappel+=dict['rappel']
        f1_score+=dict['f1_score']
    return {'accuracy':100*accuracy/len(liste_dictionnaires), 'precision':100*precision/len(liste_dictionnaires), 'rappel':100*rappel/len(liste_dictionnaires), 'f1_score':100*f1_score/len(liste_dictionnaires)}
        

## Split des données

In [100]:
df['Classification_numerique'] = LabelEncoder().fit_transform(df['Classification'])
X = df.drop(['Classification', 'Classification_numerique'], axis=1)
y = df['Classification_numerique']
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

knn_evaluation = []
pknn_evaluation = []
svm_evaluation = []
psvm_evaluation = []
tree_evaluation = []
ptree_evaluation = []
rf_evaluation = []
prf_evaluation = []
xgb_evaluation = []
pxgb_evaluation = []
nn_evaluation = []
pnn_evaluation = []

for train_index, test_index in kfold.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train=scaler.transform(X_train)
    X_test=scaler.transform(X_test)
    
    knn_model = KNeighborsClassifier()
    knn_model.fit(X_train, y_train)
    y_prediction = knn_model.predict(X_test)
    knn_evaluation.append(matrice_et_stats(y_test, y_prediction))

    knn_model = KNeighborsClassifier(n_neighbors=9, weights='distance')
    knn_model.fit(X_train, y_train)
    y_prediction = knn_model.predict(X_test)
    pknn_evaluation.append(matrice_et_stats(y_test, y_prediction))

    svm_model = svm.SVC()
    svm_model.fit(X_train, y_train)
    y_prediction = svm_model.predict(X_test)
    svm_evaluation.append(matrice_et_stats(y_test, y_prediction))

    svm_model = svm.SVC(C=1, kernel='linear')
    svm_model.fit(X_train, y_train)
    y_prediction = svm_model.predict(X_test)
    psvm_evaluation.append(matrice_et_stats(y_test, y_prediction))

    tree_model = tree.DecisionTreeClassifier()
    tree_model.fit(X_train, y_train)
    y_prediction = tree_model.predict(X_test)
    tree_evaluation.append(matrice_et_stats(y_test, y_prediction))

    tree_model = tree.DecisionTreeClassifier(criterion='log_loss', splitter='random', max_depth=200)
    tree_model.fit(X_train, y_train)
    y_prediction = tree_model.predict(X_test)
    ptree_evaluation.append(matrice_et_stats(y_test, y_prediction))

    rf_model = RandomForestClassifier()
    rf_model.fit(X_train, y_train)
    y_prediction = rf_model.predict(X_test)
    y_prediction = np.round(y_prediction)
    rf_evaluation.append(matrice_et_stats(y_test, y_prediction))

    rf_model = RandomForestClassifier(criterion='entropy', max_depth=200, n_estimators=100, max_features='log2')
    rf_model.fit(X_train, y_train)
    y_prediction = rf_model.predict(X_test)
    y_prediction = np.round(y_prediction)
    prf_evaluation.append(matrice_et_stats(y_test, y_prediction))

    xgb_model = xgb.XGBClassifier()
    xgb_model.fit(X_train, y_train)
    y_prediction = xgb_model.predict(X_test)
    y_prediction = np.round(y_prediction)
    xgb_evaluation.append(matrice_et_stats(y_test, y_prediction))

    xgb_model = xgb.XGBClassifier(learning_rate=0.5, max_depth=200, n_estimators=100, verbosity=0)
    xgb_model.fit(X_train, y_train)
    y_prediction = xgb_model.predict(X_test)
    y_prediction = np.round(y_prediction)
    pxgb_evaluation.append(matrice_et_stats(y_test, y_prediction))
    
    nn_model = MLPClassifier(max_iter=1000)
    nn_model.fit(X_train, y_train)
    y_prediction = nn_model.predict(X_test)
    nn_evaluation.append(matrice_et_stats(y_test, y_prediction))

knn_evaluation=moyenne_dictionnaires(knn_evaluation)
pknn_evaluation=moyenne_dictionnaires(pknn_evaluation)
svm_evaluation=moyenne_dictionnaires(svm_evaluation)
psvm_evaluation=moyenne_dictionnaires(psvm_evaluation)
tree_evaluation=moyenne_dictionnaires(tree_evaluation)
ptree_evaluation=moyenne_dictionnaires(ptree_evaluation)
rf_evaluation=moyenne_dictionnaires(rf_evaluation)
prf_evaluation=moyenne_dictionnaires(prf_evaluation)
xgb_evaluation=moyenne_dictionnaires(xgb_evaluation)
pxgb_evaluation=moyenne_dictionnaires(pxgb_evaluation)
nn_evaluation=moyenne_dictionnaires(nn_evaluation)


evaluation = pd.DataFrame([knn_evaluation, pknn_evaluation, svm_evaluation, psvm_evaluation, tree_evaluation, ptree_evaluation, rf_evaluation, prf_evaluation, xgb_evaluation, pxgb_evaluation, nn_evaluation]).round(2)
evaluation.index = ['KNN', 'KNN2', 'SVM', 'SVM2', 'Decision Tree', 'Decision Tree 2', 'Random Forest', 'Random Forest 2', 'XGBoost', 'XGBoost 2', 'Réseau de neuronnes']
evaluation



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,precision,rappel,f1_score
KNN,94.1,93.98,94.1,93.98
KNN2,93.56,93.46,93.56,93.37
SVM,95.83,95.62,95.83,95.69
SVM2,97.46,97.24,97.46,97.33
Decision Tree,99.36,99.28,99.36,99.32
Decision Tree 2,98.55,98.5,98.55,98.51
Random Forest,99.27,99.1,99.27,99.18
Random Forest 2,99.27,99.1,99.27,99.18
XGBoost,99.46,99.37,99.46,99.41
XGBoost 2,99.46,99.37,99.46,99.41
