# Laboratoire 2 - KNN, NAIVE BAYES, DECISION TREE

In [41]:
import numpy as np
import random
import math
import csv
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import cv2
import pandas as pd
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import MinMaxScaler
start = "\033[1m"
end = "\033[0;0m"

In [2]:
# your code here to drop Cabin
#titanic.drop("Cabin",axis=1,inplace=True)
# function to get the data frame from csv
def read_csv(url):
    """
    Args:
        url (string): the url of the file
    Returns:
        df: the dataframe filled
    """
    df = pd.read_csv(url, header=None)
    df.head()
    return df

In [3]:
def Naive_Bayes(X_train,Y_train,X_test,Y_test) :
    """
    Args:
        X_train (list): the train sample
        X_test (list): the test sample
        Y_train (list): the train output
        Y_test (list): the test output
    Returns:
        clf: the naive bayes model
    """
    clf  = BernoulliNB()
    clf1 = GaussianNB()
    clf2 = MultinomialNB()
    clf.fit(X_train, Y_train)
    clf1.fit(X_train, Y_train)
    clf2.fit(X_train, Y_train)
    print("Accuracy with training set " + str(clf.score(X_test, Y_test)))
    y_true = Y_test
    y_pred = clf.predict(X_test)
    print("F1 score with training set " + str(f1_score(y_true, y_pred,average="weighted")))
    return clf

In [35]:
def trainingAccuracyAndFscoreKNN(X_train,X_test,y_train,y_test):
    accuracyArray = []
    fscoreArray = []
    best=0
    bestAcc=0
    for x in range(1,51):
        knn = KNeighborsClassifier(n_neighbors = x)
        knn.fit(X_train,y_train)
        #accuracy
        accuracyArray.append(knn.score(X_test,y_test))
        #f1 score
        y_true = y_test
        y_pred = knn.predict(X_test)
        fscoreArray.append(f1_score(y_true,y_pred,average="weighted"))
        if accuracyArray[x-1] >= bestAcc:
            best = x
    
    print("Accuracy with training set " + str(accuracyArray[best-1]))
    print("F1 score with training set " + str(fscoreArray[best-1]))
    knn = KNeighborsClassifier(n_neighbors = best)
    knn.fit(X_train,y_train)
    return knn, best

In [55]:
def trainingPretreatments(model,galaxy,X_train,X_test,y_train,y_test):
    col_names =  ['Accuracy', 'F1 score']
    Results = pd.DataFrame(columns = col_names)
    y_true = y_test
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    f1 = f1_score(y_true, y_pred,average="weighted")
    score = model.score(X_test, y_test)
    Results.loc["Aucun prétraitement"] = [score , f1]
    
    min_max_scaler = MinMaxScaler()
    X_train_minmax = min_max_scaler.fit_transform(X_train)
    X_test_minmax = min_max_scaler.fit_transform(X_test)
    model.fit(X_train_minmax,y_train)
    y_pred = model.predict(X_test_minmax)
    f1 = f1_score(y_true, y_pred,average="weighted")
    score = model.score(X_test_minmax, y_test)
    Results.loc["MinMaxScaler"] = [score , f1]
    
    if galaxy == False:
        enc = KBinsDiscretizer(n_bins=5,encode='ordinal')
        X_train_binned = enc.fit_transform(X_train)
        X_test_binned = enc.fit_transform(X_test)
        y_pred = model.predict(X_test_binned)
        model.fit(X_train_binned,y_train)
        f1 = f1_score(y_true, y_pred,average="weighted")
        score = model.score(X_test_binned, y_test)
        Results.loc["Discrétisation non-supervisée"] = [score , f1]
        
    print(Results)

In [5]:
from sklearn.model_selection import StratifiedKFold

def Cross_Validation(model,X_train,Y_train) :
    """
    Args:
        model (sklearn classifier): the model to fit
        X_train (list): the train sample
        X_test (list): the test sample
    Returns:
        best_model: the best trained model in the cross validation
    """
    col_names =  ['Accuracy', 'F1 score']
    Results = pd.DataFrame(columns = col_names)
    best_score = 0
    
    skf = StratifiedKFold(n_splits=10)
    skf.get_n_splits(X_train, Y_train) 

    for train_index, test_index in skf.split(X_train, Y_train):
        # split train test
        xtrain, xtest = X_train.iloc[train_index], X_train.iloc[test_index]
        ytrain, ytest = Y_train.iloc[train_index], Y_train.iloc[test_index]
        # fitting and score for this sample
        temp_model = model.fit(xtrain, ytrain)
        y_true = ytest
        y_pred = model.predict(xtest)
        f1 = f1_score(y_true, y_pred,average="weighted")
        score = model.score(xtest, ytest)
        Results = Results.append(pd.Series([score , f1], index=Results.columns ), ignore_index=True)
        #print("Accuracy with training set " + str(score) + "F1 score with training set " + str(f1))
        if(score > best_score):
            best_score = score
            best_model = temp_model
    print(Results)
    return(best_model)

In [6]:
def Verify_results(model,best_model,X,Y) :
    """
    Args:
        model (sklearn classifier): the model hyperparametered
        model (sklearn classifier): the model cross validated
        X (list): the test sample
        Y (list): the test sample
    Returns:
    """
    col_names =  ['Accuracy', 'F1 score']
    Results = pd.DataFrame(columns = col_names)
    y_true = Y
    
    # score for the first model
    y_pred = model.predict(X)
    f1 = f1_score(y_true, y_pred,average="weighted")
    score = model.score(X, Y)
    Results.loc["model hyperparametered"] = [score , f1]
    #Results = Results.append(pd.Series([score , f1], index="model hyperparametered" ), ignore_index=True)
    
    #score for the best model
    y_pred = best_model.predict(X)
    f1 = f1_score(y_true, y_pred,average="weighted")
    score = best_model.score(X, Y)
    Results.loc["model cross validated"] = [score , f1]
    #Results = Results.append(pd.Series([score , f1], index="model cross validated" ), ignore_index=True)
    print(Results)

In [33]:
# read the csv and get the dataframe
df = read_csv("spam.csv")
# get the train and test sample
X_train, X_test, Y_train, Y_test = train_test_split(df.loc[:, df.columns != 57], df[57], test_size=0.2,stratify=df[57])
# entraîner le modèle naive bayes
print(start + "Résultat du training modèle \n" + end)
naive = Naive_Bayes(X_train,Y_train,X_test,Y_test)

# faire la cross validation et récupérer le meilleur modèle train
"""
    créer un nouveau modèle avec le bon hyperparamètre 
    surtout pas passer la copie du modèle trainé déjà
"""
print("\n\n" + start + "Résulat de la cross validation \n" + end)
naive_cp = BernoulliNB()
best_cross_validated_model = Cross_Validation(naive_cp,X_train,Y_train)

# regarder différence entre modèle hyperpamètré et modèle mieux train par cross validation 
print("\n\n" + start + "Différence entre les deux modèles \n" + end)
Verify_results(naive,best_cross_validated_model,X_test,Y_test)

[1mRésultat du training modèle 
[0;0m
Accuracy with training set 0.8731884057971014
F1 score with training set 0.8719065175014832


[1mRésulat de la cross validation 
[0;0m
   Accuracy  F1 score
0  0.896396  0.894729
1  0.869369  0.868978
2  0.918919  0.917975
3  0.891892  0.891199
4  0.868182  0.867480
5  0.850000  0.848431
6  0.913636  0.913176
7  0.877273  0.874855
8  0.854545  0.853221
9  0.918182  0.917210


[1mDifférence entre les deux modèles 
[0;0m
                        Accuracy  F1 score
model hyperparametered  0.873188  0.871907
model cross validated   0.875000  0.873804


In [56]:
# read the csv and get the dataframe
df = read_csv("spam.csv")
# get the train and test sample
X_train, X_test, Y_train, Y_test = train_test_split(df.loc[:, df.columns != 57], df[57], test_size=0.2,stratify=df[57])
print(start + "Résultat du training modèle knn \n" + end)
knn_model,k = trainingAccuracyAndFscoreKNN(X_train,X_test,Y_train,Y_test)

print("\n\n" + start + "Différent prétraitements selon l'ensemble de donnnées knn \n" + end)
knn_cp = KNeighborsClassifier(n_neighbors=k)
trainingPretreatments(knn_cp,False,X_train,X_test,Y_train,Y_test)

# faire la cross validation et récupérer le meilleur modèle train
"""
    créer un nouveau modèle avec le bon hyperparamètre 
    surtout pas passer la copie du modèle trainé déjà
"""
print("\n\n" + start + "Résulat de la cross validation knn \n" + end)
knn_cp = KNeighborsClassifier(n_neighbors=k)
best_cross_validated_model_knn = Cross_Validation(knn_cp,X_train,Y_train)

# regarder différence entre modèle hyperpamètré et modèle mieux train par cross validation 
print("\n\n" + start + "Différence entre les deux modèles knn \n" + end)
Verify_results(knn_model,best_cross_validated_model_knn,X_test,Y_test)

[1mRésultat du training modèle knn 
[0;0m
Accuracy with training set 0.7336956521739131
F1 score with training set 0.734171715435531


[1mDifférent prétraitements selon l'ensemble de donnnées knn 
[0;0m


  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing the number of bins.' % jj)
  'decreasing th

                               Accuracy  F1 score
Aucun prétraitement            0.733696  0.734172
MinMaxScaler                   0.887681  0.888062
Discrétisation non-supervisée  0.869565  0.651469


[1mRésulat de la cross validation knn 
[0;0m
   Accuracy  F1 score
0  0.765766  0.766180
1  0.770270  0.770864
2  0.738739  0.739984
3  0.756757  0.753924
4  0.736364  0.731610
5  0.690909  0.689622
6  0.690909  0.689622
7  0.781818  0.781818
8  0.736364  0.734641
9  0.800000  0.798693


[1mDifférence entre les deux modèles knn 
[0;0m
                        Accuracy  F1 score
model hyperparametered  0.733696  0.734172
model cross validated   0.746377  0.746915
