# Import des données

In [1]:
# import libs
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import discriminant_analysis
from sklearn.preprocessing import LabelEncoder
from pingouin import multivariate_normality
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Read the dataset from disk
df = pd.read_csv("./data/data.csv")

# The first column only holds an identifier: discard it
df = df.drop(columns=df.columns[0])

# Keep a copy of the original class names, then replace them with integer codes
species = df["species"].to_numpy()
le = LabelEncoder()
le.fit(df["species"])
df["species"] = le.transform(df["species"])

# Stratified 70/30 split, seeded so the partition is reproducible
dfTrain, dfTest = train_test_split(
    df,
    train_size=0.7,
    shuffle=True,
    random_state=1,
    stratify=df["species"],
)

# For each part: features are every column but the first, labels are the
# first column; both are handed over as numpy arrays
XTrain, YTrain = dfTrain.iloc[:, 1:].to_numpy(), dfTrain.iloc[:, 0].to_numpy()
XTest, YTest = dfTest.iloc[:, 1:].to_numpy(), dfTest.iloc[:, 0].to_numpy()

# Henze-Zirkler test on the training features (multivariate normality check)
HZTest = multivariate_normality(XTrain)
print(f"Henze-Zirkler multivariate normality test : {HZTest.normal}")

Henze-Zirkler multivariate normality test : False


In [3]:
# Stratified, shuffled cross-validation of LDA on the training set only
# (the number of folds is the StratifiedKFold default); the seed keeps the
# fold assignment reproducible.
skf = StratifiedKFold(shuffle=True, random_state=1)
accScore = list()

LDA = discriminant_analysis.LinearDiscriminantAnalysis()

for trainIdx, testIdx in skf.split(XTrain, YTrain):
    # rows of the training set used to fit / to validate in this fold
    XTrainCV, XTestCV = XTrain[trainIdx], XTrain[testIdx]
    YTrainCV, YTestCV = YTrain[trainIdx], YTrain[testIdx]

    # refit the same estimator on this fold, then predict its held-out rows
    LDA.fit(XTrainCV, YTrainCV)
    predValues = LDA.predict(XTestCV)

    # accuracy_score is declared as (y_true, y_pred): pass the ground truth
    # first so the call stays correct if the metric is swapped for one that
    # is not symmetric in its arguments
    acc = accuracy_score(YTestCV, predValues)
    accScore.append(acc)

# NOTE: after the loop, LDA holds the fit from the last fold only
print("Average accuracy: ", np.mean(accScore))

Average accuracy:  0.950954019393181


In [5]:
# Quadratic Discriminant Analysis: fit on the full training set and score on
# the held-out test set.
#
# NOTE(review): the last recorded run of this cell scored about 0.037, far
# below the ~0.95 cross-validated accuracy recorded for LDA on the same
# training data. Presumably the per-class covariance estimates are
# ill-conditioned (too few training samples per class for the number of
# features) - TODO confirm, e.g. by checking the warnings raised by fit(),
# before drawing conclusions from this score.
# NOTE(review): LDA is scored by cross-validation on the training set while
# QDA is scored on the test set, so the two numbers are not directly comparable.
QDA = discriminant_analysis.QuadraticDiscriminantAnalysis()

QDA.fit(XTrain, YTrain)
predValues = QDA.predict(XTest)

# accuracy_score is declared as (y_true, y_pred): ground truth goes first
accuracy_score(YTest, predValues)



0.037037037037037035